版主您好:
我刚接触 CUDA 编程,下面是我尝试用 CUDA 优化图像纹理特征(LBP)提取的代码,包含 CPU 和 GPU 两个版本,显卡是 GTX 480。可能是因为线程和线程块的分配不合理,GPU 版本的速度不理想(比 CPU 版本还慢)。于是我尝试用 NVVP 进行分析调试,但它提示:"Results: no timeline. Application timeline is required for the analysis."(注:本想上传截图,试过几次都没有传上去。)求指导:
1. 代码还应该在哪些方面再做优化?
2. 多维线程和线程块应该如何使用?(我试过,但没有成功。)
3. 请帮忙分析一下 NVVP 不能正常使用的原因;之前用它分析别的 exe 是可行的。
非常感谢!!
代码如下
// sample.cpp : Defines the entry point for the console application.
#include <stdlib.h>
#include <fstream>
#include <iostream>

#include <cuda_runtime.h>

#include "cv.h"
#include "highgui.h"
using namespace std;
using namespace cv;
void getimageLBPhist2txt(IplImage *m_cvImage,IplImage *m_cvLBPImage); //cpu
/**
 * Computes the 8-neighbour Local Binary Pattern (LBP) code of every pixel of
 * a row-major, single-channel image stored as one int per pixel.
 *
 * Bit weights follow the CPU reference in this file: the 3x3 neighbourhood is
 * scanned row by row, skipping the centre, with weights 1, 2, 4, 8, 16, 32,
 * 64, 128. A bit is set when centre >= neighbour.
 *
 * Pixels on the one-pixel image border have no complete neighbourhood; their
 * output is set to 0 instead of reading outside the buffer.
 *
 * Launch layout: any 1-D grid of 1-D blocks. Pixels are visited with a flat
 * grid-stride loop, so the result does not depend on the launch
 * configuration. With <<<height, width>>> each block handles exactly one row
 * and each thread one column. No shared memory is used.
 *
 * @param d_Matrix      device pointer to width*height input pixels (read-only)
 * @param d_Matrix_lbp  device pointer to width*height output codes; must not
 *                      overlap d_Matrix (both are declared __restrict__)
 * @param width         image width in pixels
 * @param height        image height in pixels
 */
__global__ void getimageLBPhist2txtgpu(const int * __restrict__ d_Matrix,
                                       int * __restrict__ d_Matrix_lbp,
                                       int width, int height) //gpu
{
    if (width <= 0 || height <= 0)
    {
        return;
    }

    // size_t keeps the flat index safe even if width*height exceeds 2^31.
    const size_t total  = (size_t)width * (size_t)height;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total;
         idx += stride)
    {
        const int row = (int)(idx / (size_t)width);
        const int col = (int)(idx % (size_t)width);

        int center_lbp = 0;

        // Only interior pixels own a full 3x3 neighbourhood.
        if (row >= 1 && row < height - 1 && col >= 1 && col < width - 1)
        {
            const int *mid  = d_Matrix + idx;   // current row, at this column
            const int *up   = mid - width;      // row above
            const int *down = mid + width;      // row below
            const int center = mid[0];

            // Selects instead of eight branches: every lane of a warp runs the
            // same instruction stream regardless of the pixel values.
            center_lbp = ((center >= up[-1])   ?   1 : 0)
                       | ((center >= up[0])    ?   2 : 0)
                       | ((center >= up[1])    ?   4 : 0)
                       | ((center >= mid[-1])  ?   8 : 0)
                       | ((center >= mid[1])   ?  16 : 0)
                       | ((center >= down[-1]) ?  32 : 0)
                       | ((center >= down[0])  ?  64 : 0)
                       | ((center >= down[1])  ? 128 : 0);
        }

        d_Matrix_lbp[idx] = center_lbp;
    }
}
// Stops the program with a readable message when a CUDA runtime call fails.
// Without this, a failed allocation/copy/launch is silent and every later
// result is garbage.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        cerr << "CUDA error in " << what << ": "
             << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

/**
 * Loads "aa.jpg", converts it to 8-bit gray, computes the per-pixel LBP code
 * (on the GPU when use_gpu is non-zero, otherwise with the CPU reference),
 * then prints the normalised lbp_bins-bin histogram of the LBP image.
 *
 * Returns 0 on success and 1 when the image cannot be loaded or prepared.
 */
int main()
{
    int use_gpu=1;//true;//;false
    int width;
    int height;
    int lbp_bins = 10; // number of histogram bins; can be chosen freely

    IplImage* img=cvLoadImage("aa.jpg");
    if(img==NULL)
    {
        // The path is relative to the working directory, so a launcher that
        // starts the program elsewhere ends up here. Bail out instead of
        // dereferencing NULL below.
        cout<<"load image error";
        return 1;
    }

    IplImage* m_cvImage = cvCreateImage(cvGetSize(img), 8, 1);    // gray image
    IplImage* m_cvLBPImage = cvCreateImage(cvGetSize(img), 8, 1); // LBP code image
    // cvCreateImage does not clear the pixels; border pixels never receive an
    // LBP code, so give them a defined value before the histogram reads them.
    cvZero(m_cvLBPImage);

    if (img->nChannels == 3)
    {
        cvCvtColor(img, m_cvImage, CV_BGR2GRAY); // colour -> gray
    }
    else if (img->nChannels == 1)
    {
        cvCopy(img, m_cvImage); // already gray: the gray buffer must still be filled
    }
    else
    {
        cout<<"load image error";
        cvReleaseImage(&m_cvLBPImage);
        cvReleaseImage(&m_cvImage);
        cvReleaseImage(&img);
        return 1;
    }

    width=m_cvImage->width;
    height=m_cvImage->height;
    cvNamedWindow("cpu");
    cvShowImage("cpu",m_cvImage);
    cvWaitKey(9);

    if(use_gpu)
    {
        ofstream fout("gpu.txt"); // per-pixel results are dumped to this text file

        const size_t pixelCount = (size_t)width * (size_t)height;
        const size_t bytes = pixelCount * sizeof(int);

        int *h_Matrix=(int*)malloc(bytes);
        if (h_Matrix == NULL)
        {
            cerr << "host allocation failed" << endl;
            return 1;
        }

        // Widen the 8-bit gray pixels to int, walking each image row through
        // its row pointer (rows are widthStep bytes apart, not width).
        for(int row=0;row<height;row++)
        {
            const unsigned char *src =
                (const unsigned char*)(m_cvImage->imageData + row * m_cvImage->widthStep);
            for (int col=0;col<width;col++)
            {
                h_Matrix[row*width+col]=src[col];
            }
        }

        int* d_Matrix = NULL;
        int* d_Matrix_lbp = NULL;
        checkCuda(cudaMalloc( (void**)&d_Matrix, bytes ), "cudaMalloc(d_Matrix)");
        checkCuda(cudaMalloc( (void**)&d_Matrix_lbp, bytes ), "cudaMalloc(d_Matrix_lbp)");
        checkCuda(cudaMemcpy(d_Matrix,h_Matrix,bytes,cudaMemcpyHostToDevice),
                  "cudaMemcpy(host -> device)");

        // One block per image row, with enough threads to cover a row, rounded
        // up to whole warps. Derived from the real image size instead of the
        // former hard-coded 288 x 463. A row wider than the device's
        // threads-per-block limit makes the launch fail, which is reported by
        // the cudaGetLastError() check right after it.
        const int warpSize32 = 32;
        const int threadsPerBlock = ((width + warpSize32 - 1) / warpSize32) * warpSize32;
        getimageLBPhist2txtgpu<<<height,threadsPerBlock>>>(d_Matrix,d_Matrix_lbp,width,height);
        checkCuda(cudaGetLastError(), "kernel launch");

        // Blocking copy: waits for the kernel and surfaces any execution error.
        checkCuda(cudaMemcpy(h_Matrix,d_Matrix_lbp,bytes,cudaMemcpyDeviceToHost),
                  "cudaMemcpy(device -> host)");

        fout<<"gpu....."<<endl;;
        for(int row=0;row<height;row++)
        {
            unsigned char *dst =
                (unsigned char*)(m_cvLBPImage->imageData + row * m_cvLBPImage->widthStep);
            for (int col=0;col<width;col++)
            {
                // LBP is undefined on the one-pixel border; force 0 there so
                // the result matches the CPU path whatever the kernel stored.
                const bool onBorder = (row == 0 || row == height-1 ||
                                       col == 0 || col == width-1);
                const int code = onBorder ? 0 : h_Matrix[row*width+col];
                fout<<"("<<row<<" "<<col<<")"<<code<<" ";
                dst[col] = (unsigned char)code; // codes are 0..255 by construction
            }
        }

        checkCuda(cudaFree(d_Matrix), "cudaFree(d_Matrix)");
        checkCuda(cudaFree(d_Matrix_lbp), "cudaFree(d_Matrix_lbp)");
        free(h_Matrix);
        // NOTE(review): NVIDIA's profiler documentation recommends calling
        // cudaDeviceReset() before exit so profiling data is flushed.
        checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    }
    else
    {
        getimageLBPhist2txt(m_cvImage ,m_cvLBPImage);
    }

    // The upper bound of a uniform histogram range is exclusive, so it must be
    // 256 for LBP code 255 (centre >= all eight neighbours) to be counted.
    float lbp_ranges[] = { 0, 256 };
    float* pb_ranges = lbp_ranges;
    CvHistogram* hist_lbp = cvCreateHist( 1, &lbp_bins, CV_HIST_ARRAY, &pb_ranges, 1 );
    cvCalcHist( &m_cvLBPImage, hist_lbp, 0, 0 );
    cvNormalizeHist(hist_lbp,1.0); // bins sum to 1

    // Accessed through data.fl below, so the element type must be 32-bit float.
    CvMat *histogram2=cvCreateMat( lbp_bins,1,CV_32FC1);
    cout<<"lbp_value............."<<endl;
    for(int i3 = 0; i3< histogram2->rows; i3++)
    {
        histogram2->data.fl[i3] = (float)cvGetReal1D(hist_lbp->bins, i3);
        float lbp_value=histogram2->data.fl[i3];
        cout<<lbp_value<<" ";
    }
    cout<<"..."<<endl;

    cvReleaseMat(&histogram2);
    cvReleaseHist(&hist_lbp);
    cout<<endl<<".........end.............."<<endl;
    cvReleaseImage(&m_cvLBPImage);
    cvReleaseImage(&m_cvImage);
    cvReleaseImage(&img);
    return 0;
}
//cpu 提取lbp特征
/**
 * CPU reference: computes the 8-neighbour LBP code of every interior pixel of
 * m_cvImage, stores it in the same position of m_cvLBPImage and appends
 * "(row col)code" to cpu.txt.
 *
 * The one-pixel border of m_cvLBPImage is not written. Does nothing when
 * either image pointer is NULL.
 *
 * @param m_cvImage     single-channel source image (read through cvGetReal2D)
 * @param m_cvLBPImage  destination image, written through cvSetReal2D
 */
void getimageLBPhist2txt(IplImage *m_cvImage,IplImage *m_cvLBPImage )
{
    if (m_cvImage == NULL || m_cvLBPImage == NULL)
    {
        return;
    }

    // Neighbour offsets listed in bit order: entry k contributes weight 1 << k
    // (1, 2, 4, 8, 16, 32, 64, 128), scanning the 3x3 block row by row and
    // skipping the centre.
    static const int kRowOffset[8] = { -1, -1, -1,  0, 0,  1, 1, 1 };
    static const int kColOffset[8] = { -1,  0,  1, -1, 1, -1, 0, 1 };

    ofstream fout("cpu.txt"); // per-pixel results are dumped to this text file
    fout<<"cpu................"<<endl;

    for (int row=1; row < m_cvImage->height-1; row++)
    {
        for (int col=1; col < m_cvImage->width-1; col++)
        {
            const int center = (int)cvGetReal2D(m_cvImage, row, col);
            int center_lbp = 0;

            for (int k = 0; k < 8; k++)
            {
                if (center >= cvGetReal2D(m_cvImage, row + kRowOffset[k], col + kColOffset[k]))
                {
                    center_lbp |= (1 << k);
                }
            }

            // Store the decimal LBP code at the centre position.
            cvSetReal2D(m_cvLBPImage, row, col, center_lbp);
            fout<<"("<<row<<" "<<col<<")"<<center_lbp;
        }
    }
}