cuda下速度比较慢,NVVP工具不能正常使用

版主您好:
刚接触cuda编程,下面是自己试着用cuda来优化图像纹理特征提取(LBP)代码,有cpu和gpu两部分,显卡是GTX480, 可能因为自己对线程和块的分配不合理,速度不理想(低于cpu版本),于是试用NVVP来分析调试,提示(Results: no timeline Application timeline is required for the analysis )(注:本想上传截图,试过几次传不去),求指导。1:代码中应该在哪些方面再做优化,
2:多维线程和块的使用。(试过没成功)
3:帮忙分析下nvvp不能正常使用的原因,之前试过别的exe可行。

非常感谢!!

代码如下
// sample.cpp : 定义控制台应用程序的入口点。

#include “cv.h”
#include “highgui.h”
#include
#include<stdlib.h>
#include
using namespace std;
using namespace cv;

void getimageLBPhist2txt(IplImage *m_cvImage,IplImage *m_cvLBPImage); //cpu

global void getimageLBPhist2txtgpu(int *d_Matrix,int d_Matrix_lbp,int width,int height) //gpu
{
int cow = threadIdx.x;
int row = blockIdx.x;
if(threadIdx.x<width)
{
int center=0;
int center_lbp=0;
center =d_Matrix[row
width+cow]; //返回单通道数组的指定元素 块 3X3

	if (center >= d_Matrix[(row-1)*width+cow-1])
		{
			center_lbp += 1;
		}
	if (center >= d_Matrix[(row-1)*width+cow])
		{
			center_lbp += 2;
		}
	if (center >= d_Matrix[(row-1)*width+cow+1])
		{
			center_lbp += 4;
		}
	if (center >= d_Matrix[row*width+cow-1])
		{
			center_lbp += 8;
		}
	if (center >= d_Matrix[row*width+cow+1])
		{
			center_lbp += 16;
		}
	if (center >= d_Matrix[(row+1)*width+cow-1])
		{
			center_lbp += 32;
		}
	if (center >=d_Matrix[(row+1)*width+cow])
		{
			center_lbp += 64;
		}
	if (center >= d_Matrix[(row+1)*width+cow+1])
		{
			center_lbp += 128;
		}

d_Matrix_lbp[row*width+cow]=center_lbp;

}
return;

}

int main()
{

int use_gpu=1;//true;//;false
int width;
int height;
int lbp_bins = 10; //直方图12个bin 维度可以自由设置
IplImage* img=cvLoadImage(“aa.jpg”);
if(img==NULL)
{
cout<<“load image error”;
}

IplImage* m_cvImage = cvCreateImage(cvGetSize(img), 8, 1);   //灰度图像
IplImage* m_cvLBPImage = cvCreateImage(cvGetSize(img), 8, 1);//LBP值图像



if (img->nChannels == 3)
{
	cvCvtColor(img, m_cvImage, CV_BGR2GRAY);//色彩空间转换(灰色空间)
}

width=m_cvImage->width; //463
height=m_cvImage->height; //288


cvNamedWindow("cpu");	
cvShowImage("cpu",m_cvImage);
cvWaitKey(9);	

if(use_gpu)
{
ofstream fout(“gpu.txt”); //将结果输出到txt中
int* d_Matrix;
int* d_Matrix_lbp;
int h_Matrix=(int)malloc(heightwidthsizeof(int));
for(int row=0;row<height;row++)
for (int col=0;col<width;col++)
{
h_Matrix[row*width+col]=cvGetReal2D(m_cvImage, row, col);
}

	cudaMalloc( &d_Matrix,width*height*sizeof(int) );
	cudaMalloc( &d_Matrix_lbp,width*height*sizeof(int) );
	cudaMemcpy(d_Matrix,h_Matrix,width*height*sizeof(int),cudaMemcpyHostToDevice); 
	getimageLBPhist2txtgpu<<<288,463>>>(d_Matrix,d_Matrix_lbp,width,height); //ceil(width*height/1024)~~130.2
	cudaMemcpy(h_Matrix,d_Matrix_lbp,width*height*sizeof(int),cudaMemcpyDeviceToHost); 

	fout<<"gpu....."<<endl;;
	for(int row=0;row<height;row++)
	{
		for (int col=0;col<width;col++)
		{
			fout<<"("<<row<<" "<<col<<")"<<h_Matrix[row*width+col]<<"  ";
			cvSetReal2D(m_cvLBPImage, row, col, h_Matrix[row*width+col]);  
		}
		
	}
	cudaFree(d_Matrix); 
	free(h_Matrix); 
	cudaDeviceReset(); 

}

else
{
getimageLBPhist2txt(m_cvImage ,m_cvLBPImage);
}

	float lbp_ranges[] = { 0, 255 };
	float* pb_ranges = lbp_ranges;
	CvHistogram* hist_lbp = cvCreateHist( 1, &lbp_bins, CV_HIST_ARRAY, &pb_ranges, 1 );//生成直方图
	cvCalcHist( &m_cvLBPImage, hist_lbp, 0, 0 );  //产生直方图
	cvNormalizeHist(hist_lbp,1.0);             //归一化

	CvMat *histogram2=cvCreateMat( lbp_bins,1,CV_64FC1);
	for(int i1=0 ; i1< histogram2->rows; i1++)
	{
		histogram2->data.fl[i1] = 0.0;
	}
	cout<<"lbp_value............."<<endl;
	for(int i3 = 0; i3< histogram2->rows; i3++)
	{
		histogram2->data.fl[i3] = (float)cvGetReal1D(hist_lbp->bins, i3);

		float lbp_value=histogram2->data.fl[i3];
    
		cout<<lbp_value<<"  ";
	}

cout<<“…”<<endl;
cvReleaseImage(&m_cvLBPImage);
cvReleaseImage(&m_cvImage);
cvReleaseHist(&hist_lbp);

cout<<endl<<".........end.............."<<endl;
cvReleaseImage(&m_cvLBPImage);
cvReleaseImage(&m_cvImage);
cvReleaseImage(&img);


return 0;

}

//cpu 提取lbp特征
void getimageLBPhist2txt(IplImage *m_cvImage,IplImage *m_cvLBPImage )

{
ofstream fout(“cpu.txt”); //将结果输出到txt中

int center=0;
int center_lbp=0;
//计算每个点的LBP值
fout<<"cpu................"<<endl;
for (int row=1; row < m_cvImage->height-1; row++)
	for (int col=1; col < m_cvImage->width-1; col++)
	{           
		center = cvGetReal2D(m_cvImage, row, col);          //返回单通道数组的指定元素           块 3X3
		center_lbp = 0;
		if (center >= cvGetReal2D(m_cvImage, row-1, col-1))
		{
			center_lbp += 1;
		}
		if (center >= cvGetReal2D(m_cvImage, row-1, col))
		{
			center_lbp += 2;
		}
		if (center >= cvGetReal2D(m_cvImage, row-1, col+1))
		{
			center_lbp += 4;
		}
		if (center >= cvGetReal2D(m_cvImage, row, col-1))
		{
			center_lbp += 8;
		}
		if (center >= cvGetReal2D(m_cvImage, row, col+1))
		{
			center_lbp += 16;
		}
		if (center >= cvGetReal2D(m_cvImage, row+1, col-1))
		{
			center_lbp += 32;
		}
		if (center >= cvGetReal2D(m_cvImage, row+1, col))
		{
			center_lbp += 64;
		}
		if (center >= cvGetReal2D(m_cvImage, row+1, col+1))
		{
			center_lbp += 128;
		}
		cvSetReal2D(m_cvLBPImage, row, col, center_lbp);       //将中心值设置为十进制的lbp值  ,画出每一个lbp值
	fout<<"("<<row<<" "<<col<<")"<<center_lbp;
	
	}

}

楼主您好:

您的kernel存在大量错误和BUG。将直接挂掉。
例如以下地方均可能导致越界:
(1)行:if (center >= d_Matrix[(row-1)width+cow-1])
(2)行: if (center >= d_Matrix[(row-1)width+cow])
(3)行: if (center >= d_Matrix[(row-1)width+cow+1])
(4)行:if (center >= d_Matrix[row
width+cow-1])
(5)行: if (center >= d_Matrix[row
width+cow+1])
(6)行: if (center >= d_Matrix[row
width+cow+1])
(7)行: if (center >= d_Matrix[(row+1)*width+cow-1])
(8)行: if (center >=d_Matrix[(row+1)*width+cow])
(9)行: if (center >= d_Matrix[(row+1)*width+cow+1])

这9处均会导致访存越界。
请确保他们符合您的设计。

在您修正了BUG后,kernel能运行起来后,您才应当再去考虑优化。

感谢来访。

感谢版主的指导:
我加了判断句给图像边界直接赋值,避免了越界,结果和之前一样,程序可以运行的,能出结果,但是速度很慢,NVVP不能分析程序

#include “cv.h”
#include “highgui.h”
#include
#include<stdlib.h>
#include
using namespace std;
using namespace cv;

void getimageLBPhist2txt(IplImage *m_cvImage,IplImage *m_cvLBPImage); //cpu

global void getimageLBPhist2txtgpu(int *d_Matrix,int d_Matrix_lbp,int width,int height) //gpu
{
int cow = threadIdx.x;
int row = blockIdx.x;
if(threadIdx.x<width)
{
int center=0;
int center_lbp=0;
center =d_Matrix[row
width+cow]; //返回单通道数组的指定元素 块 3X3
if(row==0&&row==height&&cow==0&&cow==width)
{
center_lbp=0;
}
else
{

if (center >= d_Matrix[(row-1)*width+cow-1])
{
center_lbp += 1;
}
if (center >= d_Matrix[(row-1)width+cow])
{
center_lbp += 2;
}
if (center >= d_Matrix[(row-1)width+cow+1])
{
center_lbp += 4;
}
if (center >= d_Matrix[row
width+cow-1])
{
center_lbp += 8;
}
if (center >= d_Matrix[row
width+cow+1])
{
center_lbp += 16;
}
if (center >= d_Matrix[(row+1)*width+cow-1])
{
center_lbp += 32;
}
if (center >=d_Matrix[(row+1)*width+cow])
{
center_lbp += 64;
}
if (center >= d_Matrix[(row+1)width+cow+1])
{
center_lbp += 128;
}
}
d_Matrix_lbp[row
width+cow]=center_lbp;

}
return;
}

int main()
{

int use_gpu=1;//true;//;false
int width;
int height;
int lbp_bins = 10; //直方图12个bin 维度可以自由设置
IplImage* img=cvLoadImage(“aa.jpg”);
if(img==NULL)
{
cout<<“load image error”;
}

IplImage* m_cvImage = cvCreateImage(cvGetSize(img), 8, 1); //灰度图像
IplImage* m_cvLBPImage = cvCreateImage(cvGetSize(img), 8, 1);//LBP值图像

if (img->nChannels == 3)
{
cvCvtColor(img, m_cvImage, CV_BGR2GRAY);//色彩空间转换(灰色空间)
}

width=m_cvImage->width; //463
height=m_cvImage->height; //288

cvNamedWindow(“cpu”);
cvShowImage(“cpu”,m_cvImage);
cvWaitKey(9);

if(use_gpu)
{
ofstream fout(“gpu.txt”); //将结果输出到txt中
int* d_Matrix;
int* d_Matrix_lbp;
int h_Matrix=(int)malloc(heightwidthsizeof(int));
for(int row=0;row<height;row++)
for (int col=0;col<width;col++)
{
h_Matrix[row*width+col]=cvGetReal2D(m_cvImage, row, col);
}

cudaMalloc( &d_Matrix,widthheightsizeof(int) );
cudaMalloc( &d_Matrix_lbp,widthheightsizeof(int) );
cudaMemcpy(d_Matrix,h_Matrix,widthheightsizeof(int),cudaMemcpyHostToDevice);
getimageLBPhist2txtgpu<<<288,463>>>(d_Matrix,d_Matrix_lbp,width,height); //ceil(widthheight/1024)~~130.2
cudaMemcpy(h_Matrix,d_Matrix_lbp,width
height*sizeof(int),cudaMemcpyDeviceToHost);

fout<<“gpu…”<<endl;;
for(int row=0;row<height;row++)
{
for (int col=0;col<width;col++)
{
fout<<“(”<<row<<" “<<col<<”)"<<h_Matrix[rowwidth+col]<<" ";
cvSetReal2D(m_cvLBPImage, row, col, h_Matrix[row
width+col]);
}

}
cudaFree(d_Matrix);
free(h_Matrix);
cudaDeviceReset();

}

else
{
getimageLBPhist2txt(m_cvImage ,m_cvLBPImage);
}

float lbp_ranges = { 0, 255 };
float* pb_ranges = lbp_ranges;
CvHistogram* hist_lbp = cvCreateHist( 1, &lbp_bins, CV_HIST_ARRAY, &pb_ranges, 1 );//生成直方图
cvCalcHist( &m_cvLBPImage, hist_lbp, 0, 0 ); //产生直方图
cvNormalizeHist(hist_lbp,1.0); //归一化

CvMat *histogram2=cvCreateMat( lbp_bins,1,CV_64FC1);
for(int i1=0 ; i1< histogram2->rows; i1++)
{
histogram2->data.fl[i1] = 0.0;
}
cout<<“lbp_value…”<<endl;
for(int i3 = 0; i3< histogram2->rows; i3++)
{
histogram2->data.fl[i3] = (float)cvGetReal1D(hist_lbp->bins, i3);

float lbp_value=histogram2->data.fl[i3];

cout<<lbp_value<<" ";
}

cout<<“…”<<endl;
cvReleaseImage(&m_cvLBPImage);
cvReleaseImage(&m_cvImage);
cvReleaseHist(&hist_lbp);

cout<<endl<<“…end…”<<endl;
cvReleaseImage(&m_cvLBPImage);
cvReleaseImage(&m_cvImage);
cvReleaseImage(&img);

return 0;
}

//cpu 提取lbp特征
void getimageLBPhist2txt(IplImage *m_cvImage,IplImage *m_cvLBPImage )

{
ofstream fout(“cpu.txt”); //将结果输出到txt中

int center=0;
int center_lbp=0;
//计算每个点的LBP值
fout<<“cpu…”<<endl;
for (int row=1; row < m_cvImage->height-1; row++)
for (int col=1; col < m_cvImage->width-1; col++)
{
center = cvGetReal2D(m_cvImage, row, col); //返回单通道数组的指定元素 块 3X3
center_lbp = 0;
if (center >= cvGetReal2D(m_cvImage, row-1, col-1))
{
center_lbp += 1;
}
if (center >= cvGetReal2D(m_cvImage, row-1, col))
{
center_lbp += 2;
}
if (center >= cvGetReal2D(m_cvImage, row-1, col+1))
{
center_lbp += 4;
}
if (center >= cvGetReal2D(m_cvImage, row, col-1))
{
center_lbp += 8;
}
if (center >= cvGetReal2D(m_cvImage, row, col+1))
{
center_lbp += 16;
}
if (center >= cvGetReal2D(m_cvImage, row+1, col-1))
{
center_lbp += 32;
}
if (center >= cvGetReal2D(m_cvImage, row+1, col))
{
center_lbp += 64;
}
if (center >= cvGetReal2D(m_cvImage, row+1, col+1))
{
center_lbp += 128;
}
cvSetReal2D(m_cvLBPImage, row, col, center_lbp); //将中心值设置为十进制的lbp值 ,画出每一个lbp值
fout<<“(”<<row<<" “<<col<<”)"<<center_lbp;

}

}

LZ您好:

“ if(row==0&&row==height&&cow==0&&cow==width)
{
center_lbp=0;
}”

您这么改是不行的,请您仔细考虑这里的逻辑。

以及,您这么写代码,真的“能出结果”么?

楼主你太浮躁,

请确定你修复了BUG再考虑优化吧。

你自己都没修复好BUG。还乱说“结果正确“,你糊弄谁啊?

先些正确,再将正确的算法优化。
否则你优化出来的也是“错的更快的代码”。

您觉得呢?

SORRY 二位版主:( ,我想还是明天冷静仔细的重新整理下代码再请教二位,二位教导的是,小弟受用了:(