谢谢版主
我的内核函数是这么写的。按照您的说法,是将这个长度为三的数组改成三个标量参数的形式传入,不再写入全局存储器,即修改成 `__global__ void SquaresKernel(const float *Img, float a1, float a2, float a3, int width, int height, float *dest)` 的形式吗?
原来的内核函数如下:
/**
 * Computes, for every pixel, the squared Euclidean distance between the
 * pixel's three channel values and a three-element reference vector.
 *
 * Launch layout: 2D grid of 2D blocks; each thread handles the pixel at
 * (x, y) = (blockIdx * blockDim + threadIdx). Threads that fall outside the
 * width x height rectangle do nothing, so the grid may overshoot the image.
 *
 * @param Img    device pointer, width*height*3 floats, channels interleaved
 *               per pixel (element index = pixel*3 + channel)
 * @param A      device pointer to the 3 reference values
 * @param width  image width in pixels
 * @param height image height in pixels
 * @param dest   device pointer, width*height floats; receives one distance
 *               per pixel at index y*width + x
 */
__global__ void SquaresKernel(const float *Img, const float *A, int width, int height, float *dest)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < width && y < height)
    {
        // Flat index of the pixel handled by this thread; widened so that
        // tid * 3 cannot overflow a 32-bit int on very large images.
        size_t tid = (size_t)y * (size_t)width + (size_t)x;
        const float *pixel = Img + tid * 3;

        // Each reference value is read once and reused for the square.
        float d0 = A[0] - pixel[0];
        float d1 = A[1] - pixel[1];
        float d2 = A[2] - pixel[2];

        dest[tid] = d0 * d0 + d1 * d1 + d2 * d2;
    }
}
这是我调用内核函数的 host 端代码,您看看这么写,时间是不是会卡在访存上。按照您的说法,把长度为三的向量写成 a1、a2、a3 的形式之后,就不再需要 `cudaStatus=cudaMalloc((void**)&dev_colorM,3*sizeof(float));` 这一句了。
这么做就是将这个长度为三的数组自动写入上述存储器了吗?
源代码如下:
/******************************************************************************/
/**
 * Host wrapper: uploads an interleaved 3-channel image and a 3-element
 * reference vector to device 0, runs SquaresKernel over the image, prints the
 * kernel time measured with CUDA events, accumulates it into the file-level
 * time_total, and copies the per-pixel result back to the host.
 *
 * @param hos_Icols  host pointer, width*height*3 floats (input image)
 * @param hos_colorM host pointer, 3 floats (reference vector)
 * @param width      image width in pixels, must be > 0
 * @param height     image height in pixels, must be > 0
 * @param hos_dest   host pointer, width*height floats (output)
 * @return cudaSuccess on success; otherwise the first CUDA error encountered
 *         (cudaErrorInvalidValue for null pointers or non-positive sizes).
 *         All device buffers and events are released on every path.
 */
cudaError_t Square(const float *hos_Icols, const float *hos_colorM, int width, int height, float *hos_dest)
{
    // Everything is declared (and initialised) up front so that no
    // `goto Error` jumps across an initialisation.
    float *dev_Icols = NULL;
    float *dev_colorM = NULL;
    float *dev_dest = NULL;
    // Timing events
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float elapsedMs = 0.0f;
    // Status of the most recent CUDA call
    cudaError_t cudaStatus = cudaSuccess;
    size_t pixelCount = 0;
    size_t imgBytes = 0;
    size_t refBytes = 3 * sizeof(float);
    size_t destBytes = 0;
    dim3 DimBlock(16, 16, 1);
    dim3 DimGrid(1, 1, 1);

    if (hos_Icols == NULL || hos_colorM == NULL || hos_dest == NULL || width <= 0 || height <= 0)
    {
        fprintf(stderr, "Square: invalid argument");
        return cudaErrorInvalidValue;
    }

    // Size arithmetic is done in size_t to avoid int overflow on large images.
    pixelCount = (size_t)width * (size_t)height;
    imgBytes = pixelCount * 3 * sizeof(float);
    destBytes = pixelCount * sizeof(float);

    // Select device 0
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed");
        goto Error;
    }

    // Allocate device memory (two inputs, one output)
    cudaStatus = cudaMalloc((void **)&dev_Icols, imgBytes);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed");
        goto Error;
    }
    cudaStatus = cudaMalloc((void **)&dev_colorM, refBytes);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed");
        goto Error;
    }
    cudaStatus = cudaMalloc((void **)&dev_dest, destBytes);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed");
        goto Error;
    }

    // Copy the inputs from host to device
    cudaStatus = cudaMemcpy(dev_Icols, hos_Icols, imgBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_colorM, hos_colorM, refBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed");
        goto Error;
    }

    // Make sure all prior device work is finished before timing starts
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize failed");
        goto Error;
    }

    // Create the timing events and mark the start
    cudaStatus = cudaEventCreate(&start);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventCreate failed");
        goto Error;
    }
    cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventCreate failed");
        goto Error;
    }
    cudaStatus = cudaEventRecord(start, 0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventRecord failed");
        goto Error;
    }

    // Launch the kernel: 16x16 threads per block, grid rounded up to cover the image
    DimGrid = dim3((width + DimBlock.x - 1) / DimBlock.x, (height + DimBlock.y - 1) / DimBlock.y, 1);
    SquaresKernel<<<DimGrid, DimBlock>>>(dev_Icols, dev_colorM, width, height, dev_dest);
    // A launch does not return a status; configuration errors show up here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "SquaresKernel launch failed");
        goto Error;
    }

    // Stop timing; synchronising on the stop event also surfaces kernel faults
    cudaStatus = cudaEventRecord(stop, 0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventRecord failed");
        goto Error;
    }
    cudaStatus = cudaEventSynchronize(stop);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventSynchronize failed");
        goto Error;
    }
    cudaStatus = cudaEventElapsedTime(&elapsedMs, start, stop);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventElapsedTime failed");
        goto Error;
    }
    printf("kernel take time:%fms\n", elapsedMs);

    // Accumulate into the running total kept outside this function
    time_total += elapsedMs;
    printf("total time=%fms\n", time_total);

    // Copy the result back; the status must be captured for the check below
    cudaStatus = cudaMemcpy(hos_dest, dev_dest, destBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed");
        goto Error;
    }

Error:
    // Shared cleanup for both the success and the failure path
    if (start != NULL)
    {
        cudaEventDestroy(start);
    }
    if (stop != NULL)
    {
        cudaEventDestroy(stop);
    }
    cudaFree(dev_Icols);
    cudaFree(dev_colorM);
    cudaFree(dev_dest);
    return cudaStatus;
}