一个很简单的代码,但是用CDUA编程总是达不到预期结果。比如二值化返回值,只有255和0两种。
原始图像大小1024*1024,阈值54, THREAD_X= THREAD_Y=16
global void binaryKernel(
float * imageBinary,
const int imagew,
const int imageh,
const int nThreshold)
{
int x = blockIdx.x * THREAD_X + threadIdx.x;
int y = blockIdx.y * THREAD_Y + threadIdx.y;
*(imageBinary + y * imagew + x) < nThreshold ? 0 : 255;
}
void binaryGPU(
float* pImageBuffer, //输入是原始灰度图像,输出时指向阈值化图像的数据缓冲区
const int nImageHeight, //原始图像的高度
const int nImageWidth, //原始图像的宽度
const int nThreshold) //二值化的阈值
{
float *d_binary;
cutilSafeCall( cudaMalloc((void *)&d_binary, nImageWidth * nImageHeight * sizeof(float)) );
cutilSafeCall( cudaMemcpy(d_binary, pImageBuffer, nImageWidth * nImageHeight sizeof(float), cudaMemcpyHostToDevice));
dim3 thread_N(16,16);
dim3 block_N(nImageWidth / THREAD_X, nImageHeight / THREAD_Y);
binaryKernel<<<block_N, thread_N>>>(d_binary, nImageWidth, nImageHeight, nThreshold);
cutilSafeCall( cudaThreadSynchronize() );
cutilSafeCall( cudaMemcpy(pImageBuffer, d_binary, nImageWidth * nImageHeight* sizeof(float), cudaMemcpyDeviceToHost));
//测试返回值
int i,j;
for (j = 0; j < nImageHeight; j++)
{
for (i =0; i < nImageWidth; i++)
{
assert(*(pImageBuffer + j * nImageWidth + i) == 255 || *(pImageBuffer + j * nImageWidth + i) == 0);
}
}
cudaFree(d_binary);
}