谢谢哥们。我也看到有人说是kernel运行时间过长……但是在这之前我只启动了八个kernel:
// Launch the four directional gradient kernels. Every launch uses blockCount blocks of
// BLOCK_SIZE threads and is given dLeftData, the image dimensions and one gradient buffer.
// NOTE(review): all four launches (including calcRightGradientD) take dLeftData as input —
// presumably these are four gradient directions of the same image; confirm against the kernels.
calcLeftGradientD<<<blockCount,BLOCK_SIZE>>>(dLeftData, width, height, dLeftGradient);
calcRightGradientD<<<blockCount,BLOCK_SIZE>>>(dLeftData, width, height, dRightGradient);
calcUpGradientD<<<blockCount,BLOCK_SIZE>>>(dLeftData, width, height, dUpGradient);
calcDownGradientD<<<blockCount,BLOCK_SIZE>>>(dLeftData, width, height, dDownGradient);
// Reduce each gradient buffer with calcMaxGradientD. The kernel writes one value per block to
// result[blockIdx.x], so each resultN buffer must hold at least blockCount floats. Its shared
// array has 1024 entries indexed by threadIdx.x, so BLOCK_SIZE must not exceed 1024.
// NOTE(review): the blockCount per-block values still need a final reduction, which is not
// visible in this snippet.
calcMaxGradientD<<<blockCount,BLOCK_SIZE>>>(result1, width, height, dLeftGradient);
calcMaxGradientD<<<blockCount,BLOCK_SIZE>>>(result2, width, height, dRightGradient);
calcMaxGradientD<<<blockCount,BLOCK_SIZE>>>(result3, width, height, dUpGradient);
calcMaxGradientD<<<blockCount,BLOCK_SIZE>>>(result4, width, height, dDownGradient);
其中calcMaxGradientD<<<blockCount,BLOCK_SIZE>>>(result4, width, height, dDownGradient)是我根据《深入浅出》里面的例程改写的,用来求数组中的最大元素,具体代码如下:
global static void calcMaxGradientD(float *result,int width,int height,unsigned char *dGradient)
{
shared int shared[1024];
int i;
shared[threadIdx.x] = 0;
for(i = blockIdx.x * blockDim.x + threadIdx.x;i < width*height;i += blockIdx.x * blockDim.x)
{
shared[threadIdx.x] = dGradient[blockIdx.x * blockDim.x + threadIdx.x];
}
__syncthreads();
if(threadIdx.x < 512)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 512])
shared[threadIdx.x] = shared[threadIdx.x + 512];
}
__syncthreads();
if(threadIdx.x < 256)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 256])
shared[threadIdx.x] = shared[threadIdx.x + 256];
}
__syncthreads();
if(threadIdx.x < 128)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 128])
shared[threadIdx.x] = shared[threadIdx.x+128];
}
__syncthreads();
if(threadIdx.x < 64)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 64])
shared[threadIdx.x] = shared[threadIdx.x+64];
}
__syncthreads();
if(threadIdx.x < 32)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 32])
shared[threadIdx.x] = shared[threadIdx.x+32];
}
__syncthreads();
if(threadIdx.x < 16)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 16])
shared[threadIdx.x] = shared[threadIdx.x+16];
}
__syncthreads();
if(threadIdx.x < 8)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 8])
shared[threadIdx.x] = shared[threadIdx.x+8];
}
__syncthreads();
if(threadIdx.x < 4)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 4])
shared[threadIdx.x] = shared[threadIdx.x + 4];
}
__syncthreads();
if(threadIdx.x < 2)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 2])
shared[threadIdx.x] = shared[threadIdx.x+2];
}
__syncthreads();
if(threadIdx.x < 1)
{
if(shared[threadIdx.x] < shared[threadIdx.x + 1])
shared[threadIdx.x] = shared[threadIdx.x + 1];
}
__syncthreads();
if(threadIdx.x == 0)
{
result[blockIdx.x] = shared[0];
}
}
这个kernel是我按照例程改写的,运算时间应该也不长啊,数组一共有480*272个元素。请指教。