不好意思,我是CUDA的初入门者。
最近编程中遇到个问题, 一直困扰了2个月。不知道怎么弄。
我写了一个CUDA 子程序,在 C 的主程序中循环调用。 同时把GPU内存的数据输出到CPU。
当循环数小于谋个数(譬如 N=15), 子程序调没问题,数据传递也没问题。
但当循环到大于15时, 数据传递出了问题
显示以下错误:
First-chance exception at 0x000007fefd34aa7d in cudaCubicRotate2D.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0012f9a0…
跪求大虾帮助!!
主程序如下
I’m beginner of Cuda coding. I am being stacked by cudaError_enum at memory location xxxxx;
I deveoped a global cuda functionin, and try to called it in a loop (0=< k <= N), and transfter data out from GPU to CPU.
this function works when k<15. and I can transfter data out from GPU to CPU using cudaMemcpy. When K goes to 15, the cuda function still works. While, cudaMemcpy function does not work, it gives out the error:
“First-chance exception at 0x000007fefd34aa7d in cudaCubicRotate2D.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0012f9a0.”
Could anyone please help me ? I has been stopped by this for couple months. Any help will be highly appreicated.
[codebox]
main code.
…
const dim3 blockSize(16, 16);
const dim3 gridSize(imageSize.x / blockSize.x, imageSize.y / blockSize.y);
for(k = 0; k < 100; k++){
warp_kernel<<<gridSize, blockSize>>>(output, voxel, coordX, coordY, coordZ, imageSize, k, threshold);
cudaMemcpy(OutImage, output, nrOfBytes, cudaMemcpyDeviceToHost);
for(j = 0; j < imageSize.y; j++){
for(i = 0; i < imageSize.x; i++){
OutputImage[k * slice + j * imageSize.x +i] = OutImage[j * imageSize.x + i];
}
}
}
…
function
///// Warp the refernece image into different phase ///////////
global void
warp_kernel(float* output, float* voxel, float* coordX, float* coordY, float* coordZ, uint3 imageSize, uint k, int threshold)
{
//long k = 0;
uint i = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint j = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint tt = __umul24(j, imageSize.x) + i;
long nx, ny, nz;
float temp, temp1, temp2, temp3, temp0;
float *p0, *px, *py, *pz, *px0, *py0, *pz0;
long ix, iy, iz, ixL, ixU, iyL, iyU, izL, izU;
long slice = imageSize.x * imageSize.y;
/////////////// warp the first slice ////////////////////
ixL = max(0, i - threshold);
ixU = min (imageSize.x - 1, i + threshold);
iyL = max(0, j - threshold);
iyU = min (imageSize.y - 1, j + threshold);
izL = max(0, k - threshold);
izU = min (imageSize.z - 1, k + threshold);
ix = (long)i;
iy = (long)j;
iz = (long)k;
temp1 = coordX[iz * slice + iy * imageSize.x + ix];
temp2 = coordY[iz * slice + iy * imageSize.x + ix];
temp3 = coordZ[iz * slice + iy * imageSize.x + ix];
temp = (temp1 - (float)i) * (temp1 - (float)i) + (temp2 - (float)j) * (temp2 - (float)j) + (temp3 - (float)k) * (temp3 - (float)k);
for (nz = izL; nz <= izU; nz++){
px = coordX + (ptrdiff_t)(nz * slice);
py = coordY + (ptrdiff_t)(nz * slice);
pz = coordZ + (ptrdiff_t)(nz * slice);
for (ny = iyL; ny <= iyU; ny++){
px0 = px + (ptrdiff_t)(ny * imageSize.x);
py0 = py + (ptrdiff_t)(ny * imageSize.x);
pz0 = pz + (ptrdiff_t)(ny * imageSize.x);
for (nx = ixL; nx <= ixU; nx++){
float temp11 = px0[nx];
float temp22 = py0[nx];
float temp33 = pz0[nx];
temp0 = (temp11 - i) * (temp11 - i) + (temp22 - j) * (temp22 - j) + (temp33 - k) * (temp33 - k);
if( temp0 < temp ){
ix = nx;
iy = ny;
iz = nz;
temp = temp0;
}
}
}
}
p0 = voxel + (ptrdiff_t)(iz * slice + iy * imageSize.x + ix);
output[tt] = p0[0];
}
';[/codebox]
[ 本帖最后由 tangql2010 于 2010-9-8 06:06 编辑 ]