版主你好,我从《CUDA by Example》上摘抄了一段代码,稍微做了一些改编,kernel如下:
/**
 * Stencil update kernel: every thread computes one element of `dst` from the
 * four neighbours (and, on the first/last row, the centre value) fetched
 * through the 1D textures texIn / texOut.
 *
 * @param dst     Output buffer, written at the thread's flattened offset.
 * @param dstOut  Selects the source texture: true reads from texIn,
 *                false reads from texOut.
 *
 * NOTE(review): neighbour offsets use DIM as the row stride while `offset`
 * uses blockDim.x * gridDim.x, so the launch grid is assumed to span exactly
 * DIM threads in x -- confirm against the launch configuration.
 */
__global__ void blend_kernel(float *dst, bool dstOut)
{
    // map from threadIdx/BlockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads that fall outside the DIM x DIM domain must not fetch or write:
    // their neighbour offsets would be out of range.
    if (x >= DIM || y >= DIM)
        return;

    int offset = x + y * blockDim.x * gridDim.x;

    // Horizontal neighbours, clamped to the cell itself on the left/right edge.
    int left = offset - 1;
    int right = offset + 1;
    if (x == 0) left++;
    if (x == DIM - 1) right--;

    // Vertical neighbours, clamped to the cell itself on the first/last row.
    int top = offset - DIM;
    int bottom = offset + DIM;
    if (y == 0) top += DIM;
    if (y == DIM - 1) bottom -= DIM;

    float t, l, c, r, b;
    if (dstOut)
    {
        t = tex1Dfetch(texIn, top);
        l = tex1Dfetch(texIn, left);
        r = tex1Dfetch(texIn, right);
        b = tex1Dfetch(texIn, bottom);
        c = tex1Dfetch(texIn, offset);
    }
    else
    {
        t = tex1Dfetch(texOut, top);
        l = tex1Dfetch(texOut, left);
        r = tex1Dfetch(texOut, right);
        b = tex1Dfetch(texOut, bottom);
        // Read the centre from the same texture as its neighbours; the
        // previous code fetched it from texIn here, mixing the two buffers.
        c = tex1Dfetch(texOut, offset);
    }

    if (y == 0 || y == DIM - 1)
    {
        // First and last row use a dedicated boundary formula.
        dst[offset] = (l + r + 4 * t + 4 * b - 4 * c) / 6;
    }
    else
    {
        // y != 0 in this branch, so the division by (float)y / DIM is safe.
        // NOTE(review): top and bottom share the same weight -- confirm that
        // this is intended.
        const double weight = 1.0 - 1.0 / DIM / 2 / ((float)y / DIM);
        dst[offset] = (l + r + weight * b + weight * t) / 4;
        //dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
    }
}
我在声明float变量的那一行设置了断点,命中后单步执行5次,也就是执行完 c = tex1Dfetch(texIn,offset); 之后,Nsight提示"trying to step invalid warp",而且有时屏幕会黑一下然后又恢复,请问这是怎么回事?
另外,除了Nsight自带的说明手册以外,还有什么手册或者书籍是讲CUDA调试的?最好是中文的……看英文的总感觉自己会遗漏或者理解错一些东西……