我自己定义了一个结构体typedef struct BJLabel { int flag; int row; int xs; int xe;}, 想实现的功能是把一个矩阵里每行连续的几个数用这几个数的最小值替换,row表示在矩阵的行数,xs表示这几个连续排列的数的起始位置,xe表示这几个连续数结束的位置,我把这个计算过程放在gpu中运算,但传回cpu端结果却不正确
cpu代码
void compareResult(unsigned intdata1,BJLabel c,unsigned int w,unsigned int h,unsigned int step,unsigned intnum )
{
#define data1(x,y) data1[xstep+y]
/***将一行中连续的非零数统一成一个数/
int t=0;
int numflag=0;
int midflag=0;
for(i=0;i<h;i++)
{
for(j=1;j<w;j++)
{
if(data1(i,j-1)!=0)
{
numflag=numflag+1;
midflag=j-1;
if((data1(i,j)-data1(i,j-1))==(t+1))
{
numflag=numflag-1;
data1(i,j)=data1(i,j-1);
t=t+1;
}
else
{
c[numflag].row=i;
c[numflag].xs=midflag-t;
c[numflag].xe=midflag;
c[numflag].flag=1;
t=0;
}
}
}
}
*num=numflag;
}
gpu代码:
global void CCL(unsigned int* d,BJLabel*c,unsigned int w,unsigned int h,unsigned int step,unsigned int num)
{
const unsigned int tx=threadIdx.x;
const unsigned int ty=threadIdx.y;
const unsigned int bx=blockIdx.x;
const unsigned int by=blockIdx.y;
const unsigned int row=blockDim.y*by+ty;
const unsigned int col=blockDim.x*bx+tx;
int t=0;
int numflag=0;
int midflag=0;
int i,j;
for(i=1;i<w;i++)
{
if(row<h && col<w)
{
if(d[rowstep+i-1]!=0)
{
numflag=numflag+1;
midflag=i-1;
if((d[rowstep+i]-d[rowstep+i-1])==(t+1))
{
numflag=numflag-1;
d[rowstep+i]=d[row*step+i-1];
t=t+1;
}
else
{
c[numflag].row=row;
c[numflag].xs=midflag-t;
c[numflag].xe=midflag;
c[numflag].flag=1;
t=0;
}
}
}
}
虽然接触CUDA也有大半年了,感觉还是没入门,请高手指教一下究竟错在哪里,谢谢!