// For every pixel, measures how far it can be extended to the left, right,
// up and down before one of the following stops the walk:
//   * the step count reaches L1,
//   * the image border is reached,
//   * colordis(...) for the neighbour is >= the threshold
//     (Tao1 while the step is < L2, Tao2 afterwards).
// The four lengths are written to `arm`.
//
// Launch layout implied by the indexing below: one block per image row
// (blockIdx.x is the row y) and one thread per pixel of that row
// (threadIdx.x is the column x), i.e. gridDim.x == IMG_H and
// blockDim.x == IMG_W, e.g. crossconstruct<<<480, 640>>>(...).
//
// Parameters:
//   imgdat  input image, 3 interleaved bytes per pixel (read here in
//           blue, green, red order), IMG_W pixels per row.
//   arm     output buffer of 4 * IMG_H * IMG_W bytes laid out as four
//           consecutive planes: left, right, up, down.
//
// Shared memory: one row buffer of MAX_W3 bytes. It must hold at least
// 3 * IMG_W bytes (presumably MAX_W3 == 3 * MAX_W -- confirm at its
// definition).
//
// colordis(buf, blue, green, red, idx) is defined elsewhere; presumably it
// returns the colour difference between (blue, green, red) and pixel `idx`
// of `buf` -- confirm against its definition.
//
// NOTE(review): the four stores to `arm` at the end are the only observable
// output of this kernel. If they are removed, or replaced by constants, the
// compiler is free to discard the loops that produce the values, so timings
// taken without them do not measure the actual work.
__global__ void crossconstruct(uchar *imgdat, uchar *arm) //480 640
{
    __shared__ uchar imgrow[MAX_W3];

    const int x = threadIdx.x;              // column inside the row
    const int y = blockIdx.x;               // row handled by this block
    const int tid = y * blockDim.x + x;     // flat pixel index

    // Cooperative load of the whole 3*IMG_W byte row: each thread copies
    // three bytes, one from each third of the row, so that neighbouring
    // threads touch neighbouring addresses.
    imgrow[x]             = imgdat[3 * y * IMG_W + x];
    imgrow[IMG_W + x]     = imgdat[3 * y * IMG_W + IMG_W + x];
    imgrow[2 * IMG_W + x] = imgdat[3 * y * IMG_W + 2 * IMG_W + x];
    __syncthreads();

    // Colour of this thread's own pixel.
    const uchar blue  = imgrow[3 * x];
    const uchar green = imgrow[3 * x + 1];
    const uchar red   = imgrow[3 * x + 2];

    // The step counter is an int so it cannot wrap if L1 exceeds 255; the
    // stored arm length is still truncated to uchar as before.
    int step;

    // Left arm: walk towards column 0 inside the shared row.
    for (step = 1; step < L1 && step <= x; step++)
    {
        if (colordis(imgrow, blue, green, red, x - step) >= (step < L2 ? Tao1 : Tao2))
        {
            break;
        }
    }
    const uchar armLeft = step - 1;

    // Right arm: walk towards column IMG_W-1 inside the shared row.
    for (step = 1; step < L1 && step < IMG_W - x; step++)
    {
        if (colordis(imgrow, blue, green, red, x + step) >= (step < L2 ? Tao1 : Tao2))
        {
            break;
        }
    }
    const uchar armRight = step - 1;

    // Up arm: other rows are not in shared memory, so read the global image.
    for (step = 1; step < L1 && step <= y; step++)
    {
        if (colordis(imgdat, blue, green, red, tid - step * IMG_W) >= (step < L2 ? Tao1 : Tao2))
        {
            break;
        }
    }
    const uchar armUp = step - 1;

    // Down arm: walk towards row IMG_H-1 in the global image.
    for (step = 1; step < L1 && step < IMG_H - y; step++)
    {
        if (colordis(imgdat, blue, green, red, tid + step * IMG_W) >= (step < L2 ? Tao1 : Tao2))
        {
            break;
        }
    }
    const uchar armDown = step - 1;

    // One plane per direction; consecutive threads write consecutive bytes.
    const int plane = IMG_H * IMG_W;
    arm[tid]             = armLeft;
    arm[tid + plane]     = armRight;
    arm[tid + 2 * plane] = armUp;
    arm[tid + 3 * plane] = armDown;
}
如果去掉最后这四行:
arm[tid]=aa[0];
arm[tid+IMG_H*IMG_W]=aa[1];
arm[tid+2*IMG_H*IMG_W]=aa[2];
arm[tid+3*IMG_H*IMG_W]=aa[3];
整个程序的耗时大概是 0.05 ms 左右;加上之后就是 20 ms 左右。
crossconstruct<<<blockH,threadW>>>(imgL,dev_armL); // blockH 为 480,threadW 为 640
cudaMalloc((void **)&dev_armL,H*W*4);
就算是全局变量,效率也没那么低吧?
arm[tid]=aa[0];
arm[tid+IMG_H*IMG_W]=aa[0];
arm[tid+2*IMG_H*IMG_W]=aa[0];
arm[tid+3*IMG_H*IMG_W]=aa[0];
换成这样,耗时大概是 6 ms。
再换成这样:
arm[tid]=1;
arm[tid+IMG_H*IMG_W]=1;
arm[tid+2*IMG_H*IMG_W]=1;
arm[tid+3*IMG_H*IMG_W]=1;
耗时是 0.079 ms。
快要疯了,求救!