/*
 * features_4bin
 *
 * Accumulates per-pixel gradient magnitudes into a histogram organised in
 * 4x4-pixel cells, spreading each vote over the (up to) four neighbouring
 * cells with the interpolation weights read from bins4.
 *
 * Launch layout (taken from the indexing below): 1-D grid and 1-D block,
 * blockIdx.x is the pixel column x and threadIdx.x is the pixel row y, i.e.
 * one thread per pixel. The outermost row/column of threads does no work.
 * No shared memory is used.
 *
 * Parameters:
 *   img       - interleaved image, 3 bytes per pixel; only the first byte of
 *               each pixel is read here.
 *   srcWidth  - image width in pixels (used for the row stride srcWidth*3).
 *   srcHeight - image height in pixels (used only to clamp y).
 *   hist      - output histogram, updated with atomicAdd. Indexed as
 *               best_o * (cellsX * cellsY) + cellX * cellsY + cellY where
 *               cellsY = blockDim.x >> 2 and cellsX = gridDim.x >> 2.
 *               NOTE(review): must be zeroed and large enough for every
 *               orientation returned by the texture - confirm at the caller.
 *
 * External names used: bins4 (weight table indexed 0..3), texRef (1-D
 * texture indexed by (255 - dy) * 511 + (dx + 255); presumably a gradient ->
 * orientation-bin lookup table - confirm against its initialisation).
 * Assumes uchar is an unsigned 8-bit type so dx, dy lie in [-255, 255].
 */
__global__ void features_4bin(uchar *img, int srcWidth, int srcHeight, float *hist)
{
    const int y = threadIdx.x;
    const int x = blockIdx.x;

    // Border pixels have no neighbour on one side; they contribute nothing.
    // (The former __syncthreads() was dropped: no shared memory is involved.)
    if (x <= 0 || x >= (int)gridDim.x - 1 || y <= 0 || y >= (int)blockDim.x - 1)
        return;

    const int cellsY     = blockDim.x >> 2;   // cells along y
    const int cellsX     = gridDim.x >> 2;    // cells along x
    const int planeSize  = cellsY * cellsX;   // entries per orientation

    // Interpolation weights and the index of the lower neighbouring cell.
    // ixp / iyp are -1 for the first pixels, meaning "no lower cell".
    const float vx0 = bins4[(x - 1) & 3];
    const int   ixp = (x - 2) >> 2;
    const float vy0 = bins4[(y - 1) & 3];
    const int   iyp = (y - 2) >> 2;
    const float vx1 = 1.0f - vx0;
    const float vy1 = 1.0f - vy0;

    // Pointer to the first channel of the (clamped) pixel.
    const int rowStride = srcWidth * 3;
    const uchar *s = img + min(x, srcWidth - 2) * 3 + min(y, srcHeight - 2) * rowStride;

    const int dy = *(s + rowStride) - *(s - rowStride);   // vertical difference (below - above)
    const int dx = *(s + 3) - *(s - 3);                   // horizontal difference (right - left)
    const float v = sqrtf((float)(dx * dx + dy * dy));    // gradient magnitude

    const int best_o = tex1Dfetch(texRef, (255 - dy) * 511 + (dx + 255));

    // Integer offset of cell (ixp, iyp) in plane best_o; kept as an index so
    // no out-of-range pointer is ever formed.
    const int base = best_o * planeSize + ixp * cellsY + iyp;

    const bool xLoOk = (ixp >= 0);
    const bool yLoOk = (iyp >= 0);
    const bool xHiOk = (ixp < cellsX - 1);
    const bool yHiOk = (iyp < cellsY - 1);

    // Only touch cells that exist. Previously the weight was multiplied by
    // the validity flag, which still issued the atomic on an address outside
    // hist for invalid cells.
    if (xLoOk && yLoOk)
        atomicAdd(hist + base, vx1 * vy1 * v);
    if (xLoOk && yHiOk)
        atomicAdd(hist + base + 1, vx1 * vy0 * v);
    if (xHiOk && yLoOk)
        atomicAdd(hist + base + cellsY, vx0 * vy1 * v);
    if (xHiOk && yHiOk)
        atomicAdd(hist + base + cellsY + 1, vx0 * vy0 * v);
}
感觉很耗时