shared memory使用

我对一幅图像做了2维FFT变换后,得到复数,想对其求模值,分别放在global memory和shared memory中进行计算,但结果是global memory的速度要比shared memory的速度快,不知道是不是我用shared memory方法不对,请高手指教,谢谢!!!

图像大小8192*2048

global memory的代码:
// Launch 512 blocks of 256 threads; the last argument is the number of
// elements to process (the 8192*2048 image).
sqrtCaculate<<<512,256>>>(d_signal,d_p,8192*2048);
/*
 * Global-memory version: for every element of a, stores ComplexSqrt(a[i])
 * in p[i], limited to a maximum of 255.
 *
 * a    - input array of complex values
 * p    - output array, one int per input element
 * size - number of elements to process
 *
 * Each thread starts at its global 1D index and advances by the total number
 * of launched threads until it passes size.
 */
static __global__ void sqrtCaculate(Complex* a, int* p, int size)
{
    const int numThreads = blockDim.x * gridDim.x;
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = threadID; i < size; i += numThreads)
    {
        p[i] = ComplexSqrt(a[i]);
        // Limit the stored value to at most 255.
        if (p[i] > 255)
            p[i] = 255;
    }
}

/*
 * Returns sqrt(b.x^2 + b.y^2) truncated to int and then divided by 2000
 * using integer division.
 */
static __device__ __host__ inline int ComplexSqrt(Complex b)
{
    int c;
    c = (int)sqrt(b.x * b.x + b.y * b.y);
    c = c / 2000;  // integer division: the remainder is discarded
    return c;
}

shared memory代码:

// Launch configuration for the shared-memory version: a 2D grid of
// BLOCK_DIM x BLOCK_DIM thread blocks covering the size_w x size_h image.
BLOCK_DIM=16;
size_w=8192;
size_h=2048;
dim3 grid(size_w/BLOCK_DIM,size_h/BLOCK_DIM,1);
dim3 threads(BLOCK_DIM,BLOCK_DIM,1);
/*
 * Shared-memory version: each thread copies one element of a into a
 * per-block shared tile, computes ComplexSqrt on it, stores the result in p
 * and limits the stored value to at most 255.
 *
 * a      - input array of complex values, indexed as y*size_w + x
 * p      - output array with the same indexing
 * size_w - image width in elements (also used as the row pitch)
 * size_h - image height in elements
 *
 * Expects a 2D launch with BLOCK_DIM x BLOCK_DIM threads per block.
 */
static __global__ void sqrtCaculate(Complex* a, int* p, unsigned int size_w, unsigned int size_h)
{
    __shared__ Complex block[BLOCK_DIM][BLOCK_DIM+1];
    unsigned int xIndex = blockIdx.x*BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y*BLOCK_DIM + threadIdx.y;
    unsigned threadID = yIndex*size_w + xIndex;
    // Bounds are taken from the size parameters rather than fixed constants.
    if ((xIndex<size_w)&&(yIndex<size_h))
    {
        block[threadIdx.x][threadIdx.y] = a[threadID];
    }
    __syncthreads();

    // Each thread reads back exactly the tile element it wrote above.
    if ((xIndex<size_w)&&(yIndex<size_h))
    {
        p[threadID] = ComplexSqrt(block[threadIdx.x][threadIdx.y]);
    }
    __syncthreads();

    // Same bounds check as above so p is never touched outside the image.
    if ((xIndex<size_w)&&(yIndex<size_h)&&(p[threadID]>255))
    {
        p[threadID] = 255;
    }
}
/*
 * Magnitude helper usable from host and device code: computes
 * sqrt(b.x^2 + b.y^2), truncates it to int, then divides by 2000 with
 * integer division.
 */
static __device__ __host__ inline int ComplexSqrt(Complex b)
{
    int c;
    c = (int)sqrt(b.x * b.x + b.y * b.y);
    c = c / 2000;  // integer division: the remainder is discarded
    return c;
}

从代码看应该会慢一点,但是慢得不多吧

最后两个if可以合并在一起,而且最后一个同步是没必要的

是的,1ms左右,能说一下为什么吗?按理说应该是放在shared memory中更快一些,是不是我用错了,能不能说一下怎么去改进,谢谢!!!

谢谢,我想问一下什么时候要同步呢?