一段简单的代码问题

代码是将全局存储器中的 d_text 先拷贝到共享存储器,再从共享存储器拷贝到全局内存中的另一个位置 d_result,但结果不对。麻烦大牛帮我看看,谢谢!

/*
 * Match_Kernel
 *
 * Copies each block's slice of d_text into shared memory and then writes that
 * slice back out to d_result, so on return d_result holds a byte-for-byte copy
 * of the region of d_text covered by the grid.
 *
 * Launch layout: 1-D grid, 1-D block, blockDim.x <= 32 (the shared buffer is
 * sized for 32 threads). Larger blocks return without touching memory.
 * Shared memory: 4096 bytes static per block.
 *
 * Preconditions (not checkable here, there is no length parameter):
 *   - d_text and d_result must each provide at least
 *     gridDim.x * blockDim.x * 128 bytes;
 *   - d_text must be 4-byte aligned, because it is read as 32-bit words
 *     (pointers returned by cudaMalloc satisfy this).
 *
 * d_text   input bytes in global memory, 128 bytes per thread.
 * d_result output words in global memory, 32 words per thread.
 */
__global__ void Match_Kernel(unsigned char *d_text, unsigned int *d_result)
{
    enum {
        kMaxThreads     = 32,                          /* threads the shared buffer is sized for */
        kWordsPerThread = 128 / sizeof(unsigned int)   /* 128 chars per thread == 32 words */
    };

    /* 32 threads per block, each thread handles 128 chars. Declared as words
     * (not chars) so that the 32-bit accesses below are properly aligned. */
    __shared__ unsigned int s_words[kMaxThreads * kWordsPerThread];

    /* Uniform across the block, so no thread is left waiting at the barrier.
     * A larger block would index past the end of s_words. */
    if (blockDim.x > kMaxThreads)
        return;

    const unsigned int tid         = threadIdx.x;
    const unsigned int block_words = blockDim.x * kWordsPerThread;
    /* size_t so the offset cannot wrap for large grids. */
    const size_t       block_base  = (size_t)blockIdx.x * block_words;

    const unsigned int *g_t = (const unsigned int *)d_text;

    /* Stage global -> shared. Consecutive threads read consecutive words;
     * striding by blockDim.x (not a fixed 32) keeps the slice fully covered
     * for any block size up to kMaxThreads. */
    for (unsigned int w = tid; w < block_words; w += blockDim.x)
        s_words[w] = g_t[block_base + w];

    /* Every word must be in shared memory before any thread reads its chunk. */
    __syncthreads();

    /* Shared -> global: each thread writes out its own contiguous
     * 32-word (128-char) chunk. */
    const unsigned int first = tid * kWordsPerThread;
    for (unsigned int i = 0; i < kWordsPerThread; ++i)
        d_result[block_base + first + i] = s_words[first + i];
}

共享存储器超标

谢谢,问题解决