代码是将全局存储器中的d_text先拷贝到共享存储器,再从共享存储器考到全局内存另一个地方d_result,但结果不对麻烦大牛帮我看看,谢谢
global void Match_Kernel(unsigned char *d_text, unsigned int *d_result)
{
shared unsigned char tmp[32][128];//每个block32个线程,每个线程处理128个char
unsigned int tid = threadIdx.x;
unsigned int bid = blockIdx.x;
unsigned int *s_p = (unsigned int *)tmp
unsigned int *g_t = (unsigned int *)d_text;
unsigned int *g_r = (unsigned int *)d_result;
unsigned int glb_index, loc_index;
loc_index = tid;
glb_index = bid*blockDim.x*32 + loc_index;
for(int i = 0; i < 32; i++)
{
s_p[loc_index] = g_t[glb_index];
loc_index += 32;//128/sizeof(int)=32
glb_index += 32;
}
__syncthreads();
loc_index = tid*32;
glb_index = bid*blockDim.x*32 + loc_index;
for(int i = 0; i < 32; i++)
{
g_r[glb_index] = s_p[loc_index];
loc_index ++;
glb_index ++;
}
}