shared Memory问题

system · 2010 年3 月 18 日 02:09

核函数为
// Kernel body exactly as asked about in the question. The data race on
// temp[0] is intentionally left in place: it is what produces the
// unexpected result (4.0 instead of 1000) that the question is about.
const int bid = blockIdx.x;          // block index (not used in this fragment)
const int tid = threadIdx.x;         // thread index within the block
extern __shared__ float temp[];      // dynamic shared memory, sized at launch
if (tid == 0)
{
    temp[0] = 0.0f;
}
// NOTE: there is no __syncthreads() after the initialisation above, and the
// "+=" below is a non-atomic read-modify-write that every thread of the block
// performs on the same shared-memory word, so most increments are lost.
for (int i = tid; i < 1000; i += blockDim.x)
{
    temp[0] += 1.0;
}
我的 blockDim.x 为 256，为什么算出来的 temp[0]=4.0？应该是 1000 才对啊！这是为什么？

system · 2010 年3 月 18 日 02:17

首先，算法需要修改；其次，你没有做适当的同步。

system · 2010 年3 月 18 日 02:28

// Asker's kernel, restored to compilable CUDA. Each block tries to count to
// 1000 in temp[0] and thread 0 stores the block's count to resut[blockIdx.x].
//
// Launch requirements: 1-D blocks, at least sizeof(float) bytes of dynamic
// shared memory, and resut must hold gridDim.x floats in device memory.
//
// KNOWN DEFECT (left as posted, because it is the subject of the question):
// the accumulation into temp[0] is racy, so the stored value is not 1000.
__global__ static void ADDTEST(float *resut)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    extern __shared__ float temp[];   // dynamic shared memory, sized at launch
    if (tid == 0)
    {
        temp[0] = 0.0f;
    }
    // RACE: no __syncthreads() here, so other threads may start adding before
    // thread 0 has written the initial 0.
    for (int i = tid; i < 1000; i += blockDim.x)
    {
        // RACE: non-atomic read-modify-write on one address by all threads;
        // concurrent increments overwrite each other.
        temp[0] += 1.0;
    }
    // RACE: no __syncthreads() before thread 0 reads the result.
    if (tid == 0)
    {
        resut[bid] = temp[0];
    }
}
// Host driver: allocates room for one float per block, launches ADDTEST with
// 10 blocks of 256 threads, and copies the per-block results back to the host.
// Returns 0 on success and 1 if any CUDA call fails.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    float *gpu_result = 0;   // device buffer: one result per block
    float result[10];        // host copy of the per-block results

    // cudaMalloc takes the ADDRESS of the device pointer (void **).
    if (cudaMalloc((void **)&gpu_result, sizeof(float) * 10) != cudaSuccess)
    {
        return 1;
    }

    // Third launch argument = bytes of dynamic shared memory per block.
    ADDTEST<<<10, 256, sizeof(float) * 2>>>(gpu_result);
    cudaError_t status = cudaGetLastError();   // launch-configuration errors

    if (status == cudaSuccess)
    {
        // Blocking copy: waits for the kernel and reports execution errors.
        status = cudaMemcpy(result, gpu_result, sizeof(float) * 10,
                            cudaMemcpyDeviceToHost);
    }

    cudaFree(gpu_result);
    return status == cudaSuccess ? 0 : 1;
}
整个程序是这样的，请问我该怎么改才能使得temp[0]为1000

system · 2010 年3 月 18 日 02:47

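你的结果得到 4.0 是完全符合预期的！所有线程在没有同步、也没有使用原子操作的情况下同时对 temp[0] 做“读-改-写”，同一轮循环里各线程读到的是同一个旧值，最后只有一个线程的写入生效；而每个线程最多循环 4 次（1000/256 向上取整），所以通常得到 4。由于这是数据竞争，理论上结果甚至可能都不是 4！我把我修改后的程序给你！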
#include <stdio.h>

// Counts to 1000 per block using a shared-memory atomic counter; thread 0 of
// each block stores the block's count to resut[blockIdx.x].
//
// Launch requirements: 1-D blocks, at least sizeof(int) bytes of dynamic
// shared memory, and resut must hold gridDim.x ints in device memory.
// Requires hardware support for 32-bit shared-memory atomics
// (compute capability 1.2 or later).
__global__ static void ADDTEST(int *resut)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    extern __shared__ int temp[];   // temp[0] is the block-wide counter

    if (tid == 0)
    {
        temp[0] = 0;
    }
    // Make the initial 0 visible to every thread before anyone increments.
    __syncthreads();

    for (int i = tid; i < 1000; i += blockDim.x)
    {
        // Atomic read-modify-write: concurrent increments are not lost.
        atomicAdd(temp, 1);
    }
    // Wait until every thread has finished incrementing before reading.
    __syncthreads();

    if (tid == 0)
    {
        resut[bid] = temp[0];
    }
}
// Host driver: launches ADDTEST with 10 blocks of 256 threads and prints the
// count produced by each block, one per line.
// Returns 0 on success and 1 if any CUDA call fails.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    int *gpu_result = 0;   // device buffer: one result per block
    int result[10];        // host copy of the per-block results

    // cudaMalloc takes the ADDRESS of the device pointer (void **).
    cudaError_t status = cudaMalloc((void **)&gpu_result, sizeof(int) * 10);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Third launch argument = bytes of dynamic shared memory per block.
    ADDTEST<<<10, 256, sizeof(int) * 2>>>(gpu_result);
    status = cudaGetLastError();   // launch-configuration errors
    if (status == cudaSuccess)
    {
        // Blocking copy: waits for the kernel and reports execution errors.
        status = cudaMemcpy(result, gpu_result, sizeof(int) * 10,
                            cudaMemcpyDeviceToHost);
    }
    cudaFree(gpu_result);

    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for (int i = 0; i < 10; i++)
        printf("%d\n", result[i]);
    return 0;
}

system · 2010 年3 月 18 日 02:49

方法很多，其中一种就是 reduce（归约）操作，代码如下：

// Counts to 1000 per block with a shared-memory tree reduction: each thread
// accumulates its own partial count in temp[tid], the partials are then summed
// pairwise, and thread 0 stores the block total to resut[blockIdx.x].
//
// Launch requirements:
//   - 1-D blocks whose size is a power of two (the halving loop below drops
//     elements otherwise);
//   - blockDim.x * sizeof(float) bytes of dynamic shared memory;
//   - resut must hold gridDim.x floats in device memory.
__global__ static void ADDTEST(float *resut)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    extern __shared__ float temp[];   // one partial sum per thread

    // Each thread touches only its own slot here, so no barrier is needed yet.
    temp[tid] = 0.0f;

    unsigned int i = tid;
    while (i < 1000)
    {
        temp[tid] += 1.0f;   // float literal: avoids promotion to double
        i += blockDim.x;
    }

    // All partial sums must be written before any thread reads a neighbour's
    // slot in the first reduction step.
    __syncthreads();

    for (unsigned int n = (blockDim.x >> 1); n > 0; n >>= 1)
    {
        if (tid < n)
        {
            temp[tid] += temp[tid + n];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
    }

    if (tid == 0)
    {
        resut[bid] = temp[0];
    }
}
// Host driver: launches ADDTEST with 10 blocks of 256 threads, giving each
// block one float of dynamic shared memory per thread, and copies the
// per-block results back to the host.
// Returns 0 on success and 1 if any CUDA call fails.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    float *gpu_result = 0;   // device buffer: one result per block
    float result[10];        // host copy of the per-block results

    // cudaMalloc takes the ADDRESS of the device pointer (void **).
    if (cudaMalloc((void **)&gpu_result, sizeof(float) * 10) != cudaSuccess)
    {
        return 1;
    }

    // Shared-memory size must match the block size: one float per thread.
    ADDTEST<<<10, 256, sizeof(float) * 256>>>(gpu_result);
    cudaError_t status = cudaGetLastError();   // launch-configuration errors

    if (status == cudaSuccess)
    {
        // Blocking copy: waits for the kernel and reports execution errors.
        status = cudaMemcpy(result, gpu_result, sizeof(float) * 10,
                            cudaMemcpyDeviceToHost);
    }

    cudaFree(gpu_result);
    return status == cudaSuccess ? 0 : 1;
}

system · 2010 年3 月 18 日 03:00

运行的结果是32还是不对

system · 2010 年3 月 18 日 03:06

对了，是1000

system · 2010 年3 月 18 日 03:21

多谢各位