《CUDA by Example》中说:除非线程块中的每个线程都执行了 __syncthreads(),否则没有任何线程能够执行 __syncthreads() 之后的指令;也就是说,如果有一部分线程没有执行 __syncthreads(),处理器将挂起,程序会崩溃。书中的代码是这样描述的:
(正确的)
// Tree reduction over cachePerBlock: each pass folds the upper half of the
// active range onto the lower half, then halves the range.
while(iHalf!=0)
{
// Only the lower-half threads add; the others skip the work but not the barrier.
if (cacheIndex<iHalf)
{
cachePerBlock[cacheIndex]+=cachePerBlock[cacheIndex+iHalf];
}
// The barrier sits outside the branch, so every thread that runs this loop
// reaches it on every pass; the sums written above are visible before the next pass.
__syncthreads();
iHalf/=2;
}
(不正确的)
// Same tree reduction, but with the barrier moved inside the divergent branch.
while(iHalf!=0)
{
if (cacheIndex<iHalf)
{
cachePerBlock[cacheIndex]+=cachePerBlock[cacheIndex+iHalf];
// Only threads with cacheIndex<iHalf reach this barrier. The CUDA programming
// guide allows __syncthreads() in conditional code only when the condition
// evaluates identically across the whole thread block; otherwise execution is
// likely to hang or produce unintended side effects.
// NOTE(review): a run that finishes with the right answer does not make this
// valid — whether it hangs depends on the GPU architecture and compiler, so the
// behavior is undefined rather than guaranteed to crash.
__syncthreads();
}
iHalf/=2;
}
但是我使用两种方式都运行了一遍,其实是可以的,不知是我错了还是书中的描述错了?
附完整代码:
#include <cuda_runtime.h>
#include <stdio.h>
// Number of elements in each input vector.
#define N 12346
// Smaller of two values. Arguments are parenthesized so that expressions can be
// passed safely; each argument may be evaluated twice, so avoid side effects.
#define mathMin(a,b) ((a)<(b)?(a):(b))
// Threads per block; also the number of shared-memory slots each block reduces over.
#define threadNumPerBlock 256
// Blocks per grid: enough to cover N once, capped at 32.
#define blockNum mathMin(32,(N+threadNumPerBlock-1)/threadNumPerBlock)
// Closed form of 0^2 + 1^2 + ... + x^2. The product is evaluated in the type of x
// and is formed before the division, so pass a 64-bit value when x*(x+1)*(2*x+1)
// would not fit in an int.
#define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
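// With float elements the GPU total does not match the closed-form value exactly:
// the sum (about 1.25e12) needs more than the 24 significant bits a float holds,
// and the GPU adds the terms in a different order than a sequential loop, so the
// rounding differs. Compare float results with a tolerance instead of ==.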
// Computes per-block partial sums of the dot product of pDevA and pDevB.
//
// Launch layout: 1-D grid of 1-D blocks; blockDim.x must not exceed
// threadNumPerBlock, because each thread owns one slot of the shared array.
// Shared memory: threadNumPerBlock ints, statically allocated.
//
// pDevA, pDevB: device arrays holding at least N ints (only read).
// pDevC:        device array holding at least gridDim.x ints; entry blockIdx.x
//               receives that block's partial sum. The caller adds the entries up.
//
// All arithmetic is done in int, so large inputs can overflow the partial sums.
__global__ void CalcInnerProducts(int* pDevA, int* pDevB, int* pDevC)
{
    __shared__ int cachePerBlock[threadNumPerBlock];

    const int cacheIndex = threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Each thread accumulates the products of the elements it owns, stepping by
    // the total number of launched threads so that any grid size covers all of N.
    int temp = 0;
    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N; tid += stride)
    {
        temp += pDevA[tid] * pDevB[tid];
    }
    cachePerBlock[cacheIndex] = temp;

    // Every per-thread sum must be in shared memory before any thread reads
    // another thread's slot.
    __syncthreads();

    // Tree reduction: fold the upper part of the active range onto the lower part
    // until one value remains. Rounding the half up keeps the middle element of an
    // odd-sized range in place, so block sizes need not be a power of two.
    for (int active = blockDim.x; active > 1; )
    {
        const int iHalf = (active + 1) / 2;
        if (cacheIndex < active - iHalf)
        {
            cachePerBlock[cacheIndex] += cachePerBlock[cacheIndex + iHalf];
        }
        // Outside the branch on purpose: all threads of the block must reach the
        // barrier, including those that had nothing to add in this pass.
        __syncthreads();
        active = iHalf;
    }

    // Thread 0 publishes the block's partial sum.
    if (cacheIndex == 0)
    {
        pDevC[blockIdx.x] = cachePerBlock[0];
    }
}
// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills two int vectors (a[i] = i, b[i] = 2*i), lets CalcInnerProducts compute
// per-block partial dot products on the GPU, adds the partials on the host and
// checks the total against a host-side reference.
//
// Returns 0 when the totals agree, 1 on a CUDA error or a mismatch.
int main()
{
    int a[N], b[N], c[blockNum];
    int *pDevA = NULL, *pDevB = NULL, *pDevC = NULL;

    // Reference dot product, accumulated in 64 bits: for N = 12346 the exact
    // value (about 1.25e12) does not fit in a 32-bit int.
    long long expected = 0;
    for (int i = 0; i < N; ++i)
    {
        a[i] = i;
        b[i] = 2 * i;
        expected += (long long)a[i] * b[i];
    }

    // Buffers hold ints, so sizes are computed from sizeof(int).
    bool ok = cudaSucceeded(cudaMalloc((void**)&pDevA, N * sizeof(int)), "cudaMalloc(pDevA)")
           && cudaSucceeded(cudaMalloc((void**)&pDevB, N * sizeof(int)), "cudaMalloc(pDevB)")
           && cudaSucceeded(cudaMalloc((void**)&pDevC, blockNum * sizeof(int)), "cudaMalloc(pDevC)")
           && cudaSucceeded(cudaMemcpy(pDevA, a, N * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(pDevA)")
           && cudaSucceeded(cudaMemcpy(pDevB, b, N * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(pDevB)");

    if (ok)
    {
        CalcInnerProducts<<<blockNum, threadNumPerBlock>>>(pDevA, pDevB, pDevC);
        // A launch reports configuration errors only through cudaGetLastError();
        // the blocking copy that follows surfaces errors raised while the kernel ran.
        ok = cudaSucceeded(cudaGetLastError(), "CalcInnerProducts launch")
          && cudaSucceeded(cudaMemcpy(c, pDevC, blockNum * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    bool matched = false;
    if (ok)
    {
        // The kernel accumulates in 32-bit int, so its partial sums overflow for
        // this N. Compare modulo 2^32 using unsigned arithmetic, which is well
        // defined on the host.
        // NOTE(review): assumes the device's int additions wrap on overflow — confirm.
        unsigned int gpuTotal = 0;
        for (int i = 0; i < blockNum; ++i)
        {
            gpuTotal += (unsigned int)c[i];
        }
        matched = (gpuTotal == (unsigned int)expected);
        if (matched)
        {
            printf("Success!\r\n");
        }
        else
        {
            fprintf(stderr, "Mismatch: GPU total %u, expected %u (both modulo 2^32)\n",
                    gpuTotal, (unsigned int)expected);
        }
    }

    // cudaFree accepts a null pointer, so this is safe after a failed allocation.
    cudaFree(pDevA);
    cudaFree(pDevB);
    cudaFree(pDevC);
    return (ok && matched) ? 0 : 1;
}