各位同事您好,想请教一个关于CUDA归约的问题

在学习cuda的过程中遇到了一个有关于并行归约的困难,希望有朋友能解答一下,不胜感激。我遇到的问题如下:在展开归约时(展开因子为8),末尾的几个数据总是丢失。导致我的归约结果比正确值小一些。代码如下:主机端:
cudaMemcpy(dev_p, p, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_re12, re12, N * sizeof(float), cudaMemcpyHostToDevice);
RN << <blocksPerGrid/8, threadsPerBlock >> >(dev_p, dev_re12, dev_partial_rn);
cudaMemcpy(partial_rn, dev_partial_rn, (blocksPerGrid/8) * sizeof(float), cudaMemcpyDeviceToHost);
rn = 0;
for (int i = 0; i<blocksPerGrid/8; i++)
{
rn += partial_rn[i];
}
其中blocksPerGrid=256,N=34329.
设备端:
global void RN(float *p, float *re12, float *rn)
{
shared float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x * 8;
int cacheIndex = threadIdx.x;
float temp = 0;
if(tid + 7 * blockDim.x < N)
{
float a1 = p[tid] * re12[tid];
float a2 = p[tid + blockDim.x] * re12[tid + blockDim.x];
float a3 = p[tid + 2 * blockDim.x] * re12[tid + 2 * blockDim.x];
float a4 = p[tid + 3 * blockDim.x] * re12[tid + 3 * blockDim.x];
float a5 = p[tid + 4 * blockDim.x] * re12[tid + 4 * blockDim.x];
float a6 = p[tid + 5 * blockDim.x] * re12[tid + 5 * blockDim.x];
float a7 = p[tid + 6 * blockDim.x] * re12[tid + 6 * blockDim.x];
float a8 = p[tid + 7 * blockDim.x] * re12[tid + 7 * blockDim.x];
temp = a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8;
}
cache[cacheIndex] = temp;
// synchronize threads in this block
__syncthreads();
//unrolling warp
if (blockDim.x >= 1024 && cacheIndex < 512)
{
cache[cacheIndex] += cache[cacheIndex + 512];
}
__syncthreads();
if (blockDim.x >= 512 && cacheIndex < 256)
{
cache[cacheIndex] += cache[cacheIndex + 256];
}
__syncthreads();
if (blockDim.x >= 256 && cacheIndex < 128)
{
cache[cacheIndex] += cache[cacheIndex + 128];
}
__syncthreads();
if (blockDim.x >= 128 && cacheIndex < 64)
{
cache[cacheIndex] += cache[cacheIndex + 64];
}
__syncthreads();
if (cacheIndex < 32)
{
volatile float *vcache = cache;
vcache[cacheIndex] += vcache[cacheIndex + 32];
vcache[cacheIndex] += vcache[cacheIndex + 16];
vcache[cacheIndex] += vcache[cacheIndex + 8];
vcache[cacheIndex] += vcache[cacheIndex + 4];
vcache[cacheIndex] += vcache[cacheIndex + 2];
vcache[cacheIndex] += vcache[cacheIndex + 1];
}
if (cacheIndex == 0)
rn[blockIdx.x] = cache[0];
}

被这个问题困扰很久了,课题止步不前,请有能力的前辈不吝赐教,再次感谢~!