#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
/*
 * Per-block sum reduction in shared memory.
 *
 * Each block copies blockDim.x consecutive ints from g_input into shared
 * memory, then repeatedly folds the upper half of the active range onto the
 * lower half until the block total sits in s_data[0].
 *
 * g_input  - input values, one per launched thread.
 * g_output - receives one partial sum per block at g_output[blockIdx.x].
 * g_mid    - receives, per thread, the final content of that thread's shared
 *            memory slot (debug view of the reduction state).
 *
 * Launch requirements:
 *   - dynamic shared memory of blockDim.x * sizeof(int) bytes (third launch
 *     argument); the size depends on the block, not on the whole input;
 *   - blockDim.x must be a power of two, otherwise the halving loop skips
 *     elements;
 *   - g_input and g_mid must hold gridDim.x * blockDim.x ints, because the
 *     kernel performs no bounds check.
 */
__global__ void sum_kernel(int *g_input, int *g_output, int *g_mid)
{
    /* Unsized extern array: its size comes from the launch configuration. */
    extern __shared__ int s_data[];

    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    s_data[threadIdx.x] = g_input[idx];
    __syncthreads(); /* every slot must be loaded before any thread reads a neighbour */

    for (unsigned int dist = blockDim.x / 2; dist > 0; dist /= 2)
    {
        if (threadIdx.x < dist)
            s_data[threadIdx.x] += s_data[threadIdx.x + dist];
        /* Barrier stays outside the branch so all threads of the block reach it. */
        __syncthreads();
    }

    if (threadIdx.x == 0)
        g_output[blockIdx.x] = s_data[0];

    g_mid[idx] = s_data[threadIdx.x];
}
/* Print a readable message and terminate if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Two-pass sum of n ones on the GPU.
 *
 * Pass 1 reduces each block of block_dim inputs to one partial sum; pass 2
 * reduces the num_blocks partial sums with a single block. After each pass the
 * intermediate buffer is printed, then the partial sums and the final total.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
 */
int main()
{
    unsigned int n = 2048;
    unsigned int num_bytes = n * sizeof(int);
    unsigned int block_dim = 256;
    unsigned int num_blocks = n / block_dim;

    /* Pass 2 launches one block of num_blocks threads, and the kernel's halving
       loop only covers every element when the block size is a power of two. */
    if (n % block_dim != 0 || num_blocks == 0 || (num_blocks & (num_blocks - 1)) != 0)
    {
        fprintf(stderr, "n must be block_dim times a power of two\n");
        return EXIT_FAILURE;
    }

    int *hdata = (int *)malloc(num_bytes);
    int *h_a = (int *)malloc(num_bytes);
    if (hdata == NULL || h_a == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(hdata);
        free(h_a);
        return EXIT_FAILURE;
    }
    for (unsigned int i = 0; i < n; i++)
        h_a[i] = 1;

    int *d_a = NULL, *d_output = NULL, *d_mid = NULL;
    check_cuda(cudaMalloc((void **)&d_a, num_bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void **)&d_output, num_blocks * sizeof(int)), "cudaMalloc(d_output)");
    check_cuda(cudaMalloc((void **)&d_mid, num_bytes), "cudaMalloc(d_mid)");
    check_cuda(cudaMemcpy(d_a, h_a, num_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");

    /* Dynamic shared memory is allocated PER BLOCK, so it must be sized for one
       block (block_dim ints), not for the whole input. Requesting num_bytes
       grows with n and reaches the full 16 KB per-multiprocessor limit of
       compute capability 1.x devices at n = 4096, so the launch fails and the
       output buffers are never written. */
    sum_kernel<<<num_blocks, block_dim, block_dim * sizeof(int)>>>(d_a, d_output, d_mid);
    check_cuda(cudaGetLastError(), "sum_kernel launch (pass 1)");
    check_cuda(cudaMemcpy(hdata, d_mid, num_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hdata, pass 1)");

    printf("The first step:\n");
    for (unsigned int i = 0; i < n; i++)
        printf("%d ", hdata[i]);

    /* Pass 2: one block reduces the partial sums in place; only the first
       num_blocks entries of d_mid are rewritten by this launch. */
    sum_kernel<<<1, num_blocks, num_blocks * sizeof(int)>>>(d_output, d_output, d_mid);
    check_cuda(cudaGetLastError(), "sum_kernel launch (pass 2)");
    check_cuda(cudaMemcpy(hdata, d_mid, num_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hdata, pass 2)");
    check_cuda(cudaMemcpy(h_a, d_output, num_blocks * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(h_a)");

    printf("\nThe second step:\n");
    for (unsigned int i = 0; i < n; i++)
        printf("%d ", hdata[i]);
    printf("\n");
    for (unsigned int i = 0; i < num_blocks; i++)
        printf("%d ", h_a[i]);
    printf("\n");
    printf("%d\n", h_a[0]);

    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_output), "cudaFree(d_output)");
    check_cuda(cudaFree(d_mid), "cudaFree(d_mid)");
    free(hdata);
    free(h_a);
    return 0;
}
当 unsigned int n 取 256、512、1024 或 2048 时,运行结果都正确;但当 n = 4096 时结果就不对了,而且每次运行的结果都不一样:每运行一次,求得的和都比上一次多 4501。
程序是在 Tesla C1060 上运行的。
请教是什么原因?