cuda 计算pi值 结果为负值

代码如下

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// Number of integration bins; main passes it to the kernel as nbin and uses 1.0/NBIN as the step size
#define NBIN 10000000
#define NUM_BLOCK 30 // Number of thread blocks
#define NUM_THREAD 8 // Number of threads per block
// Loop index main uses while adding up the per-thread results
int tid;
// Running total of the per-thread sums; main multiplies it by the step size before printing
float pi = 0;
// Kernel that executes on the CUDA device
/**
 * Adds each thread's share of the midpoint-rule sum of 4/(1+x^2) to sum[idx].
 *
 * Each thread computes its flat index idx = blockIdx.x * blockDim.x + threadIdx.x
 * and handles bins idx, idx + nthreads*nblocks, idx + 2*nthreads*nblocks, ...
 * while the bin number is below nbin.
 *
 * @param sum      Device array indexed by the flat thread index. The kernel
 *                 ADDS to sum[idx], so the caller must zero the array first;
 *                 otherwise the result contains whatever was in memory.
 * @param nbin     Number of bins.
 * @param step     Width of one bin.
 * @param nthreads Threads-per-block value used for the loop stride.
 * @param nblocks  Block-count value used for the loop stride.
 *                 NOTE(review): the stride comes from these two arguments, not
 *                 from blockDim/gridDim, so they are expected to match the
 *                 actual launch configuration - confirm at every call site.
 */
__global__ void cal_pi(float *sum, int nbin, float step, int nthreads, int nblocks)
{
    // Sequential thread index across the blocks
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = nthreads * nblocks;

    // Accumulate in a register and touch global memory once at the end.
    float partial = 0;

    for (int i = idx; i < nbin; i += stride)
    {
        // The literals are intentionally double: i + 0.5 is not exactly
        // representable in float once i reaches 2^23, so the midpoint is
        // formed in double and only then narrowed to float.
        float x = (i + 0.5) * step;
        partial += 4.0 / (1.0 + x * x);
    }

    sum[idx] += partial;
}
// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Main routine that executes on the host.
// Launches cal_pi on NUM_BLOCK x NUM_THREAD threads, copies the per-thread
// sums back, adds them up, scales by the step size and prints the result.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or CUDA call fails.
int main(void)
{
    dim3 dimGrid(NUM_BLOCK, 1, 1);   // Grid dimensions
    dim3 dimBlock(NUM_THREAD, 1, 1); // Block dimensions
    float *sumHost, *sumDev;         // Pointers to host & device arrays
    float step = 1.0 / NBIN;         // Step size
    size_t size = NUM_BLOCK * NUM_THREAD * sizeof(float); // Array memory size

    // Allocate array on host
    sumHost = (float *)malloc(size);
    if (sumHost == NULL)
    {
        fprintf(stderr, "Failed to allocate %lu bytes on the host\n", (unsigned long)size);
        return EXIT_FAILURE;
    }

    // Allocate array on device
    check_cuda(cudaMalloc((void **)&sumDev, size), "cudaMalloc");

    // Initialize array in device to 0. The kernel adds to this array, so
    // skipping this step leaves garbage in the sums (e.g. a negative "pi").
    check_cuda(cudaMemset(sumDev, 0, size), "cudaMemset");

    // Do calculation on device
    cal_pi<<<dimGrid, dimBlock>>>(sumDev, NBIN, step, NUM_THREAD, NUM_BLOCK); // call CUDA kernel
    check_cuda(cudaGetLastError(), "cal_pi launch"); // a launch reports errors only this way

    // Retrieve result from device and store it in host array
    // (this blocking copy also waits for the kernel to finish)
    check_cuda(cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost), "cudaMemcpy");

    // Add up the per-thread sums and scale by the bin width
    for (tid = 0; tid < NUM_THREAD * NUM_BLOCK; tid++)
        pi += sumHost[tid];
    pi *= step;

    // Print results
    printf("PI = %f\n", pi);

    // Cleanup
    free(sumHost);
    check_cuda(cudaFree(sumDev), "cudaFree");
    return 0;
}

结果为负值 郁闷啊 :cry:

你的 sum[idx] += 4.0/(1.0+x*x); 在累加的时候,sum[idx] 没有初始化,里面是未定义的值,所以结果会出错(例如出现溢出或负值)。你在累加之前加上 sum[idx]=0 就可以啦。

你的 main 函数既然有 return 0,最好在 main 前面加上 int,这属于代码规范问题。