本程序实现一个数组相加,分别用GPU和CPU实现,为何不一样了
#include “cuda_runtime.h”
global void AddKernel(float* A, int count,float* B)
{
int i = threadIdx.x;
//*B += A[i];
B = atomicAdd(B,A[i]);
}
int _tmain(int argc, _TCHAR argv)
{
int count = 5;
float * A = new float[count];
for (int i=0;i<count;i++)
{
A[i] = i;
}
printf(“Src\n”);
for (int i=0;i<count;i++)
{
printf(“%.2f\n”,A[i]);
}
float B = 0;
for (int i=0;i<count;i++)
{
B += A[i];
}
printf(“cpu:%.2f\n”,B);
B = 0;
float * _cuda_A = 0;
cudaMalloc((void**)&_cuda_A,countsizeof(float));
float * _cuda_B = 0;
cudaMalloc((void**)&_cuda_B,sizeof(float));cudaMemset(_cuda_B,0,sizeof(float));
cudaError_t error;
error = cudaMemcpy(_cuda_A,A,countsizeof(float),cudaMemcpyHostToDevice);
if(error!=cudaSuccess)
{
return -1;
}
AddKernel<<<1, count>>>(_cuda_A,count,_cuda_B);
cudaMemcpy(&B,_cuda_B,sizeof(float),cudaMemcpyDeviceToHost);
//释放设备上的矩阵
cudaFree(_cuda_B);
cudaFree(_cuda_A);
printf(“gpu:%.2f\n”,B);
delete A;
return 0;
}
输出:
Src
0.00
1.00
2.00
3.00
4.00
cpu:10.00
gpu:4.00
为何GPU输出不是 10了