#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
__global__ void kernel(__int64* d1,double* d2,unsigned long long int*auto_t1,unsigned long long int*auto_t2)
{
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if ( idx<128 )
{
//面积坐标积分点
d2[idx*9+0]=0.5;
d2[idx*9+1]=0.5;
d2[idx*9+2]=0;
d2[idx*9+3]=0;
d2[idx*9+4]=0.5;
d2[idx*9+5]=0.5;
d2[idx*9+6]=0.5;
d2[idx*9+7]=0;
d2[idx*9+8]=0.5;
auto_t1[idx]=unsigned long long int(123.456);
auto_t2[idx]=unsigned long long int(0.0);
atomicAdd(&(auto_t2[idx]),auto_t1[idx]);
d1[0]=__int64(auto_t2[idx]);
}
}
main()
{
__int64* d1;
cudaMalloc((void**)&d1,sizeof(__int64)*128);
cudaMemset(d1,0,sizeof(__int64)*128);
double* d2;
cudaMalloc((void**)&d2,sizeof(double)*128);
cudaMemset(d2,0,sizeof(double)*128);
unsigned long long int *auto_t1;
cudaMalloc((void**)&auto_t1,sizeof(unsigned long long int)*9*128);
cudaMemset(auto_t1,0,sizeof(unsigned long long int)*9*128);
unsigned long long int *auto_t2;
cudaMalloc((void**)&auto_t2,sizeof(unsigned long long int)*128);
cudaMemset(auto_t2,0,sizeof(unsigned long long int)*128);
kernel<<<1,128>>>(d1,d2,auto_t1,auto_t2);
__int64 ha[1];
cudaMemcpy(ha,d1,sizeof(__int64),cudaMemcpyDeviceToHost);
printf("%lld\n",ha[0]);
}
设想,输出的结果应该为123,但是实际输出值为0;
此外,当把kennel里d2值为零的代码注释掉之后,结果就正确了。
d2[idx*9+0]=0.5;
d2[idx*9+1]=0.5;
//d2[idx*9+2]=0;
//d2[idx*9+3]=0;
d2[idx*9+4]=0.5;
d2[idx*9+5]=0.5;
d2[idx*9+6]=0.5;
//d2[idx*9+7]=0;
d2[idx*9+8]=0.5;