以下是我调用 nppiMean_32f_C1R的例子,环境是WIN7 64bit + VS2010 + CUDA5.5:
float pF = new float[20003000];
float dF = NULL;
cudaMalloc((void*)&dF, 20003000sizeof(float));
srand(time(NULL));
for(int i = 0; i < 20003000;i++)
pF[i] = rand()%100;
cudaMemcpy( dF, pF, 20003000 * sizeof(float), cudaMemcpyHostToDevice );
int n=0;
NppiSize size={2000,3000};
nppiMeanGetBufferHostSize_32f_C1R(size, &n);
Npp8u aa=NULL;
cudaMalloc((void*)&aa, n);
Npp64f dMean;
nppiMean_32f_C1R(dF, 2000sizeof(float), size, aa, &dMean);
cudaDeviceSynchronize();
printf(“%f\n”, dMean);
cudaFree(dF);
double d=0;
for(int j=0;j<20003000;j++)
d+=pF[j];
printf(“%f\n”, d/(2000*3000));
delete pF;
GPU和CPU得出的结果不一样,不知道哪里用错了?请高手指点下,谢谢!