cudaMemcpy无法拷贝数据

我运行Udacity 的Introduction to Parallel Programming课程上的一个例子:

#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE  100

#define BLOCK_WIDTH 1000

/*
 * Prints the first `size` ints of `array` to stdout on one line,
 * formatted as "{ a b c }" followed by a newline.
 * With size <= 0 only the braces are printed.
 */
void print_array(int *array, int size)
{
   printf("{ ");
   int idx = 0;
   while (idx < size)
   {
      printf("%d ", array[idx]);
      ++idx;
   }
   printf("}\n");
}

// Kernel: each thread adds 1 to one element of g using a plain
// (non-atomic) read-modify-write. Threads whose global index maps to the
// same slot can overwrite each other's update, so the final counts are
// not guaranteed to equal the number of threads per slot.
// g must point to at least ARRAY_SIZE ints of device memory.
__global__ void increment_naive(int *g)
{
	// flat global index of this thread (1D launch)
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// fold the thread index onto the array: slot is in [0, ARRAY_SIZE)
	const int slot = tid % ARRAY_SIZE;

	// unsynchronized load, add, store
	const int current = g[slot];
	g[slot] = current + 1;
}

// Kernel: each thread adds 1 to one element of g with atomicAdd, so
// concurrent updates to the same slot are all counted.
// g must point to at least ARRAY_SIZE ints of device memory.
__global__ void increment_atomic(int *g)
{
	// flat global index of this thread (1D launch)
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// fold the thread index onto the array: slot is in [0, ARRAY_SIZE)
	const int slot = tid % ARRAY_SIZE;

	// atomic read-modify-write on the chosen slot
	atomicAdd(g + slot, 1);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what)
{
   if (err == cudaSuccess) { return true; }
   fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
   return false;
}

// Launches NUM_THREADS threads that increment an ARRAY_SIZE-element device
// array, copies the result back to the host and prints it with the elapsed
// kernel time.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main(int argc,char **argv)
{   
   GpuTimer timer;
   printf("%d total threads in %d blocks writing into %d array elements\n",
   NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

   // declare and allocate host memory
   int h_array[ARRAY_SIZE];
   const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
 
   // declare, allocate, and zero out GPU memory
   int * d_array = NULL;
   if (!cuda_ok(cudaMalloc((void **) &d_array, ARRAY_BYTES), "cudaMalloc"))
   {
      return 1;
   }
   bool ok = cuda_ok(cudaMemset((void *) d_array, 0, ARRAY_BYTES), "cudaMemset");

   if (ok)
   {
      // launch the kernel - comment out one of these
      timer.Start();
      increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
      //increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);

      // A kernel launch returns no status itself. An invalid configuration
      // (for example BLOCK_WIDTH larger than the device's per-block thread
      // limit) is only visible through cudaGetLastError(); without this
      // check the kernel silently never runs and the zeroed buffer is
      // printed as if it were the result.
      const cudaError_t launch_err = cudaGetLastError();
      timer.Stop();
      ok = cuda_ok(launch_err, "kernel launch");
   }

   if (ok)
   {
      // copy back the array of sums from GPU; this blocking copy also
      // reports errors raised while the kernel was executing
      ok = cuda_ok(cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                   "cudaMemcpy");
   }

   if (ok)
   {
      print_array(h_array, ARRAY_SIZE);
      printf("Time elapsed = %g ms\n", timer.Elapsed());
   }
 
   // free GPU memory allocation and exit
   if (!cuda_ok(cudaFree(d_array), "cudaFree")) { ok = false; }
   return ok ? 0 : 1;
}

发现我的机器最后得出的结果全是0(代码本身没有问题),机器配置是GT240M + vs2008 ,运行结果如图

请问是什么情况导致了这种问题?

楼主您好:

首先说,cuda runtime api是成熟的函数库,不会出现cudaMemcpy突然失效之类的危言耸听的现象。
您遇到这个现象应该首先从自身找问题所在,而不是上去用危言耸听的名字指责无辜的NVIDIA CUDA.

您的问题在于,您的kernel根本就没执行,
请看这里:
increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
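您的BLOCK_WIDTH是1000,超出了贵卡(GT240M)每个block的最大线程数:该卡每个block最多支持512个线程。因此kernel启动失败、根本没有执行,d_array仍保持cudaMemset之后的全0,cudaMemcpy只是把这些0如实拷贝回了主机。您可以在kernel启动之后调用cudaGetLastError()来确认这个启动错误。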

请修正此问题。

感谢来访。

谢谢指正!:slight_smile: