这是一段很简单的异步执行代码:把 data 中的每个数拷贝到 GPU,平方后再拷贝回 sum,但发现结果全是 0,似乎数据根本没有拷进 GPU。代码如下,请问这是为什么?
// Host-side input buffer: filled by GenerateNumbers() and used as the source
// of the host-to-device copies in main().
int data[DATA_SIZE];
// Host-side output buffer: destination of the device-to-host copies in main(),
// then accumulated into final_sum.
int sum[DATA_SIZE];
// Fills the first `size` entries of `number` with values produced by
// rand() % 5, drawing one rand() value per element in ascending index order.
// Does nothing when `size` is zero or negative.
void GenerateNumbers(int *number, int size)
{
    int *out = number;
    // Count down the remaining slots while advancing the write cursor.
    for (int left = size; left > 0; --left) {
        *out++ = rand() % 5;
    }
}
// Kernel: squares, in place, the first DATA_SIZE / nStreams elements of `num`
// and records a clock() reading per block before and after the work.
//
// Parameters:
//   num  - pointer to the chunk of integers to square in place; it is indexed
//          from 0 up to DATA_SIZE / nStreams - 1.
//   time - array with at least 2 * BLOCK_NUM entries; thread 0 of block b
//          writes its start reading to time[b] and its end reading to
//          time[b + BLOCK_NUM].
//
// The loop start and stride use the THREAD_NUM and BLOCK_NUM constants rather
// than blockDim/gridDim, so the launch configuration must be
// <<<BLOCK_NUM, THREAD_NUM>>> for every element to be visited exactly once.
//
// NOTE(review): main() passes the same `time` buffer to every per-stream
// launch, so the launches overwrite each other's timestamps; use a separate
// slice per stream if per-stream timings are needed.
__global__ static void sumOfSquares(int* num, clock_t* time)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // Only one thread per block records the timestamps.
    if (tid == 0) time[bid] = clock();

    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE / nStreams; i += BLOCK_NUM * THREAD_NUM) {
        num[i] = num[i] * num[i];
    }

    if (tid == 0) time[bid + BLOCK_NUM] = clock();
}
int main(void)
{
GenerateNumbers(data, DATA_SIZE);
int* gpudata;
clock_t* time;
int i,size,offset;
cudaMalloc((void**) &gpudata, sizeof(int) * DATA_SIZE);
cudaMalloc((void**) &time, sizeof(clock_t) * BLOCK_NUM * 2);
cudaStream_t stream[nStreams];
for(i = 0;i < nStreams;i ++)
cudaStreamCreate(&stream[i]);
size=DATA_SIZE*sizeof(int)/nStreams;
//数据异步拷贝到设备端
for(i = 0;i < nStreams;i ++)
{
offset = i * DATA_SIZE / nStreams;
cudaMemcpyAsync(gpudata + offset ,data + offset ,size,cudaMemcpyHostToDevice,stream[i]);
}
for(i = 0;i < nStreams;i ++)
{
offset = i * DATA_SIZE / nStreams;
sumOfSquares<<<BLOCK_NUM, THREAD_NUM, 0,stream[i]>>>(gpudata + offset, time);
}
for(i = 0;i < nStreams;i ++)
{
offset = i * DATA_SIZE / nStreams;
cudaMemcpyAsync(sum + offset, gpudata + offset, size,cudaMemcpyDeviceToHost,stream[i]);
}
cudaFree(gpudata);
cudaFree(time);
int final_sum = 0;
for(int i = 0; i < DATA_SIZE; i++) {
final_sum += sum[i];
}