谢谢您。(cuda代码的执行跟我原来想象的不一样,又很难跟踪,每改一次都会出现新的问题)
我现在想想,可能gpu_time_used的值是对的,因为我用cuda调试时是无法看到打印后gpu_time_used值,而查看gpu_time_used是退出cuda调试,用VC自己的调试功能,这已经不是刚才的执行过程了,所以导致结果不同。
但我现在把代码改成cuda并行执行后,代码如下:(随机生成1048567个0-9的数,计算其平方和)
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define DATA_SIZE 1048567
#define THREAD_NUM 256
int data[DATA_SIZE];
bool InitCUDA(){
int count;
cudaGetDeviceCount(&count) ;
if ( count == 0)
{
fprintf(stderr, "There is no device.\n") ;
return false;
}
int i;
for ( i = 0 ;i < count ; i++)
{
cudaDeviceProp prop ;
if ( cudaGetDeviceProperties(& prop ,i) == cudaSuccess)
{
if ( prop.major >= 1)
{
break;
}
}
}
if ( i == count)
{
fprintf(stderr ,"There is no device supporting CUDA1.x.\n");
return false;
}
cudaSetDevice( i );
return true;
}
void generateNumbers(int* data, int size){
for (int i = 0; i < size; i++)
{
data[i] = rand()%10;
}
}
__global__ static void sumofSquares(int* num, int* result, clock_t* gpu_time){
int sum = 0;
const int tid = threadIdx.x;
const int size = DATA_SIZE/THREAD_NUM;
clock_t start;
if(tid == 0)
start = clock();
for(int i = tid ;i < (tid+1)*size; i++){
sum += num[i]*num[i];
}
result[tid] = sum;
clock_t end ;
if( tid == 0 )
{
end = clock();
*gpu_time = end-start;
}
}
int main(){
if (! InitCUDA())
{
return 0;
}
printf("Hello world ,cuda has been initialized.\n");
generateNumbers(data,DATA_SIZE);
int* gpudata;
int* gpu_result;
cudaMalloc((void**)&gpudata, sizeof(int)*DATA_SIZE);
cudaMalloc((void**)&gpu_result, sizeof(int)*THREAD_NUM);
cudaMemcpy(gpudata, data,sizeof(int)*DATA_SIZE,cudaMemcpyHostToDevice);
clock_t* gpu_time = NULL;
cudaMalloc((void**)&gpu_time, sizeof(clock_t)) ;
clock_t gpu_time_used=0;
sumofSquares<<<1, THREAD_NUM, 0>>>(gpudata,gpu_result,gpu_time);
cudaMemcpy(&gpu_time_used, gpu_time, sizeof(clock_t), cudaMemcpyDeviceToHost);
int sum[THREAD_NUM];
cudaMemcpy(sum ,gpu_result,sizeof(int)*THREAD_NUM,cudaMemcpyDeviceToHost);
for ( int i = 1 ;i < THREAD_NUM; i++)
{
sum[0]+=sum[i];
}
printf("sum(gpu)=%d, gpu_time_used=%d\n", sum[0], gpu_time_used);
cudaFree( gpu_time);
cudaFree(gpudata);
cudaFree( gpu_result);
//////////////////////////////////////////////////////////////////////////
clock_t cpu_start = clock();
int cpu_sum = 0;
for (int i = 0 ;i < DATA_SIZE; i++)
{
cpu_sum += data[i]*data[i];
}
clock_t cpu_time_used = clock() - cpu_start;
printf("sum(cpu)=%d, cpu_time_used=%d\n",cpu_sum, cpu_time_used);
return 0;
}
执行结果:
sum(gpu)=-455073962, gpu_time_used=71559570
sum(cpu)=29887568, cpu_time_used=2
难以理解:sum(gpu) 还出现了负数。但我仔细查看了代码,没有发现有不对的地方,请您帮我看下,这段代码问题出在哪里