cpu和gpu(CUDA)的性能比较以及优化使用的探讨

system · 2009 年11 月 18 日 03:01

各位大侠，我的笔记本电脑显卡是独立的GF8400MG，显存128MB。我做了试验：GPU每个线程块的最大线程数为512，支持的最大块数为21056。我用GPU对512×21056个数各加1，得到的运行时间为0.375秒，其中包括前100个数的打印；而我用CPU的for循环对512×21056个数逐个加1，得到的运行时间为0.171秒。我做了很多试验，GPU的运行时间都比CPU逐个计算的运行时间多出1倍。我就纳闷了，是这项技术本身不成功，还是有其他原因？请各位大侠探讨一下。

system · 2009 年11 月 18 日 03:36

能把两个代码写出来吗？

system · 2009 年11 月 18 日 03:39

下面这个是用GPU进行的并行运算的代码：（运行时间为：0.375秒）
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);

// Part 2 of 2: implement the kernel
//
// Adds 1 to one element of d_a per thread. Despite its name, this kernel does
// not reverse anything.
//
// Launch layout: 1D grid of 1D blocks. The thread with block index blockIdx.x
// and thread index threadIdx.x updates d_a[blockDim.x * blockIdx.x + threadIdx.x].
//
// Precondition: d_a must point to at least gridDim.x * blockDim.x ints. The
// kernel receives no element count, so it cannot bounds-check; the caller has
// to size the buffer to cover every launched thread.
__global__ void reverseArrayBlock(int *d_a)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;

    d_a[idx] += 1;
}

////////////////////////////////////////////////////////////////////////////////
// Helper: report a failed CUDA runtime call
////////////////////////////////////////////////////////////////////////////////
// Prints a diagnostic naming `what` to stderr when `err` is not cudaSuccess.
// Returns true when the call succeeded, false otherwise.
static bool cudaCallSucceeded(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Fills an array with 0..dimA-1 on the host, increments every element on the
// GPU with reverseArrayBlock, copies the result back, prints the first 100
// values and reports the elapsed time.
//
// Returns 0 when every CUDA call succeeded and every element equals its
// index + 1; returns 1 otherwise.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // NOTE: the timed region spans host allocation and initialisation, device
    // allocation, both copies, the kernel and the printing of 100 values, so
    // the reported time is a whole-program time, not a kernel time.
    const clock_t start = clock();

    // define grid and block size
    const int numThreadsPerBlock = 512;
    const int dimA = 512 * 21056;   // 10,780,672 elements (~41 MiB of int)

    // Part 1 of 2: compute number of blocks needed based on array size and
    // desired block size. Round up so a size that is not a multiple of the
    // block size is still fully covered.
    const int numBlocks = (dimA + numThreadsPerBlock - 1) / numThreadsPerBlock;
    printf("%d\n", numBlocks);

    // The kernel has no bounds check, so the device buffer is padded to a whole
    // number of blocks; the host buffer and the copies use the exact size.
    const size_t hostBytes   = (size_t)dimA * sizeof(int);
    const size_t deviceBytes = (size_t)numBlocks * numThreadsPerBlock * sizeof(int);

    // allocate host memory
    int *h_a = (int *) malloc(hostBytes);
    if (h_a == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)hostBytes);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // allocate device memory and copy host array to device array
    int *d_a = NULL;
    bool ok = cudaCallSucceeded(cudaMalloc((void **) &d_a, deviceBytes),
                                "cudaMalloc");
    ok = ok && cudaCallSucceeded(
                   cudaMemcpy(d_a, h_a, hostBytes, cudaMemcpyHostToDevice),
                   "host-to-device copy");

    // launch kernel
    if (ok)
    {
        dim3 dimGrid(numBlocks);
        dim3 dimBlock(numThreadsPerBlock);
        reverseArrayBlock<<<dimGrid, dimBlock>>>(d_a);

        // A bad launch configuration is only visible through cudaGetLastError.
        ok = cudaCallSucceeded(cudaGetLastError(), "kernel launch");
    }

    // block until the device has completed; execution faults surface here
    ok = ok && cudaCallSucceeded(cudaDeviceSynchronize(), "kernel execution");

    // device to host copy
    ok = ok && cudaCallSucceeded(
                   cudaMemcpy(h_a, d_a, hostBytes, cudaMemcpyDeviceToHost),
                   "device-to-host copy");

    // show the first values returned to the host
    if (ok)
    {
        const int shown = dimA < 100 ? dimA : 100;
        for (int i = 0; i < shown; i++)
        {
            printf("%d ", h_a[i]);
        }
    }

    // free device memory (d_a is still NULL here if cudaMalloc failed)
    (void)cudaFree(d_a);

    // Stop the clock before the full verification pass so that the check does
    // not inflate the measured time.
    const clock_t finish = clock();

    // verify the data returned to the host is correct
    for (int i = 0; ok && i < dimA; ++i)
    {
        if (h_a[i] != i + 1)
        {
            fprintf(stderr, "\nmismatch at index %d: got %d, expected %d\n",
                    i, h_a[i], i + 1);
            ok = false;
        }
    }

    // free host memory
    free(h_a);

    if (ok)
    {
        printf("Correct!\n");
    }

    const double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "the time is %f seconds\n", duration );
    return ok ? 0 : 1;
}

下面是CPU进行的循环运算的代码：（运行时间为：0.171秒）
#include <stdio.h>
#include <assert.h>
#include <time.h>

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// CPU baseline: fills an array with 0..dimA-1, increments elements in a plain
// for loop, prints the first 100 values and reports the elapsed time.
//
// NOTE: the increment loop in this listing deliberately still stops at
// dimA / 2, i.e. only the first half of the array is updated. That is the
// mistake the author acknowledges in the follow-up post; the corrected listing
// further down loops over all dimA elements.
//
// Returns 0 on success, 1 when the host allocation fails.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // The timed region covers allocation, initialisation, the increment loop
    // and the printing of 100 values.
    const clock_t start = clock();

    const int dimA = 512 * 21056;   // 10,780,672 elements (~41 MiB of int)

    // allocate host memory
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // Increment: first half of the array only (see NOTE above).
    const int halfCount = dimA / 2;
    for (int j = 0; j < halfCount; ++j)
    {
        h_a[j] += 1;
    }

    // show the first values
    const int shown = dimA < 100 ? dimA : 100;
    for (int i = 0; i < shown; i++)
    {
        printf("%d ", h_a[i]);
    }

    // free host memory
    free(h_a);

    printf("Correct!\n");
    const clock_t finish = clock();
    const double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "the time is %f seconds\n", duration );
    return 0;
}

system · 2009 年11 月 18 日 03:40

CPU代码有点问题，以前只计算了一半，今天我改了，全部计算用的时间是0.281秒，也比GPU并行运算要快啊！
代码如下：
#include <stdio.h>
#include <assert.h>
#include <time.h>

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// CPU baseline (corrected version): fills an array with 0..dimA-1, increments
// every element in a plain for loop, prints the first 100 values and reports
// the elapsed time.
//
// Returns 0 when every element equals its index + 1, and 1 when the host
// allocation fails or the verification finds a mismatch.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // The timed region covers allocation, initialisation, the increment loop
    // and the printing of 100 values.
    const clock_t start = clock();

    const int dimA = 512 * 21056;   // 10,780,672 elements (~41 MiB of int)

    // allocate host memory
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // Increment every element.
    for (int j = 0; j < dimA; ++j)
    {
        h_a[j] += 1;
    }

    // show the first values
    const int shown = dimA < 100 ? dimA : 100;
    for (int i = 0; i < shown; i++)
    {
        printf("%d ", h_a[i]);
    }

    // Stop the clock before the full verification pass so that the check does
    // not inflate the measured time.
    const clock_t finish = clock();

    // verify that every element was incremented exactly once
    int ok = 1;
    for (int i = 0; ok && i < dimA; ++i)
    {
        if (h_a[i] != i + 1)
        {
            fprintf(stderr, "\nmismatch at index %d: got %d, expected %d\n",
                    i, h_a[i], i + 1);
            ok = 0;
        }
    }

    // free host memory
    free(h_a);

    if (ok)
    {
        printf("Correct!\n");
    }

    const double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "the time is %f seconds\n", duration );
    return ok ? 0 : 1;
}