各位大侠,我的笔记本电脑显卡是独立显卡 GeForce 8400M G,显存 128MB。我做了试验:GPU 每个线程块中的最大线程数为 512,支持的最大块数为 21056。我用 GPU 对 512×21056 个数各加 1,得到的运行时间为 0.375 秒,其中包括前 100 个数的打印;而我用 CPU 的 for 循环对 512×21056 个数逐个加 1,得到的运行时间为 0.171 秒。我做了很多次试验,GPU 的运行时间都比 CPU 逐个计算的运行时间多出了 1 倍。我就纳闷了:是这项技术并不像提倡的那样成功,还是有其他原因?请各位大侠探讨一下。
能把两个代码写出来吗?
下面这个是用GPU进行的并行运算的代码:(运行时间为:0.375秒)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Simple utility function to check for CUDA runtime errors.
// msg is a caller-supplied label for the operation being checked.
// NOTE(review): this is a forward declaration only; no definition appears in
// the visible listing, so one must be supplied elsewhere for any call to link.
void checkCUDAError(const char* msg);
// Part 2 of 2: implement the kernel
// Adds 1 to every element of d_a, one element per thread. (Despite the name,
// nothing is reversed; the name is kept so existing launches still compile.)
//
// Launch layout: 1D grid of 1D blocks. Thread t of block b handles element
// b * blockDim.x + t.
// Precondition: the kernel performs no bounds check, so d_a must point to
// device memory holding at least gridDim.x * blockDim.x ints.
__global__ void reverseArrayBlock( int *d_a )
{
    // Flat global index of this thread within the 1D launch.
    int dx = blockDim.x * blockIdx.x + threadIdx.x;
    d_a[dx] = d_a[dx] + 1;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Abort with a readable message when a CUDA runtime call reports failure.
// `what` labels the operation so the failing step is identifiable.
static void cudaCheck(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// GPU benchmark: fills an array with 0..N-1 on the host, copies it to the
// device, adds 1 to every element with reverseArrayBlock, copies it back and
// prints the first 100 results.
//
// Two timings are reported:
//  - "the time is ..."  : clock() over the whole program, which includes host
//    allocation/initialisation, cudaMalloc, both PCIe copies and the printing.
//  - "the kernel alone" : CUDA-event time around the kernel launch only.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main( int argc, char** argv)
{
    (void) argc;
    (void) argv;

    clock_t start = clock();

    // 512 threads per block * 21056 blocks = 10,780,672 ints (about 41 MiB).
    const int dimA = 512 * 21056;
    const int numThreadsPerBlock = 512;

    // Ceil-division so a dimA that is not a multiple of the block size is
    // still fully covered by the grid.
    const int numBlocks = (dimA + numThreadsPerBlock - 1) / numThreadsPerBlock;
    printf("%d\n", numBlocks);

    // The kernel has no bounds check, so both buffers are sized to the full
    // grid (numBlocks * numThreadsPerBlock), which is >= dimA.
    const size_t paddedCount = (size_t) numBlocks * numThreadsPerBlock;
    const size_t memSize = paddedCount * sizeof(int);

    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long) memSize);
        return EXIT_FAILURE;
    }

    int *d_a = NULL;
    cudaCheck(cudaMalloc((void **) &d_a, memSize), "cudaMalloc");

    // Initialise every element that will be copied, including grid padding,
    // so the device never reads uninitialised host data.
    for (size_t i = 0; i < paddedCount; ++i)
    {
        h_a[i] = (int) i;
    }
    printf("\n");

    cudaCheck(cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice),
              "memcpy host to device");

    // Events bracket the launch so the kernel time can be separated from the
    // setup and transfer cost that dominates the clock() measurement.
    cudaEvent_t kernelStart, kernelStop;
    cudaCheck(cudaEventCreate(&kernelStart), "cudaEventCreate(start)");
    cudaCheck(cudaEventCreate(&kernelStop), "cudaEventCreate(stop)");

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);

    cudaCheck(cudaEventRecord(kernelStart, 0), "cudaEventRecord(start)");
    reverseArrayBlock<<< dimGrid, dimBlock >>>( d_a );
    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaCheck(cudaGetLastError(), "kernel invocation");
    cudaCheck(cudaEventRecord(kernelStop, 0), "cudaEventRecord(stop)");

    // Block until the device has finished; faults raised while the kernel was
    // running are reported here.
    cudaCheck(cudaDeviceSynchronize(), "kernel execution");

    float kernelMs = 0.0f;
    cudaCheck(cudaEventElapsedTime(&kernelMs, kernelStart, kernelStop),
              "cudaEventElapsedTime");
    cudaCheck(cudaEventDestroy(kernelStart), "cudaEventDestroy(start)");
    cudaCheck(cudaEventDestroy(kernelStop), "cudaEventDestroy(stop)");

    cudaCheck(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost),
              "memcpy device to host");

    // Print (and, in debug builds, verify) the first results: element i was
    // initialised to i, so it must now be i + 1.
    const int shown = dimA < 100 ? dimA : 100;
    for (int i = 0; i < shown; i++)
    {
        assert(h_a[i] == i + 1);
        printf("%d ",h_a[i]);
    }

    cudaCheck(cudaFree(d_a), "cudaFree");
    free(h_a);

    // Reaching this point means every CUDA call above succeeded.
    printf("Correct!\n");

    clock_t finish = clock();
    double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "the time is %f seconds\n", duration );
    printf( "the kernel alone took %f ms\n", kernelMs );
    return 0;
}
下面是CPU进行的循环运算的代码:(运行时间为:0.171秒)
#include <stdio.h>
#include <assert.h>
#include <time.h>
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// CPU benchmark (first version): fills an array with 0..dimA-1, adds 1 to the
// FIRST HALF of it with a plain for loop, prints the first 100 values and
// reports the clock() time for the whole program.
// Returns 0 on success, 1 if the host allocation fails.
// NOTE(review): malloc/free are declared in <stdlib.h>, which this listing's
// include lines do not pull in explicitly.
int main( int argc, char** argv)
{
    (void) argc;
    (void) argv;

    clock_t start = clock();

    // 512 * 21056 = 10,780,672 ints (about 41 MiB), the same size as the GPU run.
    const int dimA = 512 * 21056;
    const size_t memSize = (size_t) dimA * sizeof(int);

    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long) memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // NOTE(review): only dimA/2 elements are incremented here, so this timing
    // covers half the work of the GPU program and is not directly comparable.
    // The bound is left as posted because the accompanying text refers to it.
    for (int j = 0; j < (dimA / 2); ++j)
    {
        h_a[j] = h_a[j] + 1;
    }

    // Print (and, in debug builds, verify) the first results; these indices
    // lie inside the incremented half, so element i must now be i + 1.
    for (int i = 0; i < 100; i++)
    {
        assert(h_a[i] == i + 1);
        printf("%d ",h_a[i]);
    }

    free(h_a);

    printf("Correct!\n");

    clock_t finish = clock();
    double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "the time is %f seconds\n", duration );
    return 0;
}
CPU代码有点问题,以前只计算了一半,今天我改了,全部计算用的时间是0.281秒,也比GPU并行运算要快啊!
代码如下:
#include <stdio.h>
#include <assert.h>
#include <time.h>
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// CPU benchmark (corrected version): fills an array with 0..dimA-1, adds 1 to
// every element with a plain for loop, prints the first 100 values and reports
// the clock() time for the whole program.
// Returns 0 on success, 1 if the host allocation fails.
// NOTE(review): malloc/free are declared in <stdlib.h>, which this listing's
// include lines do not pull in explicitly.
int main( int argc, char** argv)
{
    (void) argc;
    (void) argv;

    clock_t start = clock();

    // 512 * 21056 = 10,780,672 ints (about 41 MiB), the same size as the GPU run.
    const int dimA = 512 * 21056;
    const size_t memSize = (size_t) dimA * sizeof(int);

    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long) memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // The measured work: increment every element once, sequentially.
    for (int j = 0; j < dimA; ++j)
    {
        h_a[j] = h_a[j] + 1;
    }

    // Print (and, in debug builds, verify) the first results: element i was
    // initialised to i, so it must now be i + 1.
    for (int i = 0; i < 100; i++)
    {
        assert(h_a[i] == i + 1);
        printf("%d ",h_a[i]);
    }

    free(h_a);

    printf("Correct!\n");

    clock_t finish = clock();
    double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "the time is %f seconds\n", duration );
    return 0;
}