贴出全部代码可能能更好地表达我的意思,供各位大侠参考。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include <Windows.h>
#define THREAD_NUM_PER_BLOCK 512 //每个块的线程数
#define MAX_BLOCK 65535 //NVS 5400m允许的最大块数
// Evaluate the polynomial at one sample point per CUDA thread.
// nGrid:       input, index of the grid this sample batch belongs to
// pfSamples:   input, sample point values
// pfVolumes:   output, polynomial value of each sample point
// nSampleNum:  input, number of sample points
// pfPolyCoefs: input, polynomial coefficients
// nCoefNum:    input, number of polynomial coefficients
// Evaluate the polynomial (Horner's rule) at one sample point per thread.
// Launch layout: <<<MAX_BLOCK, THREAD_NUM_PER_BLOCK>>>, once per grid chunk;
// nGrid selects which chunk of MAX_BLOCK*THREAD_NUM_PER_BLOCK samples this
// launch covers.
// nGrid:       index of the grid chunk this launch processes
// pfSamples:   input sample values, nSampleNum elements
// pfVolumes:   output polynomial values, nSampleNum elements
// nSampleNum:  total number of samples (used as the bounds guard)
// pfPolyCoefs: polynomial coefficients, pfPolyCoefs[i] multiplies x^i
// nCoefNum:    number of coefficients (degree + 1), must be >= 1
__global__ void cudaGetPolyVol( int nGrid, float * pfSamples, float * pfVolumes, int nSampleNum, float * pfPolyCoefs, int nCoefNum )
{
    // Flat global sample index: chunk offset + block offset + lane.
    // (Paste had lost the '*' operators: "blockIdx.xTHREAD_NUM_PER_BLOCK".)
    int nTid = nGrid * MAX_BLOCK * THREAD_NUM_PER_BLOCK + blockIdx.x * THREAD_NUM_PER_BLOCK + threadIdx.x;
    if( nTid < nSampleNum )
    {
        float x = pfSamples[ nTid ];
        // Horner's rule: start from the highest coefficient and fold down.
        float y = pfPolyCoefs[ nCoefNum - 1 ];
        for( int i = nCoefNum - 2; i >= 0; i-- )
        {
            y = y * x + pfPolyCoefs[ i ];  // was "yx" in the garbled paste
        }
        pfVolumes[ nTid ] = y;
    }
    // Note: the original trailing __syncthreads() was removed — the kernel
    // uses no shared memory and nothing follows it, so the barrier was dead.
}
// Evaluate a degree-(nCoefNum-1) polynomial at nSampleNum random points,
// once on the GPU and once on the CPU, time both, and print the last 200
// results side by side so the GPU output can be eyeballed against the CPU's.
int main()
{
    int nSampleNum = 2000000 * 6, nCoefNum = 1000;
    // All locals are declared before the first "goto Error" so that no goto
    // jumps over an initialization (ill-formed in C++).
    float * pfSamples    = (float *)malloc( sizeof(float) * nSampleNum );
    float * pfGpuVolumes = (float *)malloc( sizeof(float) * nSampleNum );
    float * pfCpuVolumes = (float *)malloc( sizeof(float) * nSampleNum );
    float * pfCoefs      = (float *)malloc( sizeof(float) * nCoefNum );
    float * pfDeviceSamples = 0;
    float * pfDeviceVolumes = 0;
    float * pfDeviceCoefs   = 0;
    cudaError_t cudaStatus = cudaSuccess;
    int nBlockNum = 0, nGridNum = 0;
    DWORD dwGpuBeginning = 0, dwCpuBeginning = 0;

    if( 0 == pfSamples || 0 == pfGpuVolumes || 0 == pfCpuVolumes || 0 == pfCoefs ) goto Error;

    // Fill the samples with random values in [0, 1) and the coefficients
    // with random values; clear both result buffers.
    // (32767 matches RAND_MAX on MSVC — presumably this targets Windows only.)
    srand( (unsigned)time( NULL ) );
    for( int i = 0; i < nSampleNum; i++ )
    {
        pfSamples[ i ] = (rand() / 32767.0f) * (rand() / 32767.0f);
        pfGpuVolumes[ i ] = 0.0f;
        pfCpuVolumes[ i ] = 0.0f;
    }
    for( int i = 0; i < nCoefNum; i++ ) pfCoefs[ i ] = rand() / 10000.0f;

    // GPU path: one thread per sample. Ceil-divide samples into blocks, then
    // blocks into grid chunks of at most MAX_BLOCK blocks each (the original
    // "+1" form allocated a spurious extra block/chunk on exact multiples).
    nBlockNum = ( nSampleNum + THREAD_NUM_PER_BLOCK - 1 ) / THREAD_NUM_PER_BLOCK;
    nGridNum  = ( nBlockNum + MAX_BLOCK - 1 ) / MAX_BLOCK;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice( 0 );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for the two input vectors and one output vector.
    cudaStatus = cudaMalloc( (void**)&pfDeviceSamples, nSampleNum * sizeof(float) );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc( (void**)&pfDeviceVolumes, nSampleNum * sizeof(float) );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc( (void**)&pfDeviceCoefs, nCoefNum * sizeof(float) );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy( pfDeviceSamples, pfSamples, nSampleNum * sizeof(float), cudaMemcpyHostToDevice );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy( pfDeviceVolumes, pfGpuVolumes, nSampleNum * sizeof(float), cudaMemcpyHostToDevice );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy( pfDeviceCoefs, pfCoefs, nCoefNum * sizeof(float), cudaMemcpyHostToDevice );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch one kernel per grid chunk; each launch covers
    // MAX_BLOCK * THREAD_NUM_PER_BLOCK samples (the kernel bounds-checks).
    printf( "Getting using GPU..." );
    dwGpuBeginning = GetTickCount();
    for( int i = 0; i < nGridNum; i++ )
    {
        cudaGetPolyVol<<<MAX_BLOCK, THREAD_NUM_PER_BLOCK>>>( i, pfDeviceSamples, pfDeviceVolumes, nSampleNum,
            pfDeviceCoefs, nCoefNum );
        // Check for launch-configuration errors immediately.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaGetPolyVol launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }
    // A single synchronize after all launches (rather than one per launch, as
    // before) lets the chunks queue back-to-back and still surfaces any
    // asynchronous execution errors here.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching cudaGetPolyVol!\n", cudaStatus);
        goto Error;
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy( pfGpuVolumes, pfDeviceVolumes, nSampleNum * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // GetTickCount returns a DWORD (unsigned long), so print with %lu.
    printf( "Ok. Time taken %lu ms\n", (unsigned long)(GetTickCount() - dwGpuBeginning) );

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    // CPU reference: same Horner evaluation, sequentially, for timing and
    // result comparison.
    printf( "Getting using CPU..." );
    dwCpuBeginning = GetTickCount();
    for( int i = 0; i < nSampleNum; i++ )
    {
        float x = pfSamples[ i ];
        float y = pfCoefs[ nCoefNum - 1 ];
        for( int k = nCoefNum - 2; k >= 0; k-- )
        {
            y = y * x + pfCoefs[ k ];
        }
        pfCpuVolumes[ i ] = y;
    }
    printf( "Ok. Time taken %lu ms\n", (unsigned long)(GetTickCount() - dwCpuBeginning) );

    // Print the last 200 results so the GPU column can be compared with the
    // CPU column by eye (bit-exact float equality is not expected).
    for( int i = 0; i < 200; i++ )
    {
        int nSeries = nSampleNum - 1 - i;
        printf( "%15.10f,%15.10f,%15.10f\n", pfSamples[ nSeries ], pfGpuVolumes[ nSeries ], pfCpuVolumes[ nSeries ] );
    }
Error:
    // cudaFree(0) is a harmless no-op, so these are safe on every path.
    cudaFree( pfDeviceSamples );
    cudaFree( pfDeviceVolumes );
    cudaFree( pfDeviceCoefs );
    if( pfSamples ) free( pfSamples );
    if( pfGpuVolumes ) free( pfGpuVolumes );
    if( pfCpuVolumes ) free( pfCpuVolumes );
    if( pfCoefs ) free( pfCoefs );
    return 0;
}