我编了一个程序来验证 CUDA 的加速效果。当 num_threads=32 时,执行结果和 CPU 的差不多,但耗时是 CPU 的两倍;改为 64 时耗时约 3000 ms,而且执行下面标红的部分时会报错。按照版主以前对同类问题的解释,这是内核崩溃。如果把 num_threads 改为 512,则直接提示执行时间过长。我尝试过把内核里四重循环中的一重拿到内核外面,但结果还是一样。
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
// Integration geometry. These are plain integer macros; the device code
// divides them by a float subdivision count, so no integer truncation occurs
// there.
// H: its square is added to the in-plane squared distance in Find1 --
// presumably the separation between the two planes (TODO confirm).
#define H 5
// Rr: scales the k index in Find1 (the "r" circle per the kernel's loop comment).
#define Rr 5
// R: scales the i index in Find1 (the "R" circle per the kernel's loop comment).
#define R 10
// Pi: double-precision literal cut off after four decimals; any expression
// that uses it is evaluated in double unless it is narrowed first.
#define Pi 3.1415
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples
using std::cout;
using std::endl;
using std::cin;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
// Host driver, defined further down in this file.
void runTest(int argc, char **argv);
// CPU reference routine with C linkage. Its only call site in the visible
// code is commented out.
extern "C"
void computeGold(float *reference, float *idata, const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
//! Integrand evaluated at one pair of sample points.
//!
//! With N = blockDim.x subdivisions, the first point lies at radius R*i/N and
//! angle 2*Pi*j/N, the second at radius Rr*k/N and angle 2*Pi*n/N. S is the
//! squared distance between them with H*H added, and the result is
//! H*H / (Pi * S * S).
//! NOTE(review): this looks like the differential view-factor integrand
//! between two parallel coaxial discs a distance H apart -- confirm.
//!
//! All arithmetic is done in single precision (cosf/sinf, Pi narrowed once);
//! the double-precision cos/sin the bare Pi literal would otherwise select
//! are far slower on most GPUs.
//!
//! @param i radial index of the first point,  1..N
//! @param j angular index of the first point, 1..N
//! @param k radial index of the second point, 1..N
//! @param n angular index of the second point, 1..N
//! @return value of the integrand for this pair of points
////////////////////////////////////////////////////////////////////////////////
__device__ float Find1(int i, int j, int k, int n)
{
    const float num = blockDim.x;        // subdivision count N, as float
    const float pi = Pi;                 // narrow the double literal once
    const float step = 2.0f * pi / num;  // angular increment

    // R and Rr are integer macros: divide by the float count first so the
    // division is carried out in floating point.
    const float r1 = R / num * i;
    const float r2 = Rr / num * k;

    const float x1 = r1 * cosf(step * j);
    const float y1 = r1 * sinf(step * j);
    const float x2 = r2 * cosf(step * n);
    const float y2 = r2 * sinf(step * n);

    const float dx = x1 - x2;
    const float dy = y1 - y2;
    const float S = dx * dx + dy * dy + H * H;

    return H * H / (pi * S * S);
}
////////////////////////////////////////////////////////////////////////////////
//! Adds the contribution of radial index i to the running total in *save.
//!
//! Launch contract:
//!  - exactly ONE block of N = blockDim.x threads (any N >= 1, not only powers
//!    of two); every block adds the complete sum to *save, so more than one
//!    block would count the contribution several times;
//!  - dynamic shared memory of at least N * sizeof(float) bytes;
//!  - launched once per i = 1..N, in order, on one stream: the i == 1 launch
//!    zeroes *save and the later launches accumulate into it.
//!
//! Thread t owns angular index n = t + 1. It sums Find1(i, j, k, n) weighted by
//! the radius Rr*k/N over all j and k in registers, and the block then performs
//! a single shared-memory reduction. The weights that do not depend on j, k or
//! n are applied once to the reduced value. Compared with reducing inside the
//! j/k loops this removes N*N block-wide reductions per launch; the summation
//! order differs, so results may differ in the last bits.
//!
//! @param i    radial index handled by this launch, 1..N
//! @param save device pointer to the single float accumulator
////////////////////////////////////////////////////////////////////////////////
__global__ void
testKernel(int i, float *save)
{
    // shared memory
    // the size is determined by the host application
    extern __shared__ float sdata1[];

    // access thread id
    const int tid = threadIdx.x;
    // access number of threads in this block
    const int count = blockDim.x;
    // float copy: R and Rr are integer macros, so dividing them by an int
    // count would truncate
    const float num_threads = blockDim.x;

    const float pi = Pi;                              // narrow the double literal once
    const float angleStep = 2.0f * pi / num_threads;  // step for both angular indices
    const float smallStep = Rr / num_threads;         // radial step scaled by Rr
    const float largeStep = R / num_threads;          // radial step scaled by R

    if (i == 1 && tid == 0)
        *save = 0;

    // Per-thread partial sum over j (angle on the R circle) and k (radius on
    // the r circle). The inner sum is kept separate so that small terms are
    // not added directly to a large running total.
    float threadSum = 0.0f;
    for (int j = 1; j <= count; j++)
    {
        float ringSum = 0.0f;
        for (int k = 1; k <= count; k++)
        {
            ringSum += Find1(i, j, k, tid + 1) * (smallStep * k);
        }
        threadSum += ringSum;
    }

    sdata1[tid] = threadSum;
    __syncthreads();

    // Tree reduction valid for any block size: the upper part of the active
    // range is folded onto the lower part, rounding the split point up so an
    // odd element is carried over instead of being dropped. The loop bounds
    // are uniform across the block, so every thread reaches each barrier.
    for (int active = count; active > 1; )
    {
        const int half = (active + 1) / 2;
        if (tid + half < active)
        {
            sdata1[tid] += sdata1[tid + half];
        }
        __syncthreads();
        active = half;
    }

    // write data to global memory
    if (tid == 0)
    {
        // remaining weights: angle step for n, radial step for k,
        // angle step for j, and radius * radial step for i
        *save += sdata1[0] * angleStep * smallStep * angleStep
                 * (largeStep * i) * largeStep;
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // All work happens in runTest(), which ends the process through exit();
    // the return below only documents the nominal exit status.
    runTest(argc, argv);
    return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Host driver: reads the subdivision count N, launches testKernel once per
//! radial index i = 1..N (one block of N threads each), copies the accumulated
//! sum back, and prints the elapsed time and the sum divided by Pi*R*R.
//!
//! The thread count is validated against the device limit before any launch.
//! The device is synchronized right after the launches so that a kernel fault
//! (for example a launch aborted by the driver's run-time limit) is reported
//! there instead of at the later cudaMemcpy.
//!
//! @param argc command-line argument count, forwarded to findCudaDevice
//! @param argv command-line arguments; argv[0] is printed in the banner
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char **argv)
{
    bool bTestResult = true;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, devID));

    unsigned int num_threads = 0;
    cout<<"请输入线程数"<<endl;
    cin>>num_threads;

    // One block carries all N threads, so N must fit the per-block limit;
    // 0 or an unreadable value cannot be launched at all.
    if (!cin || num_threads == 0 ||
        num_threads > (unsigned int) prop.maxThreadsPerBlock)
    {
        fprintf(stderr, "Invalid thread count: expected a value in 1..%d\n",
                prop.maxThreadsPerBlock);
        exit(EXIT_FAILURE);
    }

    if (prop.kernelExecTimeoutEnabled)
    {
        // Each launch does on the order of N*N integrand evaluations per
        // thread, so large N can exceed the limit on such devices.
        printf("Note: this device enforces a kernel run-time limit; "
               "a launch that runs too long is aborted by the driver.\n");
    }

    // one float of dynamic shared memory per thread
    const unsigned int mem_size = sizeof(float) * num_threads;

    // The kernel takes no input array, so the SDK template's host/device
    // input buffers are not needed here.

    // allocate device memory for result
    float *save = NULL;
    checkCudaErrors(cudaMalloc((void **) &save, sizeof(float)));

    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // setup execution parameters
    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);

    // execute the kernel: one launch per radial index, in order (the i == 1
    // launch zeroes the accumulator on the device)
    for (unsigned int i = 1; i <= num_threads; i++)
    {
        testKernel<<< grid, threads, mem_size >>>((int) i, save);
        // check if kernel launch generated an error (does not block)
        getLastCudaError("Kernel execution failed");
    }

    // surface asynchronous execution errors here, not at the copy below
    checkCudaErrors(cudaDeviceSynchronize());

    // copy result from device to host
    float h_odata = 0.0f;
    checkCudaErrors(cudaMemcpy(&h_odata, save, sizeof(float), cudaMemcpyDeviceToHost));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // normalize the accumulated sum by Pi*R*R
    const float h_out = h_odata / (Pi * R * R);
    std::cout<<h_out<<endl;

    // Regression/reference comparison from the SDK template, left disabled:
    // no CPU reference is computed in this function.
    /*
    if (checkCmdLineFlag(argc, (const char **) argv, "regression"))
    {
        // write file for regression test
        sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
    }
    else
    {
        // custom output handling when no regression test running
        // in this case check if the result is equivalent to the expected solution
        bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
    }
    */

    // cleanup memory
    checkCudaErrors(cudaFree(save));
    cudaDeviceReset();
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
因为 num_threads=32 时结果正确,所以应该不会有什么逻辑错误。请问一次内核函数的执行有没有时间限制?如果有,要怎么修改这个时间限制?