调用CUDA库函数CUFFT计算FFT时，当FFT点数大时结果错误

system · 2013 年5 月 20 日 12:40

我刚接触GPU通用计算，目前写好了串行的FFT程序，然后买了块GTX660显卡，安装了CUDA 5.0，成功运行了sample里的例子，运行正常。接下来，我想再写个基于CUDA的FFT并行程序，看一下能把我的FFT程序加速多少倍。我先找了个CUDA的FFT程序来运行，但是当FFT点数大于512时，计算结果就会出错。我的对比方法是：运行串行的FFT并打印结果，再与CUDA版本的结果对比，或者与FFTW的结果进行比较。FFTW和我的串行FFT结果一样，但是我写的CUDA FFT结果不一样，也就是说它是错误的。
我就又想了个办法：用CUDA自带的库函数CUFFT计算结果，并查看加速效果。写好程序后发现，CUFFT的计算结果在FFT点数小时有误差；当点数大时，比如FFT点数是32768或者更大时，计算结果就会出错，最后几个数据是错误的。为什么会这样？CUDA自带的库函数不应该有这样的表现，到底是哪里出了问题？我找了好久，一直没有解决，希望大家帮我看看，查找一下问题的原因。程序代码如下：

//#include “cuda_runtime.h”
//#include “device_launch_parameters.h”
#include <cufft.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>

/* Number of complex points per transform; passed to cufftPlan1d as the FFT size. */
#define NX 65536
/* Number of transforms the plan executes; passed to cufftPlan1d as the batch count. */
#define BATCH 1

using namespace std;

// Host-side complex sample type: main() stores the real part in .x and the imaginary part in .y.
typedef float2 Complex;
int main()
{
cufftHandle plan;
cufftComplex *idata,*odata;
clock_t start,end;

//Allocate host memory for the signal
Complex  *h_signal=(Complex*)malloc(sizeof(Complex)*NX);
Complex  *h_result=(Complex*)malloc(sizeof(Complex)*NX);

//Initalize host memory for the signal
for(unsigned int i=0;i<NX;i++)
{
	h_signal[i].x=(float)i;
	h_signal[i].y=0;
}

/* Allocate device memory */
cudaMalloc((void**)&idata,sizeof(cufftComplex)*NX*BATCH);
cudaMalloc((void**)&odata,sizeof(cufftComplex)*NX*BATCH);

/* 主机设备数据传输*/
cudaMemcpy(idata,h_signal,sizeof(Complex)*NX,cudaMemcpyHostToDevice);
if(cudaGetLastError()!=cudaSuccess) {
fprintf(stderr,“Cuda error: Failed to allocate\n”);
return;
}

start=clock();
/* Create a 1D FFT plan. */	
if(cufftPlan1d(&plan,NX,CUFFT_C2C,BATCH)!=CUFFT_SUCCESS) {
	fprintf(stderr,"CUFFT error: Plan creation failed");
	return;	
}
/* Use the CUFFT plan to transform the signal in place .*/
if(cufftExecC2C(plan,idata,odata,CUFFT_FORWARD)!=CUFFT_SUCCESS) {
    fprintf(stderr,"CUFFT error:ExecC2C Forward failed");
	return;
}
end=clock();

/* Inverse transform the signal in place.
if (cufftExecC2C(plan,idata,odata,CUFFT_INVERSE)!=CUFFT_SUCCESS)
{ 
	fprintf(stderr,"CUFFT error:ExecC2C Inverse failed");
	return;
} */

if (cudaThreadSynchronize()!=cudaSuccess)
{
	fprintf(stderr,"Cuda error: Failed to synchronize\n");
	return;
}

cudaMemcpy(h_result,odata,sizeof(Complex)*NX,cudaMemcpyDeviceToHost);

/* Show the result.*/
printf("The   result   are   as   follows\n");   
for(int i=0;i<NX;i++)   
{   
	printf("%.4f",h_result[i].x);   
	if(h_result[i].y>=0.0001)   
		printf("+%.4fj\n",h_result[i].y);   
	else   if(fabs(h_result[i].y)<0.0001)   
		printf("\n");   
	else     
		printf("%.4fj\n",h_result[i].y);   
}  

cout<<"GPU use "<<(end-start)<<" ms"<<endl;
/* Destroy the CUFFT plan. */
cufftDestroy(plan);
cudaFree(idata);
cudaFree(odata);

return 0;
}