版主:您好! 我的程序生成的时候无错误,但是在debug的时候没有反应,没有按任意键继续…的显示。不知道什么原因,具体的思路就是使用纹理存储器实现FFT,将runtest()函数粘贴出来,如下所示:
void runTest(int argc, char **argv)
{
printf("[texture_FFT] is starting...\n");
findCudaDevice(argc, (const char **)argv);
//初始化变换核:旋转因子
Complex *W = (Complex*)malloc(sizeof(Complex) * K);
for(unsigned int i3=0;i3<K;i3++)
{
W[i3].x=cos(2*PI/K*i3);
W[i3].y=-1*sin(2*PI/K*i3);
}
//将旋转因子存储在二维矩阵中,之后再读取
Complex power[5][16]=
{{W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0],W[0]},
{W[0],W[8],W[0],W[8],W[0],W[8],W[0],W[8],W[0],W[8],W[0],W[8],W[0],W[8],W[0],W[8]},
{W[0],W[4],W[8],W[12],W[0],W[4],W[8],W[12],W[0],W[4],W[8],W[12],W[0],W[4],W[8],W[12]},
{W[0],W[2],W[4],W[6],W[8],W[10],W[12],W[14],W[0],W[2],W[4],W[6],W[8],W[10],W[12],W[14]},
{W[0],W[1],W[2],W[3],W[4],W[5],W[6],W[7],W[8],W[9],W[10],W[11],W[12],W[13],W[14],W[15]}
};
Complex *host2D = (Complex*)calloc(width*height, sizeof(Complex)); //构造查找表
cudaArray *cuArray; // CUDA数组
for(int row = 0; row < height; ++row) // 初始化内存(查找表)数据
{
for(int col = 0; col < width; ++col)
{
host2D[row*width + col] = power[row][width];
}
}
//每个像元由一个float2型元组数据构成
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float2>();
checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height)); // 申请显存空间
// 将内存数据拷贝入CUDA数组,第二个和第三个参数分别表示在宽度和高度上的偏移量,数据已经初始化
checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, host2D, sizeof(Complex)*width*height, cudaMemcpyHostToDevice));
//设置纹理参数
texRef2D.addressMode[0]=cudaAddressModeClamp;
texRef2D.addressMode[1]=cudaAddressModeClamp;
texRef2D.filterMode=cudaFilterModePoint;
texRef2D.normalized=false;
checkCudaErrors(cudaBindTextureToArray(texRef2D, cuArray)); // 将显存数据和纹理绑定
//////////////////////////////////////////////////////////////////////////////////////////////////////////
//该信道化分为32个信道,要进行32次卷积,在此程序中使用一个for循环
////////////////////////////////////////////////////////////////////////////////////////////////////////////
for (unsigned int pp=0;pp<K;pp++)
{
// Allocate host memory for the filter
Complex *h_filter_kernel = (Complex*)malloc(sizeof(Complex) * FILTER_KERNEL_SIZE);
// Initalize the memory for the filter
for (unsigned int ss=0;ss<FILTER_KERNEL_SIZE; ++ss)
{
h_filter_kernel[ss].x = power0[pp][ss];
h_filter_kernel[ss].y = 0;
}
// Allocate host memory for the signal
Complex *h_signal = (Complex *)malloc(sizeof(Complex) * SIGNAL_SIZE);
// Initalize the memory for the signal
for (unsigned int s=0;s<SIGNAL_SIZE; ++s)
{
h_signal[s].x =power1[pp][s];
h_signal[s].y =0;
}
// Pad signal and filter kernel
Complex *h_padded_signal;
Complex *h_padded_filter_kernel;
int new_size = PadData(h_signal, &h_padded_signal, SIGNAL_SIZE,h_filter_kernel, &h_padded_filter_kernel, FILTER_KERNEL_SIZE);
int mem_size = sizeof(Complex) * new_size;
// Allocate device memory for signal
Complex *d_signal;
checkCudaErrors(cudaMalloc((void **)&d_signal, mem_size));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal, h_padded_signal, mem_size,cudaMemcpyHostToDevice));
// Allocate device memory for filter kernel
Complex *d_filter_kernel;
checkCudaErrors(cudaMalloc((void **)&d_filter_kernel, mem_size));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_filter_kernel, h_padded_filter_kernel, mem_size,cudaMemcpyHostToDevice));
// CUFFT plan
cufftHandle plan;
checkCudaErrors(cufftPlan1d(&plan, new_size, CUFFT_C2C, 1));
// Transform signal and kernel
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_filter_kernel, (cufftComplex *)d_filter_kernel, CUFFT_FORWARD));
// Multiply the coefficients together and normalize the result
ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size, 1.0f / new_size);
// Check if kernel execution generated and error
getLastCudaError("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
// Transform signal back
checkCudaErrors(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
//////////////////////////////////////////////////////////////////////////////////////////////////////
//上述进行的是K个信道进行抽取和滤波,其数据长度变为2052,由于频域相乘对齐数据的缘故
//对信号进行了FFT计算,所得到的结果是复数;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Copy device memory to host
Complex *h_convolved_signal = (Complex *)malloc(sizeof(Complex) * new_size);
checkCudaErrors(cudaMemcpy(h_convolved_signal, d_signal, mem_size,cudaMemcpyDeviceToHost));
//将卷积后的结果:实部与虚部分别存储在两个二维数组中
for (unsigned int sss=0;sss<new_size;++sss)
{
powerRe[pp][sss]=h_convolved_signal[sss].x;
powerIm[pp][sss]=h_convolved_signal[sss].y;
}
//Destroy CUFFT context
checkCudaErrors(cufftDestroy(plan));
free(h_signal);
free(h_filter_kernel);
free(h_padded_signal);
free(h_padded_filter_kernel);
free(h_convolved_signal);
checkCudaErrors(cudaFree(d_signal));
checkCudaErrors(cudaFree(d_filter_kernel));
}
for (int w=0;w<(SIGNAL_SIZE+FILTER_KERNEL_SIZE/2);++w)
{
// Allocate host memory for the signal
Complex *h_signal_1 = (Complex *)malloc(sizeof(Complex) * K);
// Initalize the memory for the signal
for (unsigned int s=0;s<K; ++s)
{
h_signal_1[s].x =powerRe[s][w];
h_signal_1[s].y =powerIm[s][w];
}
//变换地址,将输入序列倒位序
for(unsigned int i4=0;i4<K;i4++)
{
Complex temp;
unsigned int k=i4;
unsigned int j4=0;
double t=(log(double(K))/log(double(2)));
while( (t–)>0 )
{
j4=j4<<1;
j4|=(k & 1);
k=k>>1;
}
if(j4>i4)
{
temp=h_signal_1[i4];
h_signal_1[i4]=h_signal_1[j4];
h_signal_1[j4]=temp;
}
}
int mem_size = sizeof(Complex) * K;
// Allocate device memory for signal
Complex *d_signal_1;
checkCudaErrors(cudaMalloc((void **)&d_signal_1, mem_size));
// Copy host memory to device
checkCudaErrors(cudaMemcpy(d_signal_1, h_signal_1, mem_size,cudaMemcpyHostToDevice));
Complex *d_signal_2;
checkCudaErrors(cudaMalloc((void **)&d_signal_2, mem_size));
//运行内核函数,其中tex2D的纹理拾取函数
FFT_T<<<1,16>>>(d_signal_1,d_signal_2,K);
// Copy device memory to host
Complex h_signal_2 = (Complex )malloc(sizeof(Complex) * K);
checkCudaErrors(cudaMemcpy(h_signal_2, d_signal_2, mem_size,cudaMemcpyDeviceToHost));
for (int t4=0;t4<K;t4++)
{
float Re_signal=(h_signal_2[t4].x)(h_signal_2[t4].x);
float Im_signal=(h_signal_2[t4].y)(h_signal_2[t4].y);
powerfft[t4][w]=sqrt(Re_signal+Im_signal);
}
free(h_signal_1);
free(h_signal_2);
checkCudaErrors(cudaFree(d_signal_1));
checkCudaErrors(cudaFree(d_signal_2));
}
FILE* fp2;
fp2 = fopen("result.txt", "w");
if (!fp2)
{
perror("cannot open file");
//exit(-1);
}
/*把二维数组的内容写入文件*/
for (unsigned int e = 0; e <K ;e++)
{
for (unsigned int f = 0; f <(SIGNAL_SIZE+FILTER_KERNEL_SIZE/2);f++)
{
fprintf(fp2, "%f ", powerfft[e][f]);
}
fputc('\n', fp2);
}
fclose(fp2);
free(W);
cudaUnbindTexture(texRef2D); // 解绑定
cudaFreeArray(cuArray); // 释放显存空间
free(host2D); // 释放内存空间
cudaDeviceReset();
exit(EXIT_SUCCESS);
}
下面的代码是FFT代码:
static global void FFT_T(Complex* const DataIn,Complex* DataOut,const unsigned int N)
{
extern shared Complex sdata;
const unsigned int tid_in_block=threadIdx.x; //线程在线程块中的位置
if (tid_in_block<N)
{
sdata[tid_in_block]=DataIn[tid_in_block];
sdata[tid_in_block+N/2]=DataIn[tid_in_block+N/2]; //将数据从global读入shared memory
__syncthreads(); //线程块中的线程同步
if (tid_in_block<N/2)
{
unsigned int p,q;
Complex Xp,XqWn;
float2 Wn;
float stage=0.0;
for (unsigned int Ns=1;Ns<N;Ns*=2)
{
p=tid_in_block/NsNs2+tid_in_block%Ns;
q=p+Ns;
Wn=tex2D(texRef2D,tid_in_block,stage++);
XqWn=ComplexMul1(sdata[q],Wn);
Xp=sdata[p];
sdata[p]=ComplexAdd(Xp,XqWn);
sdata[q]=ComplexSub(Xp,XqWn);
__syncthreads();
}
DataOut[p]=sdata[p];
DataOut[q]=sdata[q];
}
}
}
请版主给予建议,在此感谢!