Recently I've been trying to write my own CUDA FFT, so I first benchmarked cuFFT on a 512*512 transform.
The data transfer took about 7,500,000 ticks, and executing the transform took about 17,000,000 ticks.
Then I tested my own code, which involves 2 global memory reads and roughly 40 shared memory reads and writes per thread, and found that the kernel alone takes about 11,000,000 ticks. I'm amazed by the library's implementation, and at the same time puzzled by my own code: the slow spots are the lines marked "time-consuming" in the kernel below.
Is writing to shared memory really a hundred times more expensive than reading from it?
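For reference, the cuFFT measurement was set up roughly like this (a simplified sketch, not my exact harness, which reports ticks from a different timer; plan creation is kept outside the timing):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cufft.h>

int main()
{
    const int NX = 512, NY = 512;
    size_t bytes = sizeof(cufftComplex) * NX * NY;

    cufftComplex* h_data = (cufftComplex*)malloc(bytes);   // host input, filled elsewhere
    cufftComplex* d_data;
    cudaMalloc(&d_data, bytes);

    cufftHandle plan;
    cufftPlan2d(&plan, NX, NY, CUFFT_C2C);                 // plan creation excluded from the timing

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // time the host-to-device transfer
    cudaEventRecord(start, 0);
    cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float msCopy;
    cudaEventElapsedTime(&msCopy, start, stop);

    // time the forward 2D transform
    cudaEventRecord(start, 0);
    cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float msFFT;
    cudaEventElapsedTime(&msFFT, start, stop);

    printf("copy: %.3f ms, fft: %.3f ms\n", msCopy, msFFT);

    cufftDestroy(plan);
    cudaFree(d_data);
    free(h_data);
    return 0;
}

My own kernel is below.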
#define MATRIX_SIZE (1 << 9)   // 512
#define BLOCK_SIZE  (1 << 9)   // 512
__global__
void FFT_2D_Radix2(DATA_TYPE* dg_buffer, int N)
{
    int tid, rev, pos, pre, stride = 33;            // 33 is coprime with 512, so tid*stride % MATRIX_SIZE is a permutation
    tid = threadIdx.x;
    rev = bit_reverse3(tid, tail_zero_nums(N));     // bit-reversed load index (helpers defined elsewhere)

    __shared__ DATA_TYPE s_DataR[MATRIX_SIZE];      // 512*4 = 2kB, real part
    __shared__ DATA_TYPE s_DataI[MATRIX_SIZE];      // 512*4 = 2kB, imaginary part
    __shared__ DATA_TYPE s_CosTable[MATRIX_SIZE];   // 512*4 = 2kB, cos twiddle table
    __shared__ DATA_TYPE s_SinTable[MATRIX_SIZE];   // 512*4 = 2kB, sin twiddle table

    pos = tid * stride % MATRIX_SIZE;

    // Load this block's row in bit-reversed order: real plane first, imaginary plane at offset N*N
    s_DataR[pos] = dg_buffer[blockIdx.x * BLOCK_SIZE + rev];          //----- time-consuming
    s_DataI[pos] = dg_buffer[N * N + blockIdx.x * BLOCK_SIZE + rev];  //----- time-consuming

    // Twiddle tables: the entry written by thread tid holds sin/cos(2*pi*tid/N)
    float theta = GV_2PI / N;
    s_SinTable[pos] = __sinf(theta * tid);
    s_CosTable[pos] = __cosf(theta * tid);
    __syncthreads();

    // Iterative radix-2 butterflies; the thread whose 'step' bit is set updates both halves of its pair
    int step, w;
    for (step = 1; step < N; step *= 2)
    {
        if (tid & step)
        {
            w = (tid & (step - 1)) * stride % MATRIX_SIZE;    // twiddle index, permuted by the same stride
            DATA_TYPE tempR = s_DataR[pos] * s_CosTable[w] + s_DataI[pos] * s_SinTable[w];
            DATA_TYPE tempI = s_DataI[pos] * s_CosTable[w] - s_DataR[pos] * s_SinTable[w];
            pre = (tid - step) * stride % MATRIX_SIZE;        // permuted index of the butterfly partner
            s_DataR[pos] = s_DataR[pre] - tempR;              //----- time-consuming
            s_DataI[pos] = s_DataI[pre] - tempI;              //----- time-consuming
            s_DataR[pre] += tempR;                            //----- time-consuming
            s_DataI[pre] += tempI;                            //----- time-consuming
        }
        __syncthreads();
    }
}
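For completeness, I launch the kernel with one 512-thread block per row over a buffer that stores the real plane followed by the imaginary plane. A minimal host-side sketch, assuming DATA_TYPE is float and hypothetical host arrays h_real / h_imag:

    const int N = 512;
    size_t planeBytes = sizeof(float) * N * N;   // assumes DATA_TYPE is float

    float* d_buffer;
    cudaMalloc(&d_buffer, 2 * planeBytes);
    cudaMemcpy(d_buffer,         h_real, planeBytes, cudaMemcpyHostToDevice);  // real plane
    cudaMemcpy(d_buffer + N * N, h_imag, planeBytes, cudaMemcpyHostToDevice);  // imaginary plane at offset N*N

    FFT_2D_Radix2<<<N, BLOCK_SIZE>>>(d_buffer, N);   // one block per row; this is the row-wise pass only
    cudaDeviceSynchronize();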