大家好,我想对如下核函数
global void computer_P1_with_P2(compx *P1,compx *P2,double *pupil,const double zercoef,double wf65,double theta,int row,int col,int high)
{
int i=blockIdx.xblockDim.x+threadIdx.x;
int j=blockIdx.yblockDim.y+threadIdx.y;
int idx=icol+j;
if(i<row&&j<col)
{
double wavefront1 = 0;
for(int k=2;k<Zs;k++)
{
wavefront1 += wf65[kcolrow+idx] * zercoef[k];
}
double wavefront2 = wavefront1 + theta[idx];
P1[idx].real = pupil[idx] * cos(wavefront1);
P1[idx].imag = pupil[idx] * sin(wavefront1);
P2[idx].real = pupil[idx] * cos(wavefront2);
P2[idx].imag = pupil[idx] * sin(wavefront2);
}
}
优化为
global void computer_P1_with_P2(compx *P1,compx *P2,double *pupil,const double zercoef,double wf65,double theta,int row,int col,int high)
{
int i=blockIdx.xblockDim.x+threadIdx.x;
int j=blockIdx.yblockDim.y+threadIdx.y;
int idx=icol+j;
shared double zercoef_s[Zs];
if(idx<Zs)
zercoef_s[idx] = zercoef[idx];
__syncthreads();
if(i<row&&j<col)
{
double wavefront1 = 0;
for(int k=2;k<Zs;k++)
{
wavefront1 += wf65[kcolrow+idx] * zercoef_s[k];
}
double wavefront2 = wavefront1 + theta[idx];
P1[idx].real = pupil[idx] * cos(wavefront1);
P1[idx].imag = pupil[idx] * sin(wavefront1);
P2[idx].real = pupil[idx] * cos(wavefront2);
P2[idx].imag = pupil[idx] * sin(wavefront2);
}
}
可是结果不对了,请问问题出在什么地方呢?