global void KernelV(int *dev_XMatrix,double dev_V,double dev_VK)
{
int tid=threadIdx.x;
int bid=blockIdx.x;
int xIndex=bidblockDim.x+tid;
if(xIndex<XROW)
{
if(xIndex==0)
{
for(int j=0;j<XCOL;j++)
{
dev_V[xIndex]=dev_V[xIndex]+dev_XMatrix[j]dev_XMatrix[j+xIndexXCOL]dev_VK[j];
__syncthreads();
}
}
else
{
for(int j=0;j<XCOL;j++)
{
dev_V[xIndex]=dev_V[xIndex]+dev_XMatrix[j]dev_XMatrix[j+xIndexXCOL]dev_VK[j];
__syncthreads();
dev_V[xIndexXROW]=dev_V[xIndexXROW]+dev_XMatrix[j+xIndexXCOL]*dev_XMatrix[j]*dev_VK[j];
}
__syncthreads();
}
}
}
global void KernelV2(int *dev_ZZ,double *dev_V,double dev_VKK)
{
int tid=threadIdx.x;
int bid=blockIdx.x;
int xIndex=bidblockDim.x+tid;
if(xIndex<XROW)
{
if(xIndex==0)
{
for(int j=0;j<(XCOL*(XCOL-1)/2);j++)
{
dev_V[xIndex]=dev_V[xIndex]+dev_ZZ[jXROW]dev_ZZ[xIndex+jXROW]dev_VKK[j];
__syncthreads();
}
}
else
{
for(int j=0;j<(XCOL(XCOL-1)/2);j++)
{
dev_V[xIndex]=dev_V[xIndex]+dev_ZZ[jXROW]dev_ZZ[xIndex+jXROW]dev_VKK[j];
__syncthreads();
dev_V[xIndexXROW]=dev_V[xIndexXROW]+dev_ZZ[xIndex+jXROW]dev_ZZ[jXROW]*dev_VKK[j];
}
__syncthreads();
}
}
}
global void KernelVOne(double *dev_V,double dev_VK,int dev_XMatrix,int j,double s0)
{
int tid=threadIdx.x;
int bid=blockIdx.x;
int xIndex=bidblockDim.x+tid;
if(xIndex<XROW)
{
if(xIndex==0)
{
dev_V[xIndex]=dev_V[xIndex]+dev_XMatrix[j]dev_XMatrix[xIndexXCOL+j](dev_VK-s0);
__syncthreads();
}
else
{
dev_V[xIndex]=dev_V[xIndex]+dev_XMatrix[j]dev_XMatrix[xIndexXCOL+j](dev_VK-s0);
__syncthreads();
dev_V[xIndexXROW]=dev_V[xIndexXROW]+dev_XMatrix[xIndex*XCOL+j]dev_XMatrix[j](dev_VK-s0);
__syncthreads();
}
}
}
int main(void)
{
culaStatus status;
status = culaInitialize();
cudaError_t cudaStatus;
clock_t t1,t2,t3,t4,t5,t6,t7,t8,t9; //设定计时器变量
readGen_sim();
readPhe_sim();
double V=(double)malloc(sizeof(double)XROWXROW);
for(int i=0;i<XROWXROW;i++)
{
V[i]=0;
}
double VK[XCOL];
for(int i=0;i<XCOL;i++)
{
VK[i]=1e-30;
}
t1=clock(); //设定计时器变量t1开始计时
int dev_XMatrix;
double dev_V,dev_VK;
cudaMalloc((void)&dev_XMatrix,XSIZE);
cudaMalloc((void*)&dev_V,XROWXROWsizeof(double));
cudaMalloc((void**)&dev_VK,XCOLsizeof(double));
cudaMemcpy(dev_XMatrix,XMatrix,XROWXCOLsizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_V,V,XROWXROWsizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(dev_VK,VK,XCOLsizeof(double),cudaMemcpyHostToDevice);
KernelV<<<50,300>>>(dev_XMatrix,dev_V,dev_VK);
cudaMemcpy(V,dev_V,XROWXROW*sizeof(double),cudaMemcpyDeviceToHost);
cudaFree(dev_VK);
cudaFree(dev_XMatrix);
double VKK=(double)malloc(sizeof(double)XCOLXCOL);
for(int i=0;i<XCOLXCOL;i++)
{
VKK[i]=1e-30;
}
int ZZ=(int)malloc(sizeof(int)XROW(XCOL(XCOL-1)/2));
int uu=0;
for(int i=0;i<(XCOL-1);i++)
{
for(int j=i+1;j<XCOL;j++)
{
for(int k=0;k<XROW;k++)
{
uu=((2XCOLi-i*i-i)*XROW/2)+(j-i-1)*XROW+k;
ZZ[uu]=XMatrix[i+k*XCOL]XMatrix[j+kXCOL];
}
}
}
double dev_VKK;
int dev_ZZ;
cudaMalloc((void)&dev_VKK,XCOLXCOLsizeof(double));
cudaMalloc((void**)&dev_ZZ,sizeof(int)XROW(XCOL*(XCOL-1)/2));
cudaMemcpy(dev_V,V,XROWXROWsizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(dev_VKK,VKK,XCOLXCOLsizeof(double),cudaMemcpyHostToDevice);
cudaMemcpy(dev_ZZ,ZZ,sizeof(int)XROW(XCOL*(XCOL-1)/2),cudaMemcpyHostToDevice);
KernelV2<<<50,300>>>(dev_ZZ,dev_V,dev_VKK);
cudaMemcpy(V,dev_V,XROWXROWsizeof(double),cudaMemcpyDeviceToHost);
cudaFree(dev_ZZ);
cudaFree(dev_VKK);
cudaFree(dev_V);
for(int i=0;i<XROW;i++)
{
V[i*XROW+i]=V[i*XROW+i]+v0;
}
double VI=(double)malloc(sizeof(double)XROWXROW);
for(long int i=0;i<XROW*XROW;i++)
{
VI[i]=V[i];
}
t2=clock(); //设定计时器变量t2开始计时
inverse(VI,XROW);
t3=clock(); //设定计时器变量t3开始计时
int iter=0;
double err=1000;
double R[XROW];
FILE p1;
p1=fopen(“iter_simulat.txt”,“w”);
fprintf(p1,“iter error b resid_var\n”);
cudaMemcpy(dev_XMatrix,XMatrix,XSIZE,cudaMemcpyHostToDevice);
while ((iter<iter_max) && (err>err_max)) //start the big loop
{
t4=clock(); //设定计时器变量t4开始计时
iter=iter+1;
double VK0[XCOL];
for(int i=0;i<XCOL;i++)
{
VK0[i]=VK[i];
}
double VISUM=0.0;
double VcolSUM[XROW];
double VcolYPhe=0.0;
for(long int i=0;i<XROWXROW;i++)
{
VISUM=VISUM+VI[i];
}
for(int i=0;i<XROW;i++)
{
VcolSUM[i]=0.0;
}
for(int j=0;j<XROW;j++)
{
for(int i=0;i<XROW;i++)
{
VcolSUM[j]=VcolSUM[j]+VI[j+i*XROW];
}
}
for(int i=0;i<XROW;i++)
{
VcolYPhe=VcolYPhe+VcolSUM[i]*YPhe[i];
}
b=(1/VISUM)*VcolYPhe; //update the fixed effects
//printf(“%.7lf\n”,b);
double s0=v0;
for(int i=0;i<XROW;i++)
{
R[i]=YPhe[i]-b;
}
double RTVI[XROW];
double RTVIRSUM=0.0;
for(int i=0;i<XROW;i++)
{
RTVI[i]=0.0;
}
for(int i=0;i<XROW;i++)
{
for(int j=0;j<XROW;j++)
{
RTVI[i]=RTVI[i]+R[j]*VI[i+XROW*j];
}
}
for(int i=0;i<XROW;i++)
{
RTVIRSUM=RTVIRSUM+RTVI[i]R[i];
}
v0=(s0RTVIRSUM)/XROW;
for(int i=0;i<XROW;i++)
{
V[i*XROW+i]=V[i*XROW+i]+(v0-s0);
}
for(int i=0;i<XROW*XROW;i++)
{
VI[i]=V[i];
}
inverse(VI,XROW); //update the residual variance
t5=clock(); //设定计时器变量t5开始计时
cudaMemcpy(dev_V,V,XROWXROWsizeof(double),cudaMemcpyHostToDevice);
for(int j=0;j<XCOL;j++) //start update main effect QTL variance
{
for(int ione=0;ione<XROWXROW;ione++)
{
VItemp1[ione]=VI[ione];
}
s0=VK[j];
double ZVZ=0.0;
double YVZ=0.0;
double ZTVI[XROW];
for(int i=0;i<XROW;i++)
{
ZTVI[i]=0.0;
}
for(int i=0;i<XROW;i++)
{
for(int k=0;k<XROW;k++)
{
ZTVI[i]=ZTVI[i]+XMatrix[j+kXCOL]*VI[i+k*XROW];
}
}
for(int i=0;i<XROW;i++)
{
ZVZ=ZVZ+ZTVI[i]XMatrix[j+iXCOL];
}
for(int i=0;i<XROW;i++)
{
RTVI[i]=0.0;
}
for(int i=0;i<XROW;i++)
{
for(int k=0;k<XROW;k++)
{
RTVI[i]=RTVI[i]+R[k]*VI[i+k*XROW];
}
}
for(int i=0;i<XROW;i++)
{
YVZ=YVZ+RTVI[i]XMatrix[j+iXCOL];
}
double dum=0.0; //using explicit algorithm
dum=Maxmium((YVZYVZ-ZVZ)/(ZVZZVZ)+s0,1e-30);
VK[j]=dum;
/for(int i=0;i<XROW;i++)
{
for(int k=0;k<XROW;k++)
{
V[i*XROW+k]=V[i*XROW+k]+XMatrix[i*XCOL+j]XMatrix[kXCOL+j](VK[j]-s0);
}
}*/
KernelVOne<<<50,300>>>(dev_V,VK[j],dev_XMatrix,j,s0);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, “cudaDeviceSynchronize returned error code %d after launching KernelV11!\n”, cudaStatus);
}
cudaMemcpy(V,dev_V,XROWXROWsizeof(double),cudaMemcpyDeviceToHost);
for(int i=0;i<XROW*XROW;i++)
{
VI[i]=V[i];
}
if((VK[j]-s0)!=0)
{
inverse(VI,XROW);
for(int ione=0;ione<XROWXROW;ione++)
{
VItemp1[ione]=VI[ione];
}
}
else
{
for(int ione=0;ione<XROWXROW;ione++)
{
VI[ione]=VItemp1[ione];
}
}
} //end update main effect QTL variance
t6=clock(); //设定计时器变量t6开始计时
for(int i=0;i<XCOL-1;i++) //start update epistatic effect QTL variance
{
for(int j=i+1;j<XCOL;j++)
{
for(int itwo=0;itwo<XROWXROW;itwo++)
{
VItemp2[itwo]=VI[itwo];
}
s0=VKK[j+iXCOL];
double ZVZ=0.0;
double YVZ=0.0;
double ZTVI[XROW];
for(int n=0;n<XROW;n++)
{
ZTVI[n]=0.0;
}
for(int k=0;k<XROW;k++)
{
for(int n=0;n<XROW;n++)
{
ZTVI[k]=ZTVI[k]+ZZ[((2XCOLi-i*i-i)XROW/2)+(j-i-1)XROW+n]VI[k+nXROW];
}
}
for(int n=0;n<XROW;n++)
{
ZVZ=ZVZ+ZTVI[n]ZZ[((2XCOLi-ii-i)*XROW/2)+(j-i-1)*XROW+n];
}
for(int n=0;n<XROW;n++)
{
RTVI[n]=0.0;
}
for(int k=0;k<XROW;k++)
{
for(int n=0;n<XROW;n++)
{
RTVI[k]=RTVI[k]+R[n]VI[k+nXROW];
}
}
for(int n=0;n<XROW;n++)
{
YVZ=YVZ+RTVI[n]ZZ[((2XCOLi-ii-i)*XROW/2)+(j-i-1)*XROW+n];
}
double dum=0.0; //using explicit algorithm
dum=Maxmium((YVZYVZ-ZVZ)/(ZVZZVZ)+s0,1e-30);
VKK[j+iXCOL]=dum;
for(int n=0;n<XROW;n++)
{
for(int k=0;k<XROW;k++)
{
V[nXROW+k]=V[nXROW+k]+ZZ[((2XCOLi-ii-i)XROW/2)+(j-i-1)XROW+n]ZZ[((2XCOLi-ii-i)XROW/2)+(j-i-1)XROW+k](VKK[j+iXCOL]-s0);
}
}
for(int n=0;n<XROWXROW;n++)
{
VI[n]=V[n];
}
if((VKK[j+iXCOL]-s0)!=0)
{
inverse(VI,XROW);
for(int itwo=0;itwo<XROWXROW;itwo++)
{
VItemp2[itwo]=VI[itwo];
}
}
else
{
for(int itwo=0;itwo<XROWXROW;itwo++)
{
VI[itwo]=VItemp2[itwo];
}
}
}
} //end update epistatic effect QTL variance
t7=clock(); //设定计时器变量t7开始计时
double v01=v0;
double VK1[XCOL];
for(int n=0;n<XCOL;n++)
{
VK1[n]=VK[n];
}
err=0.0;
for(int n=0;n<XCOL;n++)
{
err=err+(VK0[n]-VK1[n])*(VK0[n]-VK1[n]);
}
err=err/XCOL;
// printf(“%2d %.10lf %.7lf %.7lf\n”,iter,err,b,v01);
fprintf(p1,“%2d %e %e %e\n”,iter,err,b,v01);
}
//end the big loop
}
[/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i][/i]