{
float R0, R1, R2, R3, R4, R5, R6, R7, R8,
B0, B1, B2, B3, B4, B5, B6, B7, B8;
float RHO_R, RHO_B, RHO_N, RHO, VX, VY, NX, NY, forcex, forcey;//FNORM
//int x, y;
int WALLS,FLAG;
float nu, su;
float mi[Q], meq[Q],fk[Q];
//shared memeory for propagation with east and west part
//red fluid
__shared__ float F_R1[THREAD_NUM];
__shared__ float F_R5[THREAD_NUM];
__shared__ float F_R8[THREAD_NUM];
__shared__ float F_R3[THREAD_NUM];
__shared__ float F_R6[THREAD_NUM];
__shared__ float F_R7[THREAD_NUM];
//blue fluid
__shared__ float F_B1[THREAD_NUM];
__shared__ float F_B5[THREAD_NUM];
__shared__ float F_B8[THREAD_NUM];
__shared__ float F_B3[THREAD_NUM];
__shared__ float F_B6[THREAD_NUM];
__shared__ float F_B7[THREAD_NUM];
//index
int tx = threadIdx.x;
int x = blockIdx.x * THREAD_NUM + tx;
int y = blockIdx.y;
int k = y * xl + x;
//Load data
R0 = r0old[k]; R1 = r1old[k], R2 = r2old[k], R3 = r3old[k], R4 = r4old[k], R5 = r5old[k], R6 = r6old[k], R7 = r7old[k], R8 = r8old[k];
B0 = b0old[k]; B1 = b1old[k], B2 = b2old[k], B3 = b3old[k], B4 = b4old[k], B5 = b5old[k], B6 = b6old[k], B7 = b7old[k], B8 = b8old[k];
NX = nx[k], NY = ny[k];
WALLS = walls[k], FLAG = flag[k];
forcex = Forcex[k], forcey = Forcey[k];//FNORM = fnorm[k],
if (WALLS <= 0)
{
//Project
//计算宏观量
VX = B1 + R1 + B5 + R5 + B8 + R8 - B3 - R3 - B6 - R6 - B7 - R7;
VY = B2 + R2 + B5 + R5 + B6 + R6 - B4 - R4 - B7 - R7 - B8 - R8;
RHO_R = R0 + R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8;
RHO_B = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7 + B8;
RHO = RHO_B + RHO_R;
RHO_N = (RHO_R - RHO_B) / RHO;
//method 1
nu = 2.0 / ((1.0 + RHO_N) / nu_R + (1.0 - RHO_N) / nu_B);
su = 1.0 / (3 * nu + 0.5);
float si[Q] = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,su,su };
//外力对速度的作用以及准三维修正
VX = (VX + 0.5*forcex) / (RHO + nu / (2 * kk));
VY = (VY + 0.5*forcey) / (RHO + nu / (2 * kk));
for (int i = 0; i < Q; i++)
mi[i] = M[i][0] * (B0 + R0) + M[i][1] * (B1 + R1) + M[i][2] * (B2 + R2) + M[i][3] * (B3 + R3) +
M[i][4] * (B4 + R4) + M[i][5] * (B5 + R5) + M[i][6] * (B6 + R6) + M[i][7] * (B7 + R7) + M[i][8] * (B8 + R8);
forcex -= nu / kk * VX; //准三维修正项,debug *VX
forcey -= nu / kk * VY;
//M*feq
meq[0] = RHO;
meq[1] = 3 * RHO*(VX*VX + VY * VY) - 2 * RHO;
meq[2] = RHO - 3 * RHO*(VX*VX + VY * VY);
meq[3] = RHO * VX;
meq[4] = (-1.0)*RHO * VX;
meq[5] = RHO * VY;
meq[6] = (-1.0)*RHO * VY;
meq[7] = RHO * (VX*VX - VY * VY);
meq[8] = RHO * VX*VY;
//M*F
fk[0] = 0;
fk[1] = 6 * (VX*forcex + VY * forcey);
fk[2] = (-6) * (VX*forcex + VY * forcey);
fk[3] = forcex;
fk[4] = (-1) * forcex;
fk[5] = forcey;
fk[6] = (-1) * forcey;
fk[7] = 2 * (VX*forcex - VY * forcey);
fk[8] = VX * forcey + VY * forcex; //found a bug ux*fy+uy*fx
for (int i = 0; i < Q; i++)
mi[i] = mi[i] - si[i] * (mi[i] - meq[i]) + (1 - 0.5*si[i])*fk[i];
//fk代替为下一时刻分布函数
for (int i = 0; i < Q; i++)
{
fk[i] = 0;
for (int j = 0; j < Q; j++)
fk[i] += M_inv[i*Q + j] * mi[j];
}
//Recolor
R0 = RHO_R / RHO * fk[0];
R1 = RHO_R / RHO * fk[1];
R2 = RHO_R / RHO * fk[2];
R3 = RHO_R / RHO * fk[3];
R4 = RHO_R / RHO * fk[4];
R5 = RHO_R / RHO * fk[5];
R6 = RHO_R / RHO * fk[6];
R7 = RHO_R / RHO * fk[7];
R8 = RHO_R / RHO * fk[8];
if (FLAG) //两相界面
{
R1 = R1 + RHO_R * RHO_B / RHO * beta*w[1] * NX;
R2 = R2 + RHO_R * RHO_B / RHO * beta*w[2] * NY;
R3 = R3 + RHO_R * RHO_B / RHO * beta*w[3] * (NX * ix[3]);
R4 = R4 + RHO_R * RHO_B / RHO * beta*w[4] * (NY * iy[4]);
R5 = R5 + RHO_R * RHO_B / RHO * beta*w[5] * (NX + NY) / sqrt(2.0);
R6 = R6 + RHO_R * RHO_B / RHO * beta*w[6] * (NY - NX) / sqrt(2.0);
R7 = R7 + RHO_R * RHO_B / RHO * beta*w[7] * (-1.0)*(NX + NY) / sqrt(2.0);
R8 = R8 + RHO_R * RHO_B / RHO * beta*w[8] * (NX - NY) / sqrt(2.0);
}
B0 = fk[0] - R0;
B1 = fk[1] - R1;
B2 = fk[2] - R2;
B3 = fk[3] - R3;
B4 = fk[4] - R4;
B5 = fk[5] - R5;
B6 = fk[6] - R6;
B7 = fk[7] - R7;
B8 = fk[8] - R8;
}
else
{
float temp;
temp = R1; R1 = R3; R3 = temp;//red fluid
temp = R2; R2 = R4; R4 = temp;
temp = R5; R5 = R7; R7 = temp;
temp = R6; R6 = R8; R8 = temp;
temp = B1; B1 = B3; B3 = temp;//blue fluid
temp = B2; B2 = B4; B4 = temp;
temp = B5; B5 = B7; B7 = temp;
temp = B6; B6 = B8; B8 = temp;
}
//左右迁移
//write back data
//字数不够了,后面应该没问题
}
代码如上。用visual profiler分析发现这个kernel函数用的寄存器数量太对导致占有率很低,限制寄存器数量之后速度加快一倍,但是global memory load速度变慢为50%,请帮忙分析以下该怎么优化啊