函数速度太慢,哪位大佬帮忙看看怎么优化

{

float R0, R1, R2, R3, R4, R5, R6, R7, R8,
	B0, B1, B2, B3, B4, B5, B6, B7, B8;
float RHO_R, RHO_B, RHO_N, RHO, VX, VY, NX, NY, forcex, forcey;//FNORM
//int x, y;
int WALLS,FLAG;
float nu, su;
float mi[Q], meq[Q],fk[Q];
//shared memeory for propagation with east and west part
//red fluid
__shared__ float F_R1[THREAD_NUM];
__shared__ float F_R5[THREAD_NUM];
__shared__ float F_R8[THREAD_NUM];
__shared__ float F_R3[THREAD_NUM];
__shared__ float F_R6[THREAD_NUM];
__shared__ float F_R7[THREAD_NUM];
//blue fluid
__shared__ float F_B1[THREAD_NUM];
__shared__ float F_B5[THREAD_NUM];
__shared__ float F_B8[THREAD_NUM];
__shared__ float F_B3[THREAD_NUM];
__shared__ float F_B6[THREAD_NUM];
__shared__ float F_B7[THREAD_NUM];
//index
int tx = threadIdx.x;
int x = blockIdx.x * THREAD_NUM + tx;
int y = blockIdx.y;
int k = y * xl + x;
//Load data 
R0 = r0old[k]; R1 = r1old[k], R2 = r2old[k], R3 = r3old[k], R4 = r4old[k], R5 = r5old[k], R6 = r6old[k], R7 = r7old[k], R8 = r8old[k];
B0 = b0old[k]; B1 = b1old[k], B2 = b2old[k], B3 = b3old[k], B4 = b4old[k], B5 = b5old[k], B6 = b6old[k], B7 = b7old[k], B8 = b8old[k];
NX = nx[k], NY = ny[k];
WALLS = walls[k], FLAG = flag[k];
forcex = Forcex[k], forcey = Forcey[k];//FNORM = fnorm[k],
if (WALLS <= 0)
{
	//Project
	//计算宏观量		
	VX = B1 + R1 + B5 + R5 + B8 + R8 - B3 - R3 - B6 - R6 - B7 - R7;
	VY = B2 + R2 + B5 + R5 + B6 + R6 - B4 - R4 - B7 - R7 - B8 - R8;
	RHO_R = R0 + R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8;
	RHO_B = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7 + B8;
	RHO = RHO_B + RHO_R;
	RHO_N = (RHO_R - RHO_B) / RHO;	
	//method 1
	nu = 2.0 / ((1.0 + RHO_N) / nu_R + (1.0 - RHO_N) / nu_B);
	su = 1.0 / (3 * nu + 0.5);
	float si[Q] = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,su,su };
	//外力对速度的作用以及准三维修正
	VX = (VX + 0.5*forcex) / (RHO + nu / (2 * kk));
	VY = (VY + 0.5*forcey) / (RHO + nu / (2 * kk));
	for (int i = 0; i < Q; i++)
		mi[i] = M[i][0] * (B0 + R0) + M[i][1] * (B1 + R1) + M[i][2] * (B2 + R2) + M[i][3] * (B3 + R3) +
		        M[i][4] * (B4 + R4) + M[i][5] * (B5 + R5) + M[i][6] * (B6 + R6) + M[i][7] * (B7 + R7) + M[i][8] * (B8 + R8);
	forcex -= nu / kk * VX;   //准三维修正项,debug *VX
	forcey -= nu / kk * VY;
	//M*feq
	meq[0] = RHO;
	meq[1] = 3 * RHO*(VX*VX + VY * VY) - 2 * RHO;
	meq[2] = RHO - 3 * RHO*(VX*VX + VY * VY);
	meq[3] = RHO * VX;
	meq[4] = (-1.0)*RHO * VX;
	meq[5] = RHO * VY;
	meq[6] = (-1.0)*RHO * VY;
	meq[7] = RHO * (VX*VX - VY * VY);
	meq[8] = RHO * VX*VY;
	//M*F
	fk[0] = 0;
	fk[1] = 6 * (VX*forcex + VY * forcey);
	fk[2] = (-6) * (VX*forcex + VY * forcey);
	fk[3] = forcex;
	fk[4] = (-1) * forcex;
	fk[5] = forcey;
	fk[6] = (-1) * forcey;
	fk[7] = 2 * (VX*forcex - VY * forcey);
	fk[8] = VX * forcey + VY * forcex;         //found a bug  ux*fy+uy*fx
	for (int i = 0; i < Q; i++)
		mi[i] = mi[i] - si[i] * (mi[i] - meq[i]) + (1 - 0.5*si[i])*fk[i];
	//fk代替为下一时刻分布函数
	for (int i = 0; i < Q; i++)
	{
		fk[i] = 0;
		for (int j = 0; j < Q; j++)
			fk[i] += M_inv[i*Q + j] * mi[j];
	}
	//Recolor
	R0 = RHO_R / RHO * fk[0];
	R1 = RHO_R / RHO * fk[1];
	R2 = RHO_R / RHO * fk[2];
	R3 = RHO_R / RHO * fk[3];
	R4 = RHO_R / RHO * fk[4];
	R5 = RHO_R / RHO * fk[5];
	R6 = RHO_R / RHO * fk[6];
	R7 = RHO_R / RHO * fk[7];
	R8 = RHO_R / RHO * fk[8];
	if (FLAG)             //两相界面
	{
		R1 = R1 + RHO_R * RHO_B / RHO * beta*w[1] * NX;
		R2 = R2 + RHO_R * RHO_B / RHO * beta*w[2] * NY;
		R3 = R3 + RHO_R * RHO_B / RHO * beta*w[3] * (NX * ix[3]);
		R4 = R4 + RHO_R * RHO_B / RHO * beta*w[4] * (NY * iy[4]);
		R5 = R5 + RHO_R * RHO_B / RHO * beta*w[5] * (NX + NY) / sqrt(2.0);
		R6 = R6 + RHO_R * RHO_B / RHO * beta*w[6] * (NY - NX) / sqrt(2.0);
		R7 = R7 + RHO_R * RHO_B / RHO * beta*w[7] * (-1.0)*(NX + NY) / sqrt(2.0);
		R8 = R8 + RHO_R * RHO_B / RHO * beta*w[8] * (NX - NY) / sqrt(2.0);
	}
	B0 = fk[0] - R0;
	B1 = fk[1] - R1;
	B2 = fk[2] - R2;
	B3 = fk[3] - R3;
	B4 = fk[4] - R4;
	B5 = fk[5] - R5;
	B6 = fk[6] - R6;
	B7 = fk[7] - R7;
	B8 = fk[8] - R8;
}
else
{
	float temp;
	temp = R1; R1 = R3; R3 = temp;//red fluid
	temp = R2; R2 = R4; R4 = temp;
	temp = R5; R5 = R7; R7 = temp;
	temp = R6; R6 = R8; R8 = temp;


	temp = B1; B1 = B3; B3 = temp;//blue fluid
	temp = B2; B2 = B4; B4 = temp;
	temp = B5; B5 = B7; B7 = temp;
	temp = B6; B6 = B8; B8 = temp;
}
//左右迁移
//write back data 
//字数不够了,后面应该没问题

}
代码如上。用visual profiler分析发现这个kernel函数用的寄存器数量太对导致占有率很低,限制寄存器数量之后速度加快一倍,但是global memory load速度变慢为50%,请帮忙分析以下该怎么优化啊