多个block在kernel里面循环时数据出错

加一个问题描述:N个粒子相互作用,很简单的只考虑万有引力。。

问题是:我的times为1时,两个结果完全正确。。
当我的times不为1时,我跟踪第一个线程,输出所有的加速度,发现在计算第16个粒子的时候和CPU就不同了。。

会不会是两个程序不配套??还是变量问题??

这是kernel程序,


__device__ float3 bodyBodyInteraction(float4 bi, float4 bj, float3 ai)
{
	float3 r;
	
	// r_ij [3 FLOPS]
	r.x = bj.x - bi.x;
	r.y = bj.y - bi.y;
	r.z = bj.z - bi.z;
	
	// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
	float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS;
	// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
	
	float distSixth = distSqr * distSqr * distSqr;
	float invDistCube = 1.0f/sqrtf(distSixth);
	
	// s = m_j * invDistCube [1 FLOP]
	float s = bj.w * invDistCube;
	
	// a_i = a_i + s * r_ij [6 FLOPS]
	ai.x += r.x * s;
	ai.y += r.y * s;
	ai.z += r.z * s;
	return ai;
}

__device__ float3 calculate(float4 a,float4* b,float3 accl)
{
	int i;
	for(i=0;i<N;i++)   //循环N次,挨个计算N个粒子对给定粒子的作用
	{
		accl =bodyBodyInteraction(a,b[i],accl);  //就是计算力
					if(0==threadIdx.x&&0==blockIdx.x)//&&N-1==i)//&&1==times)
			{
				//printf("GPU:pos[%d]:%f--%f--%f\n",index,pos[index].x,pos[index].y,pos[index].z);
			    printf("GPU:acc[%d]:%f--%f--%f\n",i,accl.x,accl.y,accl.z);
			}
	}
	return accl;
}

__global__ static void GPU(float4* pos,float4* vel,int number)
{
	const int tid=threadIdx.x;
	const int bid=blockIdx.x;
	const int index=bid*blockDim.x+tid;
	
   float3 acc[N] = {0.0f, 0.0f, 0.0f};
	int times;

	for(times=0;times<number;times++)  //控制跑的步数
	{

		float4 atom=pos[index];

		acc[index]=calculate(atom,pos,acc[index]); //计算其他N个粒子对和线程对应的粒子的力
	      
		vel[index].x +=(float)( acc[index].x * deltaTime);
		vel[index].y +=(float)( acc[index].y * deltaTime);
		vel[index].z +=(float)( acc[index].z * deltaTime); 

		vel[index].x *= damping;
		vel[index].y *= damping;
		vel[index].z *= damping;
	        
		pos[index].x +=(float)( vel[index].x * deltaTime);
		pos[index].y +=(float)( vel[index].y * deltaTime);
		pos[index].z +=(float)( vel[index].z * deltaTime);

		__syncthreads();//wait until all the thread have finished this times calculate

	//if(0==threadIdx.x&&0==blockIdx.x&&times==1)
	//{
	//	//printf("acc[%d]:%f--%f--%f\n",times,acc[index].x,acc[index].y,acc[index].z);
	//	int k;
	//	for(k=0;k<N;k++)
	//	{
	//		printf("pos[%d]:%f--%f--%f\n",k,pos[k].x,pos[k].y,pos[k].z);
	//	}
	//}
	}
}

以下是CPU程序

void CPU(float4* pos,float4* vel,int number)
{
   int i,j,times;
   float3 acc[N];
   for(times=0;times<number;times++)
   {
	   for(i=0;i<N;i++)
	   { 
			if(0==times)
			{		acc[i].x=0.0;acc[i].y=0.0;acc[i].z=0.0;}
		   
			float4 bi=pos[i];

			for(j=0;j<N;j++)
			{
				float3 r;

				// r_ij [3 FLOPS]
				r.x = pos[j].x - bi.x;
				r.y = pos[j].y - bi.y;
				r.z = pos[j].z - bi.z;
				
				// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
				float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS;
				// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
				
				float distSixth = distSqr * distSqr * distSqr;
				float invDistCube = 1.0f/sqrtf(distSixth);
				
				// s = m_j * invDistCube [1 FLOP]
				float s = vel[j].w * invDistCube;
				
				// a_i = a_i + s * r_ij [6 FLOPS]
				acc[i].x += r.x * s;
				acc[i].y += r.y * s;
				acc[i].z += r.z * s;
							if(0==i&&1==times)//&&N-1==j)
			{
				//printf("CPU:pos[%d]:%f--%f--%f\n",i,pos[i].x,pos[i].y,pos[i].z);
			    printf("CPU:acc[%d]:%f--%f--%f\n",j,acc[i].x,acc[i].y,acc[i].z);
			}
			  }
		      
			vel[i].x += acc[i].x * deltaTime;
			vel[i].y += acc[i].y * deltaTime;
			vel[i].z += acc[i].z * deltaTime;  

			vel[i].x *= damping;
			vel[i].y *= damping;
			vel[i].z *= damping;
		        
			// new position = old position + velocity * deltaTime
			pos[i].x += vel[i].x * deltaTime;
			pos[i].y += vel[i].y * deltaTime;
			pos[i].z += vel[i].z * deltaTime;

		//			if(0==i&&1==times)
		//{
		//	//printf("acc[%d]:%f--%f--%f\n",times,acc[i].x,acc[i].y,acc[i].z);
		//	char after_CPU[81] = "E:\\after1_CPU.txt";
	 //       write_to_file(after_CPU, pos, N);
		//}
	   }
   }
}

下面是我加入测试输出函数后的输出截图。。

[attach]663907[/attach] [attach]663908[/attach]

[ 本帖最后由 hnuzhoulin 于 2010-5-11 15:26 编辑 ]