加一个问题描述:N个粒子相互作用,很简单的只考虑万有引力。。
问题是:我的times为1时,两个结果完全正确。。
当我的times不为1时,我跟踪第一个线程,输出所有的加速度,发现在计算第16个粒子的时候和CPU就不同了。。
会不会是两个程序不配套??还是变量问题??
这是kernel程序,
__device__ float3 bodyBodyInteraction(float4 bi, float4 bj, float3 ai)
{
float3 r;
// r_ij [3 FLOPS]
r.x = bj.x - bi.x;
r.y = bj.y - bi.y;
r.z = bj.z - bi.z;
// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS;
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float distSixth = distSqr * distSqr * distSqr;
float invDistCube = 1.0f/sqrtf(distSixth);
// s = m_j * invDistCube [1 FLOP]
float s = bj.w * invDistCube;
// a_i = a_i + s * r_ij [6 FLOPS]
ai.x += r.x * s;
ai.y += r.y * s;
ai.z += r.z * s;
return ai;
}
__device__ float3 calculate(float4 a,float4* b,float3 accl)
{
int i;
for(i=0;i<N;i++) //循环N次,挨个计算N个粒子对给定粒子的作用
{
accl =bodyBodyInteraction(a,b[i],accl); //就是计算力
if(0==threadIdx.x&&0==blockIdx.x)//&&N-1==i)//&&1==times)
{
//printf("GPU:pos[%d]:%f--%f--%f\n",index,pos[index].x,pos[index].y,pos[index].z);
printf("GPU:acc[%d]:%f--%f--%f\n",i,accl.x,accl.y,accl.z);
}
}
return accl;
}
__global__ static void GPU(float4* pos,float4* vel,int number)
{
const int tid=threadIdx.x;
const int bid=blockIdx.x;
const int index=bid*blockDim.x+tid;
float3 acc[N] = {0.0f, 0.0f, 0.0f};
int times;
for(times=0;times<number;times++) //控制跑的步数
{
float4 atom=pos[index];
acc[index]=calculate(atom,pos,acc[index]); //计算其他N个粒子对和线程对应的粒子的力
vel[index].x +=(float)( acc[index].x * deltaTime);
vel[index].y +=(float)( acc[index].y * deltaTime);
vel[index].z +=(float)( acc[index].z * deltaTime);
vel[index].x *= damping;
vel[index].y *= damping;
vel[index].z *= damping;
pos[index].x +=(float)( vel[index].x * deltaTime);
pos[index].y +=(float)( vel[index].y * deltaTime);
pos[index].z +=(float)( vel[index].z * deltaTime);
__syncthreads();//wait until all the thread have finished this times calculate
//if(0==threadIdx.x&&0==blockIdx.x&×==1)
//{
// //printf("acc[%d]:%f--%f--%f\n",times,acc[index].x,acc[index].y,acc[index].z);
// int k;
// for(k=0;k<N;k++)
// {
// printf("pos[%d]:%f--%f--%f\n",k,pos[k].x,pos[k].y,pos[k].z);
// }
//}
}
}
以下是CPU程序
void CPU(float4* pos,float4* vel,int number)
{
int i,j,times;
float3 acc[N];
for(times=0;times<number;times++)
{
for(i=0;i<N;i++)
{
if(0==times)
{ acc[i].x=0.0;acc[i].y=0.0;acc[i].z=0.0;}
float4 bi=pos[i];
for(j=0;j<N;j++)
{
float3 r;
// r_ij [3 FLOPS]
r.x = pos[j].x - bi.x;
r.y = pos[j].y - bi.y;
r.z = pos[j].z - bi.z;
// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS;
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float distSixth = distSqr * distSqr * distSqr;
float invDistCube = 1.0f/sqrtf(distSixth);
// s = m_j * invDistCube [1 FLOP]
float s = vel[j].w * invDistCube;
// a_i = a_i + s * r_ij [6 FLOPS]
acc[i].x += r.x * s;
acc[i].y += r.y * s;
acc[i].z += r.z * s;
if(0==i&&1==times)//&&N-1==j)
{
//printf("CPU:pos[%d]:%f--%f--%f\n",i,pos[i].x,pos[i].y,pos[i].z);
printf("CPU:acc[%d]:%f--%f--%f\n",j,acc[i].x,acc[i].y,acc[i].z);
}
}
vel[i].x += acc[i].x * deltaTime;
vel[i].y += acc[i].y * deltaTime;
vel[i].z += acc[i].z * deltaTime;
vel[i].x *= damping;
vel[i].y *= damping;
vel[i].z *= damping;
// new position = old position + velocity * deltaTime
pos[i].x += vel[i].x * deltaTime;
pos[i].y += vel[i].y * deltaTime;
pos[i].z += vel[i].z * deltaTime;
// if(0==i&&1==times)
//{
// //printf("acc[%d]:%f--%f--%f\n",times,acc[i].x,acc[i].y,acc[i].z);
// char after_CPU[81] = "E:\\after1_CPU.txt";
// write_to_file(after_CPU, pos, N);
//}
}
}
}
下面是我加入测试输出函数后的输出截图。。
[attach]663907[/attach] [attach]663908[/attach]
[ 本帖最后由 hnuzhoulin 于 2010-5-11 15:26 编辑 ]