kernel 部分计算结果不正确

代码如下:
……
// Problem sizes, fixed at compile time.
// NUM_POINT_GRAPH     : entries in the id table; fx holds 12 floats for each.
// NUM_POINT_NOT_GRAPH : entries in vng (4 ints each in vv); one kernel thread each.
// NUM_POINT           : total points; v and vnew hold 3 floats for each.
#define NUM_POINT_GRAPH (200)
#define NUM_POINT_NOT_GRAPH (21687)
#define NUM_POINT (NUM_POINT_NOT_GRAPH + NUM_POINT_GRAPH)
/*
 * solveKernel -- computes a new position for every point listed in vng.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per vng entry. Any block
 * size works; surplus threads in the last block exit immediately.
 *
 * Parameters (all device pointers):
 *   v    - input positions, 3 floats per point.
 *   vng  - for thread i, vng[i] is the index (into v / vnew) of the point to
 *          compute. Must hold NUM_POINT_NOT_GRAPH entries.
 *   vv   - 4 ints per thread: vv[i*4+0..2] are the three nodes that are
 *          blended, vv[i*4+3] is the node whose distance normalises the
 *          weights. Values index into id and fx.
 *   id   - maps a node number to its point index in v.
 *   fx   - 12 floats per node: entries 0..8 are used as a row-major 3x3
 *          matrix applied to (point - node position), entries 9..11 are
 *          added as an offset.
 *   vnew - output positions, 3 floats per point. Only the entries named by
 *          vng are written; all others are left untouched.
 *
 * Per-node weight: w = (sqrt(dmax) - sqrt(d))^2 / dmax, where d is the
 * squared distance to the node and dmax the squared distance to the 4th
 * node; the three weights are then normalised to sum to 1.
 */
__global__ void solveKernel(float* v,int* vng,int* vv,int* id,float* fx,float* vnew)
{
    // Flat global thread index (blockDim.x instead of a hard-coded 512).
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so the trailing threads have no
    // entry in vng / vv and must not touch memory.
    if (i >= NUM_POINT_NOT_GRAPH)
        return;

    const int i_v = vng[i];
    const float pos[3] = { v[i_v * 3 + 0], v[i_v * 3 + 1], v[i_v * 3 + 2] };

    // Squared distance to the 4th node: the normalisation radius.
    const int num2 = vv[i * 4 + 3];
    const int g_i2 = id[num2];
    const float k1[3] = { v[g_i2 * 3 + 0], v[g_i2 * 3 + 1], v[g_i2 * 3 + 2] };
    const float dmax = (pos[0] - k1[0]) * (pos[0] - k1[0])
                     + (pos[1] - k1[1]) * (pos[1] - k1[1])
                     + (pos[2] - k1[2]) * (pos[2] - k1[2]);
    const float rmax = sqrtf(dmax);   // hoisted: identical for all three nodes

    int   node[3];      // node numbers, reused in the accumulation pass
    float posD[3][3];   // node positions, cached to avoid a second global read
    float w[3];
    float sum = 0.0f;

    for (int j = 0; j < 3; ++j)
    {
        const int num = vv[i * 4 + j];
        const int g_i = id[num];
        node[j] = num;
        posD[j][0] = v[g_i * 3 + 0];
        posD[j][1] = v[g_i * 3 + 1];
        posD[j][2] = v[g_i * 3 + 2];

        const float d = (pos[0] - posD[j][0]) * (pos[0] - posD[j][0])
                      + (pos[1] - posD[j][1]) * (pos[1] - posD[j][1])
                      + (pos[2] - posD[j][2]) * (pos[2] - posD[j][2]);
        const float diff = rmax - sqrtf(d);
        w[j] = diff * diff / dmax;
        sum += w[j];
    }

    // Normalise. If dmax or the weight sum is zero the division would yield
    // NaN/Inf, so fall back to equal weights in that degenerate case.
    if (dmax > 0.0f && sum > 0.0f)
    {
        for (int j = 0; j < 3; ++j)
            w[j] /= sum;
    }
    else
    {
        for (int j = 0; j < 3; ++j)
            w[j] = 1.0f / 3.0f;
    }

    float v0 = 0.0f;
    float v1 = 0.0f;
    float v2 = 0.0f;

    for (int j = 0; j < 3; ++j)
    {
        const float* a = fx + node[j] * 12;   // this node's 12 coefficients
        const float rx = pos[0] - posD[j][0];
        const float ry = pos[1] - posD[j][1];
        const float rz = pos[2] - posD[j][2];

        v0 += w[j] * (a[0] * rx + a[1] * ry + a[2] * rz + posD[j][0] + a[9]);
        v1 += w[j] * (a[3] * rx + a[4] * ry + a[5] * rz + posD[j][1] + a[10]);
        v2 += w[j] * (a[6] * rx + a[7] * ry + a[8] * rz + posD[j][2] + a[11]);
    }

    vnew[i_v * 3 + 0] = v0;
    vnew[i_v * 3 + 1] = v1;
    vnew[i_v * 3 + 2] = v2;
}
#include <iostream>   // std::cerr for the error reports below; hoist to the
                      // file's include section if preferred

// Evaluates a CUDA runtime call; on failure prints the error string with the
// source line and makes the enclosing function return 1.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            std::cerr << "CUDA error at line " << __LINE__ << ": "         \
                      << cudaGetErrorString(cudaCheckErr_) << std::endl;   \
            return 1;                                                      \
        }                                                                  \
    } while (0)
#endif

/*
 * Uploads the host arrays v, vng, vv, id and fx (defined in the part of the
 * file not shown here), runs solveKernel once, copies the result back and
 * writes it to "vnew.txt", one value per line followed by a comma.
 *
 * Returns 0 on success and 1 on any CUDA, allocation or file error. On an
 * error path the buffers are not released explicitly; the process is about
 * to exit anyway.
 */
int main(){
    const size_t pointBytes = (size_t)NUM_POINT * 3 * sizeof(float);
    const size_t vngBytes   = (size_t)NUM_POINT_NOT_GRAPH * sizeof(int);
    const size_t vvBytes    = (size_t)NUM_POINT_NOT_GRAPH * 4 * sizeof(int);
    const size_t idBytes    = (size_t)NUM_POINT_GRAPH * sizeof(int);
    const size_t fxBytes    = (size_t)NUM_POINT_GRAPH * 12 * sizeof(float);

    float* d_v;
    CUDA_CHECK(cudaMalloc((void**)&d_v, pointBytes));
    int* d_vng;
    CUDA_CHECK(cudaMalloc((void**)&d_vng, vngBytes));
    int* d_vv;
    CUDA_CHECK(cudaMalloc((void**)&d_vv, vvBytes));
    int* d_id;
    CUDA_CHECK(cudaMalloc((void**)&d_id, idBytes));
    float* d_fx;
    CUDA_CHECK(cudaMalloc((void**)&d_fx, fxBytes));
    float* d_vnew;
    CUDA_CHECK(cudaMalloc((void**)&d_vnew, pointBytes));

    float* vnew = (float*)malloc(pointBytes);
    if (vnew == 0)
    {
        std::cerr << "host allocation of vnew failed" << std::endl;
        return 1;
    }

    CUDA_CHECK(cudaMemcpy(d_v, v, pointBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vng, vng, vngBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vv, vv, vvBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_id, id, idBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_fx, fx, fxBytes, cudaMemcpyHostToDevice));

    // The kernel writes only the points listed in vng. Zero the whole output
    // first so the remaining entries are not dumped as uninitialised memory.
    // NOTE(review): if those points should carry their input position
    // instead, replace this with a device-to-device copy of d_v.
    CUDA_CHECK(cudaMemset(d_vnew, 0, pointBytes));

    // 512 threads per block is kept because the kernel has historically
    // hard-coded that stride; the grid is rounded up to cover every entry.
    int dimBlock = 512;
    int dimGrid = (NUM_POINT_NOT_GRAPH + dimBlock - 1) / dimBlock;
    solveKernel<<<dimGrid, dimBlock>>>(d_v, d_vng, d_vv, d_id, d_fx, d_vnew);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // faults raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(vnew, d_vnew, pointBytes, cudaMemcpyDeviceToHost));

    std::ofstream out("vnew.txt");
    if (!out)
    {
        std::cerr << "cannot open vnew.txt for writing" << std::endl;
        return 1;
    }
    for (int i = 0; i < NUM_POINT * 3; i++)
    {
        out << vnew[i] << ',' << '\n';
    }

    CUDA_CHECK(cudaFree(d_v));
    CUDA_CHECK(cudaFree(d_vng));
    CUDA_CHECK(cudaFree(d_vv));
    CUDA_CHECK(cudaFree(d_id));
    CUDA_CHECK(cudaFree(d_fx));
    CUDA_CHECK(cudaFree(d_vnew));
    free(vnew);
    return 0;
}
kernel 部分计算得不对,输出的 vnew 数组全部是错误的。请教大家哪里出了问题,谢谢!

[ 本帖最后由 tricial2010 于 2010-7-8 10:18 编辑 ]