编译出错，很奇怪的，求教

system · 2010 年3 月 30 日 05:32

这个问题已经解决了，但是又有新问题啊
0x10009825 处未处理的异常: 0xC0000005: 读取位置 0xcdcdcdd1 时发生访问冲突
这个一般是什么原因啊？？出现在分配显存的时候

错误	14	error LNK2005: __Z15integrateBodiesP6float4S0_Pl 已经在 myfirst.obj 中定义	myfirst_kernel.obj	
错误	15	error LNK2005: ___device_stub__Z15integrateBodiesP6float4S0_Pl 已经在 myfirst.obj 中定义	myfirst_kernel.obj	
错误	16	error LNK2005: "void __cdecl integrateBodies__entry(struct float4 *,struct float4 *,long *)" (?integrateBodies__entry@@YAXPAUfloat4@@0PAJ@Z) 已经在 myfirst.obj 中定义	myfirst_kernel.obj	
错误	17	fatal error LNK1169: 找到一个或多个多重定义的符号	C:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\src\Project2\EmuDebug\Project2.exe

我的原程序如下：

//this is file myfirst.cu 
#include "myfirst_kernel.cu"
#include "myfirst.h"
//#include <cutil_inline.h>
#include "cutil.h"
#include <cstdio>
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
//added 2010.3.22
#include <string.h>
#include <math.h>
extern "C"
{
   int main( int argc,char** argv)  //the main program added by zhoulin 2010.3.4
   {
	CUT_DEVICE_INIT(argc, argv);
	
	// 4 floats each for alignment reasons
	unsigned int memSize = sizeof( float4) * numBodies;

	//host端分配内存
	clock_t * timer=NULL;   //我们这里并非是计算一个block的时间，而是总共的运算时间
	float4* h_pos=(float4*)malloc(numBodies);
	float3* h_acc=(float3*)malloc(numBodies);
	float4* h_vel=(float4*)malloc(numBodies);
	//生成初试数据  bodysystemcpu.cpp
   float alat=1.5496;
   float disp=0.5;
   float rcell[3][4]={0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.5,0.0,0.5};
   srand(int(time(NULL)/2));
   for(int k=0;k<8;k++)
   { 
   for(int j=0;j<8;j++)
   {      
   for(int i=0;i<4;i++)
   {    
   for(int L=0;L<4;L++)
   { 
			int index=k*8+j*8+i*4+L*4;
			h_pos[index].x=alat*(i+rcell[1][L])+2.0*disp*(rand()/(float)RAND_MAX-0.5);
			h_pos[index].y=alat*(i+rcell[2][L])+2.0*disp*(rand()/(float)RAND_MAX-0.5);
			h_pos[index].z=alat*(i+rcell[3][L])+2.0*disp*(rand()/(float)RAND_MAX-0.5);
			h_pos[index].w=1.0;
			h_vel[index].x=0.0;
			h_vel[index].y=0.0;
			h_vel[index].z=0.0;
			h_vel[index].w=0.0;
			printf("位置：%f-%f-%f-%f\n",h_pos[index].x,h_pos[index].y,h_pos[index].z);
			printf("速度：%f-%f-%f-%f\n",h_vel[index].x,h_vel[index].y,h_vel[index].z);
   }
   }
   }
   }
	//device端分配内存
   clock_t* dtimer=NULL;
	cudaMalloc((void**)&dtimer, sizeof(clock_t)*16*2);
   float4* d_pos=NULL;
	cudaMalloc((void**)&d_pos, numBodies);
	float3* d_acc=NULL;
	cudaMalloc((void**)&d_acc, numBodies);
	float4* d_vel=NULL;
	cudaMalloc((void**)&d_vel, numBodies);
	//向显存拷入数据
	CUDA_SAFE_CALL(cudaMemcpy(d_pos, h_pos, memSize,cudaMemcpyHostToDevice));
		//运行核函数
	//int sharedMemSize = 3*p * sizeof(float4); // each 4 floats for pos,vel and acc
   dim3 threads(p,1,1);
	dim3 grid(16, 1, 1);
	   // execute the kernel: we set q=1 here.-----zhoulin 2010.3.4
	integrateBodies<<< grid, threads,memSize >>>(d_pos, d_vel,dtimer);
   	// check if kernel invocation generated an error
	 CUT_CHECK_ERROR("Kernel execution failed");
	//将数据拷回主机内存
   CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));
   CUDA_SAFE_CALL(cudaMemcpy(h_pos, d_pos, memSize, cudaMemcpyDeviceToHost));
	CUDA_SAFE_CALL(cudaMemcpy(h_vel, d_vel, memSize, cudaMemcpyDeviceToHost));
	//释放存储器
	//cuda_safe_call(cudaFreeHost(h_pos));
   //cuda_safe_call(cudaFreeHost(h_acc));
	//cuda_safe_call(cudaFreeHost(h_vel));
	free(h_pos);
   free(h_acc);
	free(h_vel);
	CUDA_SAFE_CALL(cudaFree(d_pos));
	CUDA_SAFE_CALL(cudaFree(d_acc));
	CUDA_SAFE_CALL(cudaFree(d_vel));
	CUDA_SAFE_CALL(cudaFree(dtimer));
	//时间测试
   clock_t minStart = timer[0];
   clock_t maxEnd = timer[16];
   for (int i = 1; i < 16; i++)
   { 
   minStart = timer[i] < minStart ? timer[i] : minStart; 
   maxEnd = timer[16+i] > maxEnd ? timer[16+i] : maxEnd;
   }
   printf("time = %d\n", maxEnd - minStart);
	CUT_EXIT(argc, argv);  //exit CUDA
   }
}

/* this is the file myfirst_kernel.cu
*/
#ifndef _MYFIRST_KERNEL_H_
#define _MYFIRST_KERNEL_H_
#include <math.h>
#include "myfirst.h"
#define LOOP_UNROLL 4
// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]
__device__ float3 bodyBodyInteraction(float3 ai, float4 bi, float4 bj) 
{
   float3 r;
   // r_ij  [3 FLOPS]
   r.x = bi.x - bj.x;
   r.y = bi.y - bj.y;
   r.z = bi.z - bj.z;
   // distSqr = dot(r_ij, r_ij) + EPS^2  [6 FLOPS]
   float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
   distSqr += eps;
   // invDistCube =1/distSqr^(3/2)  [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
   float invDist = 1.0f / sqrtf(distSqr);
	float invDistCube =  invDist * invDist * invDist;
   //float distSixth = distSqr * distSqr * distSqr;
   //float invDistCube = 1.0f / sqrtf(distSixth);
   // s = m_j * invDistCube [1 FLOP]
   float s = bj.w * invDistCube;
   // a_i =  a_i + s * r_ij [6 FLOPS]
   ai.x += r.x * s;
   ai.y += r.y * s;
   ai.z += r.z * s;
   return ai;
}

// This is the "tile_calculation" function from the GPUG3 article.
__device__ float3 gravitation(float4 myPos, float3 accel)
{
   extern __shared__ float4 sharedPos[];
   long i=0;
   for (unsigned int counter = 0; counter < blockDim.x; ) 
   {
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
	counter++;
#if LOOP_UNROLL > 1
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
	counter++;
#endif
#if LOOP_UNROLL > 2
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
	counter += 2;
#endif
#if LOOP_UNROLL > 4
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
   accel = bodyBodyInteraction(accel, SX(i++), myPos); 
	counter += 4;
#endif
   }
   return accel;
}
__device__ float3 computeBodyAccel(float4 bodyPos, float4* positions)
{
   extern __shared__ float4 sharedPos[];

   float3 acc = {0.0f, 0.0f, 0.0f};
   
   int numTiles = numBodies / (p * q);
   int gtid = blockIdx.x * blockDim.x + threadIdx.x;
   for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++) 
   {
   sharedPos[gtid] = positions[gtid];       
   __syncthreads();
   // This is the "tile_calculation" function from the GPUG3 article.
   acc = gravitation(bodyPos, acc);
   __syncthreads();
   }
   return acc;
}

__global__ void
integrateBodies(float4* oldPos, float4* oldVel,clock_t* timer)
{
	extern __shared__ float4 sharedPos[];

   int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; 
	if (threadIdx.x==0)  timer[blockIdx.x]=clock();
   float4 pos = oldPos[index];   
   float3 accel = computeBodyAccel(pos, oldPos);
   float4 vel = oldVel[index];
   
   vel.x += accel.x * deltaTime;
   vel.y += accel.y * deltaTime;
   vel.z += accel.z * deltaTime;  

   vel.x *= damping;
   vel.y *= damping;
   vel.z *= damping;
   
   // new position = old position + velocity * deltaTime
   pos.x += vel.x * deltaTime;
   pos.y += vel.y * deltaTime;
   pos.z += vel.z * deltaTime;

   // store new position and velocity
   oldPos[index] = pos;
   oldVel[index] = vel;
	if (threadIdx.x==0)  timer[blockIdx.x+blockDim.x]=clock();
}
#endif // #ifndef _NBODY_KERNEL_H_


#ifndef _MYFIRST_H_
#define _MYFIRST_H_

#define p 64
#define q 1
#define numBodies  1024
#define deltaTime  0.01  
#define damping  0.5    
#define eps  0.001 
#endif

[ 本帖最后由 hnuzhoulin 于 2010-3-30 15:38 编辑 ]

system · 2010 年3 月 30 日 06:18

物体的加速度由受力决定，而物体的总受力是其它物体对它施力的和。我没有在你的代码中看到这个，呵呵！

system · 2010 年3 月 30 日 06:18

去掉被包含的cu文件的编译选项~
右键-》去掉编译选项

system · 2010 年3 月 30 日 07:37

看了高性能社区里面的帖子，这个问题解决了。。。

谢谢，嘿嘿，但是又有新问题啊。

0x10009825 处未处理的异常: 0xC0000005: 读取位置 0xcdcdcdd1 时发生访问冲突

这个一般是什么原因啊？？

system · 2010 年3 月 30 日 07:41

我是直接将kernel文件右键->从项目中排除的

在linux下怎么办啊？用的是make，不会有什么问题吧，今天没法用linux系统，明天试一试。

system · 2010 年3 月 30 日 07:45

这个循环是一次遍历各个block：
for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++)
{
sharedPos[gtid] = positions[gtid];
__syncthreads();
// This is the “tile_calculation” function from the GPUG3 article.
acc = gravitation(bodyPos, acc);
__syncthreads();
}

然后在acc = gravitation(bodyPos, acc);函数里面就是在一个block里面进行加速度的求和，依次累加。最后的结果就是总合力。

呵呵，不知道行不行啊？？

system · 2010 年3 月 31 日 04:22

linux下应该没问题吧

system · 2010 年4 月 4 日 05:13

在linux下面却是没有内存溢出的错误，但是
myfirst.cu(19): warning: linkage specification is not allowed
ptxas /tmp/tmpxft_0000439a_00000000-2_myfirst.ptx, line 140; warning : Double is not supported. Demoting to float
myfirst.cu(19): warning: linkage specification is not allowed

其中，Double is not supported. Demoting to float我可以理解，是没有double型，但是我记得我都是用的float啊？在xp下也有类似的警告警告 4 warning C4244: ‘=’ : conversion from ‘double’ to ‘float’, possible loss of data c:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\src\Project2\Project2\myfirst.cu 46
它指示我有问题的代码行是

h_pos[index].x=alat*(i+rcell[1][L])+2.0*disp*(rand()/(float)RAND_MAX-0.5);

vel.x += accel.x * deltaTime;

这样写应该没有问题吧。。

system · 2010 年4 月 4 日 07:33

0.5是double

system · 2010 年4 月 4 日 10:17

那要怎么写啊？
写成0.5f ？？？

system · 2010 年4 月 4 日 12:35

是的

system · 2010 年4 月 5 日 08:55

那如果是表达式怎么办啊？？
就是这个啊
vel.z += acc4.z * deltaTime;
我改成vel.z += (float)acc4.z * deltaTime; 也报错啊

system · 2010 年4 月 5 日 09:11

从表达式上看，应当没错啊！

system · 2010 年4 月 5 日 10:02

我将后面的一个表达式也加了括号，就不报错了，呵呵，谢谢啊
vel.z += (float)(acc4.z * deltaTime)

system · 2010 年4 月 5 日 11:11

呵呵！明白了，你的deltaTime是宏定义的，没加f。