这个问题已经解决了,但是又有新问题啊
0x10009825 处未处理的异常: 0xC0000005: 读取位置 0xcdcdcdd1 时发生访问冲突
这个一般是什么原因啊??出现在分配显存的时候
错误 14 error LNK2005: __Z15integrateBodiesP6float4S0_Pl 已经在 myfirst.obj 中定义 myfirst_kernel.obj
错误 15 error LNK2005: ___device_stub__Z15integrateBodiesP6float4S0_Pl 已经在 myfirst.obj 中定义 myfirst_kernel.obj
错误 16 error LNK2005: "void __cdecl integrateBodies__entry(struct float4 *,struct float4 *,long *)" (?integrateBodies__entry@@YAXPAUfloat4@@0PAJ@Z) 已经在 myfirst.obj 中定义 myfirst_kernel.obj
错误 17 fatal error LNK1169: 找到一个或多个多重定义的符号 C:\Documents and Settings\All Users\Application Data\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\src\Project2\EmuDebug\Project2.exe
我的原程序如下:
//this is file myfirst.cu
#include "myfirst_kernel.cu"
#include "myfirst.h"
//#include <cutil_inline.h>
#include "cutil.h"
#include <cstdio>
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
//added 2010.3.22
#include <string.h>
#include <math.h>
extern "C"
{
int main( int argc,char** argv) //the main program added by zhoulin 2010.3.4
{
CUT_DEVICE_INIT(argc, argv);
// 4 floats each for alignment reasons
unsigned int memSize = sizeof( float4) * numBodies;
//host端分配内存
clock_t * timer=NULL; //我们这里并非是计算一个block的时间,而是总共的运算时间
float4* h_pos=(float4*)malloc(numBodies);
float3* h_acc=(float3*)malloc(numBodies);
float4* h_vel=(float4*)malloc(numBodies);
//生成初试数据 bodysystemcpu.cpp
float alat=1.5496;
float disp=0.5;
float rcell[3][4]={0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.5,0.0,0.5};
srand(int(time(NULL)/2));
for(int k=0;k<8;k++)
{
for(int j=0;j<8;j++)
{
for(int i=0;i<4;i++)
{
for(int L=0;L<4;L++)
{
int index=k*8+j*8+i*4+L*4;
h_pos[index].x=alat*(i+rcell[1][L])+2.0*disp*(rand()/(float)RAND_MAX-0.5);
h_pos[index].y=alat*(i+rcell[2][L])+2.0*disp*(rand()/(float)RAND_MAX-0.5);
h_pos[index].z=alat*(i+rcell[3][L])+2.0*disp*(rand()/(float)RAND_MAX-0.5);
h_pos[index].w=1.0;
h_vel[index].x=0.0;
h_vel[index].y=0.0;
h_vel[index].z=0.0;
h_vel[index].w=0.0;
printf("位置:%f-%f-%f-%f\n",h_pos[index].x,h_pos[index].y,h_pos[index].z);
printf("速度:%f-%f-%f-%f\n",h_vel[index].x,h_vel[index].y,h_vel[index].z);
}
}
}
}
//device端分配内存
clock_t* dtimer=NULL;
cudaMalloc((void**)&dtimer, sizeof(clock_t)*16*2);
float4* d_pos=NULL;
cudaMalloc((void**)&d_pos, numBodies);
float3* d_acc=NULL;
cudaMalloc((void**)&d_acc, numBodies);
float4* d_vel=NULL;
cudaMalloc((void**)&d_vel, numBodies);
//向显存拷入数据
CUDA_SAFE_CALL(cudaMemcpy(d_pos, h_pos, memSize,cudaMemcpyHostToDevice));
//运行核函数
//int sharedMemSize = 3*p * sizeof(float4); // each 4 floats for pos,vel and acc
dim3 threads(p,1,1);
dim3 grid(16, 1, 1);
// execute the kernel: we set q=1 here.-----zhoulin 2010.3.4
integrateBodies<<< grid, threads,memSize >>>(d_pos, d_vel,dtimer);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
//将数据拷回主机内存
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,memSize, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_pos, d_pos, memSize, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_vel, d_vel, memSize, cudaMemcpyDeviceToHost));
//释放存储器
//cuda_safe_call(cudaFreeHost(h_pos));
//cuda_safe_call(cudaFreeHost(h_acc));
//cuda_safe_call(cudaFreeHost(h_vel));
free(h_pos);
free(h_acc);
free(h_vel);
CUDA_SAFE_CALL(cudaFree(d_pos));
CUDA_SAFE_CALL(cudaFree(d_acc));
CUDA_SAFE_CALL(cudaFree(d_vel));
CUDA_SAFE_CALL(cudaFree(dtimer));
//时间测试
clock_t minStart = timer[0];
clock_t maxEnd = timer[16];
for (int i = 1; i < 16; i++)
{
minStart = timer[i] < minStart ? timer[i] : minStart;
maxEnd = timer[16+i] > maxEnd ? timer[16+i] : maxEnd;
}
printf("time = %d\n", maxEnd - minStart);
CUT_EXIT(argc, argv); //exit CUDA
}
}
/* this is the file myfirst_kernel.cu
*/
#ifndef _MYFIRST_KERNEL_H_
#define _MYFIRST_KERNEL_H_
#include <math.h>
#include "myfirst.h"
#define LOOP_UNROLL 4
// Macros to simplify shared memory addressing
#define SX(i) sharedPos[i+blockDim.x*threadIdx.y]
__device__ float3 bodyBodyInteraction(float3 ai, float4 bi, float4 bj)
{
float3 r;
// r_ij [3 FLOPS]
r.x = bi.x - bj.x;
r.y = bi.y - bj.y;
r.z = bi.z - bj.z;
// distSqr = dot(r_ij, r_ij) + EPS^2 [6 FLOPS]
float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
distSqr += eps;
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float invDist = 1.0f / sqrtf(distSqr);
float invDistCube = invDist * invDist * invDist;
//float distSixth = distSqr * distSqr * distSqr;
//float invDistCube = 1.0f / sqrtf(distSixth);
// s = m_j * invDistCube [1 FLOP]
float s = bj.w * invDistCube;
// a_i = a_i + s * r_ij [6 FLOPS]
ai.x += r.x * s;
ai.y += r.y * s;
ai.z += r.z * s;
return ai;
}
// This is the "tile_calculation" function from the GPUG3 article.
__device__ float3 gravitation(float4 myPos, float3 accel)
{
extern __shared__ float4 sharedPos[];
long i=0;
for (unsigned int counter = 0; counter < blockDim.x; )
{
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter++;
#if LOOP_UNROLL > 1
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter++;
#endif
#if LOOP_UNROLL > 2
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter += 2;
#endif
#if LOOP_UNROLL > 4
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
accel = bodyBodyInteraction(accel, SX(i++), myPos);
counter += 4;
#endif
}
return accel;
}
__device__ float3 computeBodyAccel(float4 bodyPos, float4* positions)
{
extern __shared__ float4 sharedPos[];
float3 acc = {0.0f, 0.0f, 0.0f};
int numTiles = numBodies / (p * q);
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
for (int tile = blockIdx.y; tile < numTiles + blockIdx.y; tile++)
{
sharedPos[gtid] = positions[gtid];
__syncthreads();
// This is the "tile_calculation" function from the GPUG3 article.
acc = gravitation(bodyPos, acc);
__syncthreads();
}
return acc;
}
__global__ void
integrateBodies(float4* oldPos, float4* oldVel,clock_t* timer)
{
extern __shared__ float4 sharedPos[];
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if (threadIdx.x==0) timer[blockIdx.x]=clock();
float4 pos = oldPos[index];
float3 accel = computeBodyAccel(pos, oldPos);
float4 vel = oldVel[index];
vel.x += accel.x * deltaTime;
vel.y += accel.y * deltaTime;
vel.z += accel.z * deltaTime;
vel.x *= damping;
vel.y *= damping;
vel.z *= damping;
// new position = old position + velocity * deltaTime
pos.x += vel.x * deltaTime;
pos.y += vel.y * deltaTime;
pos.z += vel.z * deltaTime;
// store new position and velocity
oldPos[index] = pos;
oldVel[index] = vel;
if (threadIdx.x==0) timer[blockIdx.x+blockDim.x]=clock();
}
#endif // #ifndef _NBODY_KERNEL_H_
#ifndef _MYFIRST_H_
#define _MYFIRST_H_
#define p 64
#define q 1
#define numBodies 1024
#define deltaTime 0.01
#define damping 0.5
#define eps 0.001
#endif
[ 本帖最后由 hnuzhoulin 于 2010-3-30 15:38 编辑 ]