下面是代码,我感觉是不是我分配的空间太多了啊??
显存一个G,对于这个来说,应该是足够的了啊,我觉得,溢出的话怎么解决啊?
#include "myfirst_kernel.cu"
#include "cutil.h"
#include <cstdio>
#include <cstdlib>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <math.h>
int main( int argc,char** argv) //the main program added by zhoulin 2010.3.4
{
CUT_DEVICE_INIT(argc, argv);
// 4 floats each for alignment reasons
unsigned int memSize = sizeof( float) * numBodies;
//host端分配内存
clock_t* timer=(clock_t*)malloc(32*sizeof(clock_t)); //我们这里并非是计算一个block的时间,而是总共的运算时间
if(timer==NULL) {printf("memory of timer is fault");exit(0);}
float4* h_pos=(float4*)malloc(numBodies*sizeof(float));
if(h_pos==NULL) {printf("memory of h_pos is fault");exit(0);}
float4* h_vel=(float4*)malloc(numBodies*sizeof(float));
if(h_vel==NULL) {printf("memory of h_vel is fault");exit(0);}
//生成初试数据 bodysystemcpu.cpp
float alat=1.5496f;
float disp=0.5f;
int index=0;
float rcell[3][4]={0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.5,0.0,0.5};
srand(int(time(NULL)/2));
for(int k=0;k<2;k++) //8
{
for(int j=0;j<2;j++)//8
{
for(int i=0;i<4;i++)
{
for(int L=0;L<4;L++)
{
h_pos[index].x=alat*(i+rcell[1][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);
h_pos[index].y=alat*(i+rcell[2][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);
h_pos[index].z=alat*(i+rcell[3][L])+2.0f*disp*(rand()/(float)RAND_MAX-0.5f);
h_pos[index].w=1.0f;
h_vel[index].x=0.0f;
h_vel[index].y=0.0f;
h_vel[index].z=0.0f;
h_vel[index].w=0.0f;
index++;
}
}
}
}
//device端分配内存
clock_t* dtimer;
CUDA_SAFE_CALL(cudaMalloc((void**)&dtimer, sizeof(clock_t)*16*2));
float4* d_vel;
CUDA_SAFE_CALL(cudaMalloc((float4**)&d_vel,numBodies*sizeof(float));
float4* d_pos;
CUDA_SAFE_CALL(cudaMalloc((float3**)&pos, numBodies*sizeof(float));
//向显存拷入数据
CUDA_SAFE_CALL(cudaMemcpy(d_pos, h_pos, memSize,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_vel, h_vel, memSize,cudaMemcpyHostToDevice));
//运行核函数
dim3 grid(16, 1, 1);
// execute the kernel: we set q=1 here.-----zhoulin 2010.3.4
integrateBodies<<< grid, threads, memSize >>>(d_pos, d_vel,dtimer);
// check if kernel invocation generated an error
CUT_CHECK_ERROR("Kernel execution failed");
//将数据拷回主机内存
CUDA_SAFE_CALL(cudaMemcpy(timer, dtimer,sizeof(clock_t)*32, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_pos, d_pos, memSize, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_vel, d_vel, memSize, cudaMemcpyDeviceToHost));
//释放存储器,但是运行时不管有没有以下的free语句,错误提示相同
free(h_pos);
free(h_acc);
free(h_vel);
CUDA_SAFE_CALL(cudaFree(d_pos));
CUDA_SAFE_CALL(cudaFree(d_acc));
CUDA_SAFE_CALL(cudaFree(d_vel));
CUDA_SAFE_CALL(cudaFree(dtimer));
//时间测试
clock_t minStart = timer[0];
clock_t maxEnd = timer[16];
for (int i = 1; i < 16; i++)
{
minStart = timer[i] < minStart ? timer[i] : minStart;
maxEnd = timer[16+i] > maxEnd ? timer[16+i] : maxEnd;
}
printf("time = %d\n", maxEnd - minStart);
CUT_EXIT(argc, argv); //exit CUDA
}
//}
[ 本帖最后由 hnuzhoulin 于 2010-4-5 16:52 编辑 ]