CUDA 在显卡上分配显存耗时的问题

本人初学CUDA,有个关于cuda开辟显卡空间的问题。
硬件:Tesla C2050 / C2070
程序为简单的向量相加。诡异的是:第一次只为一个 int 分配显存,耗时却比随后为三个 float 数组分配显存的耗时大很多,求解。
附代码如下:
#include <stdio.h>

#include <stdlib.h>
#include <sys/time.h>
// CUDA-C includes
#include <cuda_runtime.h>
#include <cublas.h>

#define THREAD_NUM 512
#define BLOCK_NUM 32

#define MUL 1000000

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every 0 <= i < *n.
 *
 * n        pointer to the element count (dereferenced inside the kernel, so
 *          it must be readable from device code)
 * a, b     input vectors, at least *n elements each
 * c        output vector, at least *n elements
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its flat
 * global index and advances by the total number of launched threads, so any
 * grid/block size covers all *n elements.
 */
__global__ void vecadd(int *n, float *a, float *b, float *c)
{
    /* Read the count once; the original compared the index with the pointer itself. */
    const int count = *n;
    /* Total number of threads in the launch = distance between a thread's elements. */
    const int stride = gridDim.x * blockDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        c[i] = a[i] + b[i];
    }
}
/* Abort with a readable message when a CUDA runtime call fails. */
#define VECADD_CUDA_CHECK(call)                                              \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/* Wall-clock seconds elapsed between two gettimeofday() samples. */
static double elapsedSeconds(const struct timeval *start, const struct timeval *end)
{
    /* Computed in double so long intervals cannot overflow an int microsecond count. */
    return (double)(end->tv_sec - start->tv_sec)
         + (double)(end->tv_usec - start->tv_usec) / (double)MUL;
}

/*
 * Times each phase of a GPU vector addition (device initialisation, device
 * allocation, host-to-device copy, kernel, device-to-host copy, free) and
 * prints every interval in seconds.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
 */
int main(int iargc, char **argv)
{
    (void)iargc;
    (void)argv;

    int *n_d, n_h = 10000000;
    float *a_d, *v_d, *w_d;
    float *a_h, *v_h, *w_h;
    const size_t bytes = (size_t)n_h * sizeof(float);

    struct timeval startT, endT;

    a_h = (float *)malloc(bytes);
    v_h = (float *)malloc(bytes);
    w_h = (float *)malloc(bytes);
    if (a_h == NULL || v_h == NULL || w_h == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < n_h; ++i) {
        a_h[i] = 1.10f;
        v_h[i] = 1.10f;
    }

    /* Select a device */
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount < 1) {
        printf("There is no device support CUDA!\n");
        exit(EXIT_FAILURE);
    }
    /* Keep the original preference for device 1, but fall back to device 0
       on single-GPU machines where index 1 does not exist. */
    const int device = (deviceCount > 1) ? 1 : 0;

    gettimeofday(&startT, NULL);
    VECADD_CUDA_CHECK(cudaSetDevice(device));
    /* The runtime may defer context creation to the first call that needs
       one; cudaFree(0) forces it here so the cost is reported as
       initialisation rather than being charged to the first cudaMalloc. */
    VECADD_CUDA_CHECK(cudaFree(0));
    gettimeofday(&endT, NULL);
    printf("time of Device Initialize : %f s\n", elapsedSeconds(&startT, &endT));

    /* Malloc space on GPU device */
    gettimeofday(&startT, NULL);
    VECADD_CUDA_CHECK(cudaMalloc((void **)&n_d, sizeof(int)));
    gettimeofday(&endT, NULL);
    printf("time of Device first memory allocating : %f s\n", elapsedSeconds(&startT, &endT));

    gettimeofday(&startT, NULL);
    VECADD_CUDA_CHECK(cudaMalloc((void **)&a_d, bytes));
    VECADD_CUDA_CHECK(cudaMalloc((void **)&v_d, bytes));
    VECADD_CUDA_CHECK(cudaMalloc((void **)&w_d, bytes));
    gettimeofday(&endT, NULL);
    printf("time of Device memory allocating : %f s\n", elapsedSeconds(&startT, &endT));

    /* Copy data to GPU device */
    gettimeofday(&startT, NULL);
    VECADD_CUDA_CHECK(cudaMemcpy(n_d, &n_h, sizeof(int), cudaMemcpyHostToDevice));
    VECADD_CUDA_CHECK(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice));
    VECADD_CUDA_CHECK(cudaMemcpy(v_d, v_h, bytes, cudaMemcpyHostToDevice));
    gettimeofday(&endT, NULL);
    printf("time of Copy data to Device : %f s\n", elapsedSeconds(&startT, &endT));

    /* Run the kernel; the launch is asynchronous, so synchronise before
       stopping the clock. */
    gettimeofday(&startT, NULL);
    vecadd<<<BLOCK_NUM, THREAD_NUM, 0>>>(n_d, a_d, v_d, w_d);
    VECADD_CUDA_CHECK(cudaGetLastError());      /* launch-configuration errors */
    VECADD_CUDA_CHECK(cudaDeviceSynchronize()); /* errors raised while the kernel ran */
    gettimeofday(&endT, NULL);
    printf("time of vecadd : %f s\n", elapsedSeconds(&startT, &endT));

    /* Copy the result back: destination is the host buffer, source the device buffer. */
    gettimeofday(&startT, NULL);
    VECADD_CUDA_CHECK(cudaMemcpy(w_h, w_d, bytes, cudaMemcpyDeviceToHost));
    gettimeofday(&endT, NULL);
    printf("time of proccess result : %f s\n", elapsedSeconds(&startT, &endT));

    gettimeofday(&startT, NULL);
    VECADD_CUDA_CHECK(cudaFree(n_d));
    VECADD_CUDA_CHECK(cudaFree(a_d));
    VECADD_CUDA_CHECK(cudaFree(v_d));
    VECADD_CUDA_CHECK(cudaFree(w_d));
    gettimeofday(&endT, NULL);
    printf("time of free Device memory : %f s\n", elapsedSeconds(&startT, &endT));

    VECADD_CUDA_CHECK(cudaDeviceReset());

    free(a_h);
    free(v_h);
    free(w_h);

    return 0;
}

这个是正常的:CUDA 运行时在第一次真正需要设备的调用(这里是第一个 cudaMalloc)时才完成初始化并创建上下文,所以这段耗时主要是初始化时间,而不是分配显存本身的时间。

谢谢!有什么办法能减少耗时吗?

我也遇到同样的问题,而且申请大块内存消耗的时间会更长
估计没什么太好的办法