cudaHostAlloc

system · 2013 年4 月 23 日 06:01

在Nsight for Eclipse 使用 cudaHostAlloc就是不能识别
cudaHostAlloc ((void **)&a,3,cudaHostAllocMapped)

size_t size =3
cudaHostAlloc ((void **)&a,size,cudaHostAllocMapped)

cudaHostAlloc (a,3,cudaHostAllocMapped)

都不能识别

直接用nvcc编译可以通过，只是随后的zero copy没有拷贝进去任何内容，而用cuda-memcheck a.out也查不出任何内容。

system · 2013 年4 月 23 日 07:01

size好像应该按字节数算，你试试size=3*sizeof(a的数据类型)行不。

system · 2013 年4 月 23 日 07:17

换成sizeof(int)也不行。我的程序是这样的，有没有高手帮忙看看能不能调通？

/**

Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
Please refer to the NVIDIA end user license agreement (EULA) associated
with this source code for terms and conditions that govern your use of
this software. Any use, reproduction, disclosure, or distribution of
this software and related documentation outside the terms of the EULA
is strictly prohibited.
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// Element count used as the bound in the vector-add check (tid < N).
#define N 3

/*
 * Debug kernel: every thread prints its flat global thread index.
 *
 * Launched as add<<<B,T>>>(a, b, c, dev_a, dev_b, dev_c) with six int
 * pointers. While the assignments below stay commented out, none of the
 * pointer arguments is dereferenced.
 */
__global__ void add(int *a, int *b, int *c, int *dev_a, int *dev_b, int *dev_c) {
    // Flat 1D index: block offset plus the thread's position inside the block.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    printf("%d \n", tid);
    /*
    *dev_a = *a;
    *dev_b = *b;
    *dev_c = *c;
    printf(" tid : %d a : %d b: %d c: %d \n",tid,*a,*b,*c);
    */
}
/*
__global__ void add(int *a,int *b,int *c){
int tid = blockIdx.x*blockDim.x +threadIdx.x;
printf(" tid : %d a : %d b: %d c: %d \n",tid,*a,*b,*c);
// if(tid <N) c[tid]= a[tid]+b[tid];
}
*/

/**

Host function that prepares data array and passes it to the CUDA kernel.
*/
int main(void) {
    int T = 32, B = 1;  // launch configuration: B blocks of T threads
    // Heap-allocated ints; these pointers are overwritten by cudaHostAlloc below.
    int *a = new int(1);
    int *b = new int(2);
    int *c = new int(3);
    int *dev_a, *dev_b, *dev_c;

    cudaEvent_t start, stop;
    float elapsed_time_ms;

    size_t size = 3;
    *a = 1;
    *b = 2;
    *c = 3;

    // Request mapped host allocations: 3*sizeof(int) bytes for a,
    // size*sizeof(int) bytes for b, and size bytes for c.
    //cudaHostAlloc( (void**)&a, size, cudaHostAllocMapped || cudaHostAllocWriteCombined );
    cudaHostAlloc( &a, 3*sizeof(int), cudaHostAllocMapped); // word checking
    printf("size : %d ",size);
    // cudaHostAlloc( (void**)&b, size*sizeof(int), cudaHostAllocMapped || cudaHostAllocWriteCombined );
    cudaHostAlloc( &b, size*sizeof(int), cudaHostAllocMapped);
    cudaHostAlloc(&c, size, cudaHostAllocMapped );

    // load arrays with some numbers

    // Ask for the device-side pointer of each host buffer, synchronizing and
    // reporting any error after every step.
    cudaHostGetDevicePointer(&dev_a, a, 0); // mem. copy to device not need now, but ptrs needed instead
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 1 \"%s\".\n",cudaGetErrorString(cudaerr));

    cudaHostGetDevicePointer(&dev_b, b, 0);
    cudaerr = cudaDeviceSynchronize();
    if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 2 \"%s\".\n",cudaGetErrorString(cudaerr));

    cudaHostGetDevicePointer(&dev_c ,c, 0);
    cudaerr = cudaDeviceSynchronize();
    if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 3\"%s\".\n",cudaGetErrorString(cudaerr));

    // start time
    cudaEventCreate(&start);
    // The kernel receives the host pointers a,b,c followed by dev_a,dev_b,dev_c.
    add<<<B,T>>>(a,b,c,dev_a,dev_b,dev_c);
    cudaerr = cudaDeviceSynchronize();
    if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 4\"%s\".\n",cudaGetErrorString(cudaerr));

    /*
    cudaThreadSynchronize(); // copy back not needed but now need thread synchronization
    cudaEventCreate(&stop);
    // end time
    // print results
    cudaEventElapsedTime(&elapsed_time_ms,start,stop);

    printf("Time to calculate results: %f ms.\n", elapsed_time_ms); // print out execution time

    cudaFreeHost(a); // clean up
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    */

    return 0;
}

system · 2013 年4 月 23 日 07:19

楼主的“不能识别”存在歧义：
（1）如果是eclipse的IDE智能感知无法提示cudaHostAlloc()的原型信息，我建议楼主到eclipse的开发组询问此问题。
（2）如果是“随后的zero copy”不能成功（例如在kernel中读取host mapped memory中的内容失败），则请注意：

(a)请检查是否存在2楼建议的问题。
(b)您的程序开头是否调用了cudaSetDeviceFlags函数，并以cudaDeviceMapHost为参数（或作为用|组合的多个参数之一）。
(c)请检查是否直接使用了cudaHostAlloc返回的指针，而忘记使用cudaHostGetDevicePointer()来获取对应的设备指针。
(d)请检查您是否在kernel中使用的是HostGetDevicePointer出来的对应设备指针。

注意：如果您的卡支持并启用了unified addressing, 则您可以直接使用。无需单独获取设备指针。

注意：以上回答是对您的“cudaHostAlloc就是不能识别”进行推断出来的您的意思，并接着回答的。
如果您的意思不是我上文列出的2项，建议重新正确表达您的意图。（您也可以不表达，只是建议）

感谢您的来访。

system · 2013 年4 月 23 日 07:26

您好
我把程序写在上面了，问题不在您给的建议之内，遇到的问题的确比较棘手。
用lspci -v 命令 NVIDIA Corporation Device 1140
驱动已经安装是Geforce 630系列

system · 2013 年4 月 23 日 07:31

您的问题（或者您存在多个问题之一）在我的建议范围内，

请看我的4#回复的(b)条：请使用cudaSetDeviceFlags(cudaDeviceMapHost)在贵main()函数体的开头。

您可以再次忽略此建议，我将按照您的要求，不再建议。

以及，如果您能接受此建议，我将不胜荣幸。
以及，请修改后重新汇报问题是否消失。本回答指出了一个问题，但并不排除还有更多其他的问题。

system · 2013 年4 月 23 日 08:34

但我把主程序改成这样还是有问题
// Revised body of main(): cudaSetDeviceFlags(cudaDeviceMapHost) is now called
// before the mapped host allocations.
int T = 32, B = 1;  // launch configuration: B blocks of T threads
// Heap-allocated ints; these pointers are overwritten by cudaHostAlloc below.
int *a = new int(1);
int *b = new int(2);
int *c = new int(3);
int *dev_a, *dev_b, *dev_c;

cudaEvent_t start, stop;
float elapsed_time_ms;

size_t size = 3;
*a = 1;
*b = 2;
*c = 3;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaError_t cudaerr = cudaDeviceSynchronize();
if (cudaerr != CUDA_SUCCESS)
{
    int a = 0;
    printf("Kernel launch filed with error 1 \"%s\".\n",cudaGetErrorString(cudaerr));
}

// Request mapped host allocations: 3*sizeof(int) bytes for a, size bytes
// for b, and size*sizeof(int) bytes for c.
//cudaHostAlloc( (void**)&a, size, cudaHostAllocMapped || cudaHostAllocWriteCombined );
cudaHostAlloc( &a, 3*sizeof(int), cudaHostAllocMapped); // word checking
printf("size : %d ",size);
// cudaHostAlloc( (void**)&b, size*sizeof(int), cudaHostAllocMapped || cudaHostAllocWriteCombined );
cudaHostAlloc( &b, size, cudaHostAllocMapped);
cudaHostAlloc(&c, size*sizeof(int), cudaHostAllocMapped );

// load arrays with some numbers

// Ask for the device-side pointer of each host buffer, synchronizing and
// reporting any error after every step.
cudaHostGetDevicePointer((void **)&dev_a, (void *)a, 0); // mem. copy to device not need now, but ptrs needed instead
cudaerr = cudaDeviceSynchronize();
if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 1 \"%s\".\n",cudaGetErrorString(cudaerr));

cudaHostGetDevicePointer((void **)&dev_b, (void *)b, 0);
cudaerr = cudaDeviceSynchronize();
if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 2 \"%s\".\n",cudaGetErrorString(cudaerr));

cudaHostGetDevicePointer((void **)&dev_c ,(void *)c, 0);
cudaerr = cudaDeviceSynchronize();
if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 3\"%s\".\n",cudaGetErrorString(cudaerr));

// start time
cudaEventCreate(&start);
// The kernel receives the host pointers a,b,c followed by dev_a,dev_b,dev_c.
add<<<B,T>>>(a,b,c,dev_a,dev_b,dev_c);
cudaerr = cudaDeviceSynchronize();
if (cudaerr != CUDA_SUCCESS)printf("Kernel launch filed with error 4\"%s\".\n",cudaGetErrorString(cudaerr));

system · 2013 年4 月 23 日 09:58

楼主的逻辑上已经大致无问题了，
先设置flags,
再进行带cudaHostAllocMapped参数的Host Allocation,
最后使用HostGetDevicePointer,

逻辑上这里是无问题，那么问题可能出在别的地方。
例如你的add kernel(你没有给出), 是否正确？

我看到它使用了3个缓冲区，分别是3 * sizeof(int)大小(缓冲区a), 3 Bytes(缓冲区b), 3 * sizeof(int)大小(缓冲区c), 如果您计算的是3个元素的向量相加。这里应该是不对的。
这是其一。

其二，楼主先对host上的指针a,b,c分配空间，并初始化为1,2,3; 然后再次赋值为1,2,3; 最后又使用cudaHostAlloc再次分配，丢弃了原来分配的三块空间，是否符合原意？

其三，楼主接受了对最后一次分配后的a,b,c取得device pointer。但却依然使用了：
<<<>>>(a,b,c,dev_a,dev_b,dev_c) （其中前3者是后3者的host pointer, 不考虑unified addressing的话）。
是否这里违背了您的原意？

请三思。

system · 2013 年4 月 23 日 09:58

楼主的问题多多，但不妨先考虑上面的三个问题。先改正了再说。

system · 2013 年4 月 28 日 05:20

谢谢问题已解决

system · 2013 年4 月 28 日 07:27

您客气了，服务您是我们的荣幸。