host device 三维数组数据传递 cudaMalloc3D


#include <stdio.h>
#include <cutil.h>
#include <cuda_runtime.h>

#define Width  2
#define Height 2
#define Depth  2
int bmp1[Width][Height][Depth]={11,22,33,44,55,66,77,88};
int bmp2[Width][Height][Depth];

int main(){

   cudaPitchedPtr devPitchedPtr;
   cudaExtent extent = make_cudaExtent(2, 2, 2);
   cudaMalloc3D(&devPitchedPtr, extent);
   cudaMemset3D( devPitchedPtr, 0, extent);


   cudaError status;
   cudaMemcpy3DParms HostToDev = { 0 }; 
   HostToDev.srcPtr = make_cudaPitchedPtr((void*)bmp1, sizeof(int), 2, 2); 
   HostToDev.dstPtr = devPitchedPtr; 
   HostToDev.extent = extent; 
   HostToDev.kind   = cudaMemcpyHostToDevice; 
   status = cudaMemcpy3D(&HostToDev);
   if(status != cudaSuccess){
   fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status));
   }

   cudaMemcpy3DParms DevToHost = {0};
   DevToHost.srcPtr = devPitchedPtr;
   DevToHost.dstPtr = make_cudaPitchedPtr((void*)bmp2, sizeof(int), 2, 2); 
   DevToHost.extent = extent;
   DevToHost.kind   = cudaMemcpyDeviceToHost;
   status = cudaMemcpy3D(&DevToHost);
   if(status != cudaSuccess){
   fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status));
   }
   cudaFree(&devPitchedPtr);
   
   int i, j, k;
   for(i=0; i<2; i++)
   for(j=0; j<2; j++)
   for(k=0; k<2; k++)
   printf("bmp2[%d][%d][%d]=%d\n", i, j, k, bmp2[i][j][k]);

   return 0;
}
问题1:cudaError_t cudaMalloc3D (struct cudaPitchedPtr *  pitchedDevPtr,   struct cudaExtent  extent)
在设备上分配至少width * height * depth bytes的内存,  返回的pitchedDevPtr包含有xsize和ysize,也就是extent中的width和height,请问对于没有出现的depth怎么理解?

问题2:结果为
bmp2[0][0][0]=11
bmp2[0][0][1]=22
bmp2[0][1][0]=33
bmp2[0][1][1]=44
bmp2[1][0][0]=0
bmp2[1][0][1]=0
bmp2[1][1][0]=0
bmp2[1][1][1]=0
请按任意键继续. . .

按自己的理解结果应该为
bmp2[0][0][0]=11
bmp2[0][0][1]=22
bmp2[0][1][0]=33
bmp2[0][1][1]=44
bmp2[1][0][0]=55
bmp2[1][0][1]=66
bmp2[1][1][0]=77
bmp2[1][1][1]=88
知道问题出在make_cudaPitchedPtr的内存对齐上,对这点不是很理解,请大家帮忙指导一下,谢谢。



如果是3维数据,那么数据的寻址方式是:
Host Code:


cudaPitchedPtr data;
extent = make_cudaExtent(width* sizeof(float4), height, depth);
cudaMalloc3D(&(d_data), extent);

Kernel Code:

global kernel(char* data, size_t pitch, …) {


float4 element = ((float4) (data + (xsizeof(float4) + ypitch + zpitchheight)));

}

也就是pitch 指的是实际分配的图像宽,它一般是大于原始的图像宽的

没看懂。不好意思