意思是这样,但是不能这样写。一般是先定义指针并为它开辟空间,然后再把它赋值给上一层结构体中对应的指针成员。
由于设计上的原因,我这里需要在GPU上开辟 ParticleSize 个 Net 类型的空间,指向这块空间的指针就是我要传递给 Kernel 函数的参数。
// Device array of ParticleSize Net objects (passed to the kernel) and the
// host staging array used to build its contents before upload.
Net *dev_net = NULL;
Net *my_net = NULL;

result = cudaMalloc((void **)&dev_net, ParticleSize * sizeof(Net));
if (result != cudaSuccess)
    return false;

my_net = (Net *)malloc(ParticleSize * sizeof(Net));
if (my_net == NULL) {
    // The staging array is dereferenced right after this point, so a failed
    // host allocation must abort; release the device buffer acquired above.
    cudaFree(dev_net);
    return false;
}
但是我的设计中Net结构是这样的:
// One layer of the network.
struct Layer
{
    int     Cell;       // number of cells (neurons) in this layer
    float **Weight;     // table of per-row weight pointers
    float  *Threshold;  // threshold values for this layer
};

// A network: a pointer to its array of layers.
// ANNet is a global instance declared together with the type.
struct Net
{
    Layer *NetLayer;
} ANNet;
结构体下面嵌套的指针太多了。所以在用 my_net 来初始化 dev_net 的时候,首先要保证 my_net 下面的指针都是指向GPU的;不然的话,dev_net 下面的指针仍然是指向CPU的,在 Kernel 函数中就会出现 dev_net 内存无法读取的错误。
我最终的设计是换成了一维数组,使用下标偏移来控制。
// Give every host-side Net a layer array that already lives on the device,
// so that the pointers copied into dev_net are valid inside the kernel.
for (int i = 0; i < ParticleSize; i++) {
    my_net[i].NetLayer = GenerateCUDALayer();
    if (my_net[i].NetLayer == NULL) {
        // GenerateCUDALayer reports failure with a null pointer; uploading it
        // would make the kernel dereference null. Layer arrays created in
        // earlier iterations are not released here.
        free(my_net);
        cudaFree(dev_net);
        return false;
    }
}

// Upload the Net array itself; only the struct contents are copied, the
// nested pointers already hold device addresses.
result = cudaMemcpy(dev_net, my_net, ParticleSize * sizeof(Net), cudaMemcpyHostToDevice);
if (result != cudaSuccess) {
    free(my_net);
    cudaFree(dev_net);
    return false;
}
其中 GenerateCUDALayer 的设计如下(供参考):
// Allocates a rows x cols float matrix on the device.
//
// The matrix is stored as one contiguous device buffer of rows*cols floats
// plus a device-resident table of `rows` pointers, where entry r points at
// element r*cols of the buffer. The buffer is filled with 0, 1, 2, ... as
// placeholder test data.
//
// On success returns true and stores the device row table in *dev_rows and
// the device data buffer in *dev_data. On failure returns false, releases
// everything it allocated and leaves both outputs untouched.
static bool AllocDeviceMatrix(int rows, int cols, float ***dev_rows, float **dev_data)
{
    const size_t count = (size_t)rows * (size_t)cols;
    float **table = NULL;   // device: row pointer table
    float *data = NULL;     // device: contiguous element storage

    // Host staging copies; both are released before returning.
    float *host_data = (float *)malloc(count * sizeof(float));
    float **host_table = (float **)malloc((size_t)rows * sizeof(float *));
    bool ok = (host_data != NULL && host_table != NULL);

    if (ok)
        ok = (cudaMalloc((void **)&table, (size_t)rows * sizeof(float *)) == cudaSuccess);
    if (ok)
        ok = (cudaMalloc((void **)&data, count * sizeof(float)) == cudaSuccess);
    if (ok) {
        for (size_t i = 0; i < count; i++)
            host_data[i] = (float)i;    // test data
        ok = (cudaMemcpy(data, host_data, count * sizeof(float),
                         cudaMemcpyHostToDevice) == cudaSuccess);
    }
    if (ok) {
        // The table is built on the host but holds device addresses, so it
        // is only meaningful after being copied to the device.
        for (int r = 0; r < rows; r++)
            host_table[r] = data + (size_t)r * (size_t)cols;
        ok = (cudaMemcpy(table, host_table, (size_t)rows * sizeof(float *),
                         cudaMemcpyHostToDevice) == cudaSuccess);
    }

    free(host_table);
    free(host_data);

    if (!ok) {
        cudaFree(data);     // cudaFree(NULL) is a no-op
        cudaFree(table);
        return false;
    }
    *dev_rows = table;
    *dev_data = data;
    return true;
}

// Builds a Layer array in device global memory and returns a host-held
// pointer to it. Every pointer stored inside the returned array (Weight,
// the row pointers behind Weight, Threshold) is a device address, so the
// result can be placed in a Net that is copied to the device.
//
// Layers described (indices into the returned array):
//   [0] input  layer: Cell = IN, weights HN x IN, HN thresholds
//   [1] hidden layer: Cell = HN, weights HN x IN, HN thresholds
//   [2] output layer: Cell = ON, weights ON x HN, ON thresholds
// NOTE(review): the hidden-layer shape follows the row stride (IN) used by
// the original code; confirm it is not meant to be HN x HN.
// NOTE(review): layer [0] now receives its own weight/threshold buffers;
// previously those two fields were left unassigned - confirm this is the
// intended wiring.
//
// Weights are filled with placeholder test data; threshold buffers are
// allocated but not initialised. Entries past index 2 (when Layer_Num > 3)
// are zero-filled.
//
// Returns NULL on any allocation or copy failure, after releasing everything
// allocated so far. On success the caller owns all device allocations; the
// nested buffers are reachable only through the device-side array.
Layer* GenerateCUDALayer()
{
    enum { kDescribedLayers = 3 };
    const int cells[kDescribedLayers] = { IN, HN, ON };
    const int rows[kDescribedLayers]  = { HN, HN, ON };
    const int cols[kDescribedLayers]  = { IN, IN, HN };

    float **dev_wt[kDescribedLayers]     = { NULL, NULL, NULL };  // device row tables
    float *dev_wt_data[kDescribedLayers] = { NULL, NULL, NULL };  // device weight storage
    float *dev_th[kDescribedLayers]      = { NULL, NULL, NULL };  // device thresholds

    Layer *l = NULL;        // host staging copy of the layer array
    Layer *dev_l = NULL;    // device layer array (the return value)

    // The code below writes l[0..2]; a smaller array would overflow.
    if (Layer_Num < kDescribedLayers)
        return NULL;

    bool ok = true;

    // Allocate the device-side weights and thresholds of each layer.
    for (int k = 0; ok && k < kDescribedLayers; k++) {
        ok = AllocDeviceMatrix(rows[k], cols[k], &dev_wt[k], &dev_wt_data[k]);
        if (ok) {
            float *th = NULL;
            if (cudaMalloc((void **)&th, (size_t)rows[k] * sizeof(float)) == cudaSuccess)
                dev_th[k] = th;
            else
                ok = false;
        }
    }

    // Describe the layers on the host using device pointers only ...
    if (ok) {
        l = (Layer *)calloc(Layer_Num, sizeof(Layer));
        ok = (l != NULL);
    }
    if (ok) {
        for (int k = 0; k < kDescribedLayers; k++) {
            l[k].Cell = cells[k];
            l[k].Weight = dev_wt[k];
            l[k].Threshold = dev_th[k];
        }
        // ... then move the description itself to the device.
        ok = (cudaMalloc((void **)&dev_l, Layer_Num * sizeof(Layer)) == cudaSuccess);
    }
    if (ok)
        ok = (cudaMemcpy(dev_l, l, Layer_Num * sizeof(Layer),
                         cudaMemcpyHostToDevice) == cudaSuccess);

    free(l);    // the host copy is no longer needed once uploaded

    if (!ok) {
        cudaFree(dev_l);
        for (int k = 0; k < kDescribedLayers; k++) {
            cudaFree(dev_th[k]);
            cudaFree(dev_wt_data[k]);
            cudaFree(dev_wt[k]);
        }
        return NULL;
    }
    return dev_l;
}
木有办法呀,代码有点乱,我也调不好呀,凑合理解一下就好了