// Drives the per-level feature extraction for the pyramid.
//
// For each of the first five levels one cuda_features_4bin call fills two
// entries (feat[base] and feat[base + interval]); the remaining entries are
// then filled by cuda_features_8bin, stepping five levels at a time while the
// level index stays below 19.
//
// The parameter list is elided in this listing; `feat` and `interval` are
// declared outside the visible code. The original comment states that
// interval is 5 -- with that value the slots written are 0..23.
void calcfeatpyramid_fast(…)
{
    for (int base = 0; base < 5; ++base)  // original note: interval = 5
    {
        // One call produces both feat[base] and feat[base + interval].
        cuda_features_4bin(feat[base], feat[base + interval], base, base + 5);

        // Walk the coarser levels base+5, base+10, ... that are below 19.
        int level = base + 5;
        while (level < 19)
        {
            cuda_features_8bin(feat[level + interval], level + interval);
            level += 5;
        }
    }
}
// Builds two feature maps in one call and keeps both device copies around.
//
//   feat      - receives a freshly new[]-allocated host buffer in feat.ft,
//               filled from dev_MatFeat[num4].
//   feat8bin  - receives a freshly new[]-allocated host buffer in
//               feat8bin.ft, filled from dev_MatFeat[num8].
//   num4/num8 - slots of the global dev_MatFeat table that receive the two
//               device buffers.
//
// Each buffer holds out0 * out1 * 32 floats. The device buffers are not
// freed in the visible code; cuda_padarray later reads and frees
// dev_MatFeat[t]. Ownership of the host buffers passes to the caller.
//
// The sections marked with an ellipsis were elided in the original listing;
// out0, out1, dev_hist, dev_norm, dev_hist8bins and dev_norm8bins are
// presumably set up there or at file scope -- TODO confirm.
//
// NOTE(review): no CUDA return code is checked and the kernel launches are
// not followed by cudaGetLastError(); a failed allocation or launch would
// silently leave zeros or stale data in the output.
extern "C" void cuda_features_4bin(_feat& feat,_feat& feat8bin,int num4,int num8)
{
    // (code elided in the original listing)
    …

    // Size of the first feature map, computed once instead of being spelled
    // out at every call site.
    const size_t count4 = (size_t)out0 * out1 * 32;
    const size_t bytes4 = count4 * sizeof(float);

    feat.ft = new float[count4];
    memset(feat.ft, 0, bytes4);
    cudaMalloc((void **)&dev_MatFeat[num4], bytes4);
    cudaMemset(dev_MatFeat[num4], 0, bytes4);
    // Launch shape: out1 blocks of out0 threads.
    histnorm2feat<<<out1, out0>>>(dev_hist, dev_norm, dev_MatFeat[num4]);
    cudaMemcpy(feat.ft, dev_MatFeat[num4], bytes4, cudaMemcpyDeviceToHost);

    // (code elided in the original listing)
    …

    // Recomputed here because the elided section above may change out0/out1.
    const size_t count8 = (size_t)out0 * out1 * 32;
    const size_t bytes8 = count8 * sizeof(float);

    feat8bin.ft = new float[count8];
    memset(feat8bin.ft, 0, bytes8);
    cudaMalloc((void **)&dev_MatFeat[num8], bytes8);
    cudaMemset(dev_MatFeat[num8], 0, bytes8);
    histnorm2feat<<<out1, out0>>>(dev_hist8bins, dev_norm8bins, dev_MatFeat[num8]);
    cudaMemcpy(feat8bin.ft, dev_MatFeat[num8], bytes8, cudaMemcpyDeviceToHost);
}
// Builds one feature map and keeps its device copy in dev_MatFeat[num].
//
//   feat - receives a freshly new[]-allocated host buffer in feat.ft holding
//          out0 * out1 * 32 floats copied back from the device.
//   num  - slot of the global dev_MatFeat table that receives the device
//          buffer.
//
// The device buffer is not freed here; cuda_padarray later reads and frees
// dev_MatFeat[t]. Ownership of feat.ft passes to the caller.
//
// out0, out1, dev_hist and dev_norm are not declared in the visible code --
// presumably file-scope state prepared by an earlier step; TODO confirm.
//
// NOTE(review): no CUDA return code is checked and the kernel launch is not
// followed by cudaGetLastError().
extern "C" void cuda_features_8bin(_feat& feat,int num)
{
    // Element and byte counts of the feature map, computed once.
    const size_t count = (size_t)out0 * out1 * 32;
    const size_t bytes = count * sizeof(float);

    feat.ft = new float[count];
    memset(feat.ft, 0, bytes);
    cudaMalloc((void **)&dev_MatFeat[num], bytes);
    cudaMemset(dev_MatFeat[num], 0, bytes);
    // Launch shape: out1 blocks of out0 threads. (original note: 0.6)
    histnorm2feat<<<out1, out0>>>(dev_hist, dev_norm, dev_MatFeat[num]);
    cudaMemcpy(feat.ft, dev_MatFeat[num], bytes, cudaMemcpyDeviceToHost);
}
// Pads each of the 24 feature maps on the device and copies the padded
// result back to the host.
//
// For every level t the function allocates a host buffer (dstfeat[t].ft) and
// a device buffer (dev_MatDst[t]) sized from dstfeat[t].size[0..2], runs the
// padarray kernel from dev_MatFeat[t] into dev_MatDst[t], copies the result
// to the host and frees dev_MatFeat[t].
//
// The parameter list is elided in this listing; dstfeat / dst, padx, pady and
// blockFeat are declared outside the visible code.
//
// NOTE(review): the listing spells the destination both as `dstfeat[t]` and
// as `dst.feat[t]`; both spellings are kept exactly as posted because the
// signature is not visible. Confirm they refer to the same object.
// NOTE(review): dev_MatDst[t] is never freed in this function -- confirm it
// is released elsewhere, otherwise each call leaks 24 device buffers.
// NOTE(review): no CUDA return code is checked and the launch is not followed
// by cudaGetLastError().
extern "C" void cuda_padarray(…)
{
    for (int t = 0; t < 24; t++)
    {
        // Size of the padded map, computed once per level.
        const size_t dstCount = (size_t)dstfeat[t].size[0] * dstfeat[t].size[1] * dstfeat[t].size[2];
        const size_t dstBytes = dstCount * sizeof(float);

        dstfeat[t].ft = new float[dstCount];
        memset(dst.feat[t].ft, 0, sizeof(float)*dst.feat[t].size[0]*dst.feat[t].size[1]*dst.feat[t].size[2]);
        cudaMalloc((void **)&dev_MatDst[t], dstBytes);
        cudaMemset(dev_MatDst[t], 0, dstBytes);

        // Disabled upload of the host-side source features. With it disabled
        // the kernel reads whatever the feature-extraction step left in
        // dev_MatFeat[t].
        // cudaMemcpy(dev_MatFeat[t],src.feat[t].ft,src.feat[t].size[0]*src.feat[t].size[1]*src.feat[t].size[2]*4,
        //cudaMemcpyHostToDevice);

        // Launch shape: blockFeat blocks of dst.feat[t].size[0] threads.
        padarray<<< blockFeat, dst.feat[t].size[0]>>>(dev_MatFeat[t], dev_MatDst[t], padx, pady);
        cudaMemcpy(dstfeat[t].ft, dev_MatDst[t], dstBytes, cudaMemcpyDeviceToHost);

        // The unpadded device features are no longer needed after this level.
        cudaFree(dev_MatFeat[t]);
    }
}
好吧,我把主要代码都贴出来了。
// File-scope tables of device pointers, 30 slots each.
// dev_MatFeat[k] holds the device-side feature map for level k.
float* dev_MatFeat[30];
// dev_MatDst[k] holds the device-side padded output for level k.
float* dev_MatDst[30];
以上两个数组(dev_MatFeat 和 dev_MatDst)是全局变量。
cuda_padarray 中被注释的那部分(cudaMemcpy)去掉和没去掉,得到的结果不同。要是觉得代码太长就算了。srcfeat 是前面三个函数求得的结果 feat。