因为自己要做的东西需要结合其他库,所以写了一段模拟整个流程的简单代码。代码的运行结果应该是我想要的,但是单步逐过程调试时却感觉很诡异:执行位置会莫名其妙地在某两个相距很远的代码行之间来回跳跃(下面代码中标红的代码行)。不知道这是什么原因?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Target GPU: NVIDIA NVS 300 (compute capability 1.2).
// On compute capability 1.2, use float data and the integer atomicAdd(int*, int).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define pi 3.141593
#define NTHREAD 16
// Test kernel that stands in for the real computation: it only counts visits.
//
// Every thread runs 8 passes; in each pass it walks all (row, col) pairs of an
// Irow x Icol grid and atomically adds 1 to one element of Ibuffer per pair.
// After the kernel, each of the Irow*Icol elements has been incremented
// 8 * (number of launched threads) times.
//
// Ibuffer   : device int array with at least Irow*Icol elements (read-modify-write).
// Irow/Icol : grid dimensions used for the traversal and the index mapping.
// TransSens, TransPos, ReceiveData, Zstart, PdeltaX, PdeltaZ, recRow are
// accepted but not used by this body.
//
// Only threadIdx.x is consulted, so the pass range does not depend on the block.
__global__ void loopCalc_Kernel(int *Ibuffer, const float *TransSens, const float *TransPos, float *ReceiveData,
                                int Irow, int Icol, float Zstart, float PdeltaX, float PdeltaZ, int recRow)
{
    const int tid = threadIdx.x;

    // This thread owns the pass numbers [8*tid, 8*(tid+1)).
    int pass = 8 * tid;
    const int passEnd = 8 * (tid + 1);

    while (pass < passEnd)
    {
        for (int row = 0; row < Irow; ++row) // 374 in the sample run
        {
            for (int col = 0; col < Icol; ++col) // 124 in the sample run
            {
                // Linear index of (row, col) with Icol elements per row ...
                const int linear = row * Icol + col;
                // ... remapped so that the column part selects a stride of Irow.
                const int target = (linear % Icol) * Irow + linear / Icol;
                // Several threads hit the same element, hence the atomic update.
                atomicAdd(Ibuffer + target, 1);
            }
        }
        ++pass;
    }

    // Block-wide barrier before the threads finish.
    __syncthreads();
}
// Aborts the program with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// malloc wrapper that aborts the program instead of returning NULL.
static void *checkedMalloc(size_t bytes)
{
    void *p = malloc(bytes);
    if (p == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return p;
}

// Host driver that simulates the whole processing flow:
//   1. build double-precision input arrays on the host,
//   2. convert them to float and upload them to the device,
//   3. run loopCalc_Kernel with one block of NTHREAD threads,
//   4. copy the int result buffer back, sum it and print the sum.
// Any failing CUDA call or host allocation terminates the program with a message.
int main()
{
    checkCuda(cudaSetDevice(0), "cudaSetDevice"); // initialise device 0

    const int recRow = 2792;
    const int recCol = 128;
    const int numTransPos = 128;
    const int numTransSens = 101;
    const int Irow = 374, Icol = 124;
    // float literals: avoid an implicit double -> float conversion.
    const float PdeltaX = 0.9675f, PdeltaZ = 0.5f, Zstart = 5.0f;

    const size_t recCount = (size_t)recRow * recCol;
    const size_t imgCount = (size_t)Irow * Icol;

    // Double-precision inputs, filled with sample values.
    double *ReceiveData = (double*)checkedMalloc(recCount * sizeof(double));
    for (size_t i = 0; i < recCount; i++)
    {
        ReceiveData[i] = 1.0;
    }
    double *TransPos = (double*)checkedMalloc(numTransPos * sizeof(double));
    for (int i = 0; i < numTransPos; i++)
    {
        TransPos[i] = -64.0 + i;
    }
    double *TransSens = (double*)checkedMalloc(numTransSens * sizeof(double));
    for (int i = 0; i < numTransSens; i++)
    {
        TransSens[i] = 1.0;
    }

    // The GPU may only support float. This host buffer receives the result
    // that the kernel computes in device memory.
    int *Ibuffer = (int*)checkedMalloc(imgCount * sizeof(int));

    // The inputs are double: convert them to float in host memory first, then
    // copy the converted arrays to device memory.
    float *iTransPosTMP = (float*)checkedMalloc(numTransPos * sizeof(float));
    for (int i = 0; i < numTransPos; i++)
    {
        iTransPosTMP[i] = float(TransPos[i]);
    }
    float *iTransSensTMP = (float*)checkedMalloc(numTransSens * sizeof(float));
    for (int i = 0; i < numTransSens; i++)
    {
        iTransSensTMP[i] = float(TransSens[i]);
    }
    float *iReceiveDataTMP = (float*)checkedMalloc(recCount * sizeof(float));
    for (size_t i = 0; i < recCount; i++)
    {
        iReceiveDataTMP[i] = float(ReceiveData[i]);
    }

    // The double-precision originals are no longer needed after conversion.
    free(ReceiveData);
    free(TransPos);
    free(TransSens);

    // Device buffers.
    float *iTransPos = 0, *iTransSens = 0;
    float *iReceiveData = 0;
    int *iIbuffer = 0;
    checkCuda(cudaMalloc((void**)&iTransPos, numTransPos * sizeof(float)), "cudaMalloc(iTransPos)");
    checkCuda(cudaMalloc((void**)&iTransSens, numTransSens * sizeof(float)), "cudaMalloc(iTransSens)");
    checkCuda(cudaMalloc((void**)&iReceiveData, recCount * sizeof(float)), "cudaMalloc(iReceiveData)");
    checkCuda(cudaMalloc((void**)&iIbuffer, imgCount * sizeof(int)), "cudaMalloc(iIbuffer)");
    // The kernel accumulates with atomicAdd, so the output must start at zero.
    checkCuda(cudaMemset(iIbuffer, 0, imgCount * sizeof(int)), "cudaMemset(iIbuffer)");

    // Copy host memory to device memory.
    checkCuda(cudaMemcpy(iTransPos, iTransPosTMP, numTransPos * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy(iTransPos)");
    checkCuda(cudaMemcpy(iTransSens, iTransSensTMP, numTransSens * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy(iTransSens)");
    // The byte count must use sizeof(float): the buffer holds floats, and a
    // sizeof(short) count would upload only half of it.
    checkCuda(cudaMemcpy(iReceiveData, iReceiveDataTMP, recCount * sizeof(float), cudaMemcpyHostToDevice),
              "cudaMemcpy(iReceiveData)");

    // Launch configuration: a single block of NTHREAD threads.
    dim3 grid(1, 1, 1);
    dim3 threads(NTHREAD, 1, 1);

    // Run the kernel.
    loopCalc_Kernel<<<grid, threads>>>(iIbuffer, iTransSens, iTransPos, iReceiveData,
                                       Irow, Icol, Zstart, PdeltaX, PdeltaZ, recRow);
    checkCuda(cudaGetLastError(), "loopCalc_Kernel launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "loopCalc_Kernel execution");  // faults raised while running

    // Copy the result from device memory back to host memory.
    checkCuda(cudaMemcpy(Ibuffer, iIbuffer, imgCount * sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy(Ibuffer)");

    int rst = 0;
    for (size_t i = 0; i < imgCount; i++)
    {
        rst += Ibuffer[i];
    }
    printf("%d\n", rst);

    free(Ibuffer);
    free(iTransPosTMP);
    free(iTransSensTMP);
    free(iReceiveDataTMP);

    // Release device memory.
    checkCuda(cudaFree(iTransPos), "cudaFree(iTransPos)");
    checkCuda(cudaFree(iTransSens), "cudaFree(iTransSens)");
    checkCuda(cudaFree(iReceiveData), "cudaFree(iReceiveData)");
    checkCuda(cudaFree(iIbuffer), "cudaFree(iIbuffer)");
    return 0;
}