刚学cuda，问问调试的问题

system · 2013 年8 月 21 日 12:41

因为自己要做的东西要结合其他库，所以写了个模拟整个流程的简单代码。代码的运行结果应该是我想要的，但是单步逐过程调试时却感觉很诡异：调试器莫名其妙地在某两个离得很远的行之间来回跳跃（下面代码中的红色代码行）。不知何解？

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

//nvidia nvs 300
//float 1.2计算能力 atomicAdd(int*,int)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define pi 3.141593
#define NTHREAD 16

// Test kernel that mimics the loop structure of the real computation.
//
// Every thread owns 8 consecutive outer indices (8*threadIdx.x .. 8*threadIdx.x+7).
// For each of them it walks the whole Irow x Icol grid and atomically adds 1 to
// the transposed cell (col*Irow + row) of Ibuffer. Only threadIdx.x is used, so
// blockIdx is ignored. Each cell therefore receives 8 increments per thread.
//
// Ibuffer       : device buffer with at least Irow*Icol ints; accumulated into,
//                 so the caller must zero it beforehand.
// Irow, Icol    : grid dimensions (374 and 124 in the sample main()).
// TransSens, TransPos, ReceiveData, Zstart, PdeltaX, PdeltaZ, recRow are not
// used by this placeholder body.
__global__ void loopCalc_Kernel(int *Ibuffer, const float *TransSens, const float *TransPos,float *ReceiveData,
								int Irow,int Icol,float Zstart,float PdeltaX,float PdeltaZ,int recRow)
{
	const int firstPass=8*threadIdx.x;
	const int lastPass=firstPass+8;

	for (int pass=firstPass;pass<lastPass;++pass)
	{
		for (int row=0;row<Irow;++row)
		{
			for (int col=0;col<Icol;++col)
			{
				// (row*Icol+col) re-indexed with rows and columns swapped.
				const int transposed=col*Irow+row;
				atomicAdd(&Ibuffer[transposed],1);
			}
		}
	}

	// Block-wide barrier kept from the original; nothing is read after it.
	__syncthreads();
}

// Evaluates a CUDA runtime call and terminates the program with a readable
// message if it did not return cudaSuccess. Exiting releases all host and
// device resources, so no explicit cleanup is attempted on the error path.
#define CUDA_CHECK(call)                                                       \
	do {                                                                       \
		cudaError_t err_ = (call);                                             \
		if (err_ != cudaSuccess) {                                             \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,      \
					cudaGetErrorString(err_));                                 \
			exit(EXIT_FAILURE);                                                \
		}                                                                      \
	} while (0)

// malloc() wrapper that terminates the program instead of returning NULL.
static void* checkedMalloc(size_t bytes)
{
	void* p = malloc(bytes);
	if (p == NULL)
	{
		fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
		exit(EXIT_FAILURE);
	}
	return p;
}

// Host driver: builds double-precision sample inputs, converts them to float,
// uploads them, runs loopCalc_Kernel with one block of NTHREAD threads, copies
// the integer result back and prints the sum of all its elements.
// Returns 0 on success; any CUDA or allocation failure exits with EXIT_FAILURE.
int main()
{
	CUDA_CHECK(cudaSetDevice(0)); // select device 0

	const int recRow = 2792;
	const int recCol = 128;
	const int numTransPos = 128;
	const int numTransSens = 101;
	const int Irow = 374, Icol = 124;
	const float PdeltaX = 0.9675f, PdeltaZ = 0.5f, Zstart = 5.0f;

	const int numReceiveElems = recRow * recCol;
	const int numImageElems = Irow * Icol;

	const size_t transPosBytes = numTransPos * sizeof(float);
	const size_t transSensBytes = numTransSens * sizeof(float);
	const size_t receiveBytes = numReceiveElems * sizeof(float);
	const size_t imageBytes = numImageElems * sizeof(int);

	// Double-precision sample inputs.
	double* ReceiveData = (double*)checkedMalloc(numReceiveElems * sizeof(double));
	for (int i = 0; i < numReceiveElems; i++)
	{
		ReceiveData[i] = 1.0;
	}
	double* TransPos = (double*)checkedMalloc(numTransPos * sizeof(double));
	for (int i = 0; i < numTransPos; i++)
	{
		TransPos[i] = -64.0 + i;
	}
	double* TransSens = (double*)checkedMalloc(numTransSens * sizeof(double));
	for (int i = 0; i < numTransSens; i++)
	{
		TransSens[i] = 1.0;
	}

	// Host buffer that receives the kernel's integer output.
	int* Ibuffer = (int*)checkedMalloc(imageBytes);

	// The kernel takes float inputs, so convert double -> float on the host
	// first and upload the float copies.
	float* iTransPosTMP = (float*)checkedMalloc(transPosBytes);
	for (int i = 0; i < numTransPos; i++)
	{
		iTransPosTMP[i] = float(TransPos[i]);
	}
	float* iTransSensTMP = (float*)checkedMalloc(transSensBytes);
	for (int i = 0; i < numTransSens; i++)
	{
		iTransSensTMP[i] = float(TransSens[i]);
	}
	float* iReceiveDataTMP = (float*)checkedMalloc(receiveBytes);
	for (int i = 0; i < numReceiveElems; i++)
	{
		iReceiveDataTMP[i] = float(ReceiveData[i]);
	}

	// The double-precision originals are no longer needed (previously leaked).
	free(ReceiveData);
	free(TransPos);
	free(TransSens);

	// Device buffers.
	float* iTransPos = 0;
	float* iTransSens = 0;
	float* iReceiveData = 0;
	int* iIbuffer = 0;

	CUDA_CHECK(cudaMalloc((void**)&iTransPos, transPosBytes));
	CUDA_CHECK(cudaMalloc((void**)&iTransSens, transSensBytes));
	CUDA_CHECK(cudaMalloc((void**)&iReceiveData, receiveBytes));
	CUDA_CHECK(cudaMalloc((void**)&iIbuffer, imageBytes));

	// The kernel accumulates with atomicAdd, so the output must start at zero.
	CUDA_CHECK(cudaMemset(iIbuffer, 0, imageBytes));

	// Host -> device uploads.
	CUDA_CHECK(cudaMemcpy(iTransPos, iTransPosTMP, transPosBytes, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(iTransSens, iTransSensTMP, transSensBytes, cudaMemcpyHostToDevice));
	// Fixed: the byte count used sizeof(short), which copied only half of the
	// float buffer and left the rest of iReceiveData uninitialised.
	CUDA_CHECK(cudaMemcpy(iReceiveData, iReceiveDataTMP, receiveBytes, cudaMemcpyHostToDevice));

	// Launch configuration: a single block of NTHREAD threads.
	dim3 grid(1, 1, 1);
	dim3 threads(NTHREAD, 1, 1);

	loopCalc_Kernel<<<grid, threads>>>(iIbuffer, iTransSens, iTransPos, iReceiveData,
									   Irow, Icol, Zstart, PdeltaX, PdeltaZ, recRow);
	CUDA_CHECK(cudaGetLastError());      // launch/configuration errors
	CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while the kernel ran
	                                     // (replaces deprecated cudaThreadSynchronize)

	// Device -> host download of the result.
	CUDA_CHECK(cudaMemcpy(Ibuffer, iIbuffer, imageBytes, cudaMemcpyDeviceToHost));

	// Each cell is incremented 8 times by each of the NTHREAD threads, so the
	// expected sum is Irow*Icol*NTHREAD*8.
	int rst = 0;
	for (int i = 0; i < numImageElems; i++)
	{
		rst += Ibuffer[i];
	}
	printf("%d\n", rst);

	free(Ibuffer);
	free(iTransPosTMP);
	free(iTransSensTMP);
	free(iReceiveDataTMP);

	// Release device memory.
	CUDA_CHECK(cudaFree(iTransPos));
	CUDA_CHECK(cudaFree(iTransSens));
	CUDA_CHECK(cudaFree(iReceiveData));
	CUDA_CHECK(cudaFree(iIbuffer));
	return 0;
}

system · 2013 年8 月 21 日 12:42

额，没显示出来，就是75行和112行来回跳跃。好奇怪

system · 2013 年8 月 21 日 13:51

楼主您好，通过仔细阅读您的代码，

您的75行和112行均是普通的host code, 按理说不至于突然从75行跳到112行，或者从112行跳到75行的。

我唯一能想象出来的是，您这是个多线程的host code, 同时您在75行和112行设置了断点，分别在2个线程中命中了75行的断点和112行的断点，结果给您带来了来回跳跃的错觉。
（您可以在调试状态通过debug菜单的windows->threads来观察各个线程的当前状态）
（是的，虽然您看上去是从main()开始执行的，但这个不一定，您可以有其他的程序入口，并在那里从main()执行了2个host thread)
(如果您没有隐含没发的代码，请无视此解释）

这是我认为较为可能的解释；如果对您不适用，请无视。也建议其他会员尝试为您解答。

system · 2013 年8 月 22 日 01:28

谢谢斑竹的耐心解答。我看了下，运行到75行时只有一个主线程。在运行到cudamalloc后，才会出现两个辅线程。
我刚才看了下，似乎是编译器优化产生的问题，我将cuda runtime api 中optimization 改为 disable，似乎就行了

system · 2013 年8 月 22 日 01:31

请不要在调试的时候开启优化，感谢来访。

system · 2013 年8 月 22 日 01:32

以及，您未能理解我之前的“多个线程”含义，这里指的是您自己手工创建的host threads, 而不含cuda runtime自己创建的。

之前说过，如果您的代码是从main开始执行的，请无视此建议。

感谢来访。

system · 2013 年8 月 22 日 03:20

谢谢斑竹，我没有create threads，但现在我的问题基本解决了，已经有结果了。万分感谢！