cuda openmp 第二个GPU比第一个GPU多花费200ms?

为什么第二个GPU花费比第一个多了近200ms

通过隐藏gpu的操作分析,在第一步的cudamalloc中就出现了这个差异。

求解决方法。
不然结果跟一个GPU的时间一样了,没加速效果了都

——————————————————
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

#include <cuda_runtime.h>

#include <omp.h>

#define band 224

#define thread_num 256

bool InitCUDA() {

int count; 

cudaGetDeviceCount(&count); 

if(count == 0) { 

	printf("There is no device.\n"); 

	return false; 

} 

printf("there are %d GPUs!\n",count);

int i; 

for(i = 0; i < count; i++) { 

	cudaDeviceProp prop; 

	if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) { 

		if(prop.major >= 1) {printf("GPU %d : %s\n",i,prop.name); break; } 

	} 

} 

cudaSetDevice(i);

return true; 

}

global void gpucovto0(float *gpucov)//covž³0

{

const int bidx=blockIdx.x;

const int tidx=threadIdx.x;

gpucov[bidx*band+tidx]=0;

}

global void duijiao(unsigned char *gpurawdata, float *gpucov, long imagesize)

{

__shared__ float x[thread_num],mul[thread_num];

const int bidx=blockIdx.x;

const int tidx=threadIdx.x;

const int size=blockDim.x;

if (tidx<band)

{

	gpucov[bidx*band+tidx]=0;

}

unsigned int tid=threadIdx.x;

float x1=0;float mul1=0;

float temp;

while(tid<imagesize)

{

	temp=gpurawdata[bidx*imagesize+tid];

	x1+=temp;

	mul1+=temp*temp;

	tid+=size;

}

x[tidx]=x1;

mul[tidx]=mul1;

__syncthreads();

unsigned int i=blockDim.x/2;

while (i!=0)

{

	if (tidx<i)

	{

		x[tidx]+=x[tidx+i];

		mul[tidx]+=mul[tidx+i];

	}

	__syncthreads();	

	i/=2;

}

if (tidx==0)

{

	float img=imagesize;

	gpucov[bidx*band+bidx]=(mul[0]-x[0]*x[0]/img)/(img-1);

}

}

global void feiduijiao(unsigned char *gpurawdata, float *gpucov, long imagesize, int p)

{

__shared__ float x[thread_num],y[thread_num],mul[thread_num];

const int bidx=blockIdx.x;

const int tidx=threadIdx.x;

const int size=blockDim.x;

unsigned int tid=threadIdx.x;

float x1=0,y1=0,mul1=0;

float temp0,temp1;

if (p<bidx)

{

	while(tid<imagesize)

	{

		temp0=gpurawdata[p*imagesize+tid];

		temp1=gpurawdata[bidx*imagesize+tid];

		x1+=temp0;

		y1+=temp1;

		mul1+=temp0*temp1;

		tid+=size;

	}

	x[tidx]=x1;

	y[tidx]=y1;

	mul[tidx]=mul1;

	__syncthreads();

	unsigned int i=blockDim.x/2;

	while (i!=0)

	{

		if (tidx<i)

		{

			x[tidx]+=x[tidx+i];

			y[tidx]+=y[tidx+i];

			mul[tidx]+=mul[tidx+i];

		}

		__syncthreads();	

		i/=2;

	}

	if (tidx==0)

	{

		float img=imagesize;

		float temp=(mul[0]-x[0]*y[0]/img)/(img-1);

		gpucov[bidx*band+p]=temp;

		gpucov[p*band+bidx]=temp;

	}

}

else if (p>bidx)

{

	int j=band-1-p;

	int k=band-1-bidx;

	while(tid<imagesize)

	{

		temp0=gpurawdata[j*imagesize+tid];

		temp1=gpurawdata[k*imagesize+tid];

		x1+=temp0;

		y1+=temp1;

		mul1+=temp0*temp1;

		tid+=size;

	}

	x[tidx]=x1;

	y[tidx]=y1;

	mul[tidx]=mul1;

	__syncthreads();

	unsigned int i=blockDim.x/2;

	while (i!=0)

	{

		if (tidx<i)

		{

			x[tidx]+=x[tidx+i];

			y[tidx]+=y[tidx+i];

			mul[tidx]+=mul[tidx+i];

		}

		__syncthreads();	

		i/=2;

	}

	if (tidx==0)

	{

		float img=imagesize;

		float temp=(mul[0]-x[0]*y[0]/img)/(img-1);

		gpucov[j*band+k]=temp;

		gpucov[k*band+j]=temp;

	}

}

}

void getcov(unsigned char *rawdata, float cov, long imagesize)

{

printf("GPU²¢ÐÐʵÏÖ£º\n");

//InitCUDA();



int count; 

cudaGetDeviceCount(&count); 

if(count == 0) { 

	printf("There is no device.\n"); 

// return false;

} 

printf("there are %d GPUs!\n",count);

// omp_set_num_threads(count);

printf("num_threads=%d\n",omp_get_max_threads());

float *buffer[8];

for (int i=0;i<count;i++)

{

	buffer[i]=(float *)malloc(sizeof(float)*band*band);

}

#pragma omp parallel

{

	cudaDeviceProp prop; 

	int i,j,k,n;

	int gpuid=omp_get_thread_num();

	if (gpuid>=count)

	{

		printf("error:\tcpu threads is %d.gpu num is %d",omp_get_num_threads(),count);exit(1);

	}

	if(cudaGetDeviceProperties(&prop, gpuid) == cudaSuccess) 

	{ 

		if(prop.major >= 1) 

		{

			printf("GPU %d : %s\n",gpuid,prop.name); 

		} 

	}
	struct timeval tv0,tv1;
	struct timezone tz0,tz1;
	gettimeofday(&tv0,&tz0);

	cudaSetDevice(gpuid);

	cudaGetDevice(&gpuid);

	float *gpucov,*gpucov1;

	unsigned char *gpurawdata;

	cudaMalloc((void**) &gpurawdata, sizeof(char) *imagesize*band);

	cudaMalloc((void**) &gpucov,sizeof(float) *band*band);

	cudaMemcpy(gpurawdata,rawdata,sizeof(char)*imagesize*band,cudaMemcpyHostToDevice);

	if (0==gpuid)

	{

		duijiao<<<band,thread_num>>>(gpurawdata,gpucov,imagesize);

	}

	else

	{

		gpucovto0<<<band,band>>>(gpucov);

	}



	if (gpuid != (count-1))

	{

		n=(gpuid+1)*(band/2/count);

	}

	else

	{

		n=band/2;

	}

printf(“thread %d:%d to %d\n”,gpuid,gpuid*(band/2/count),n);

	for (j=gpuid*(band/2/count); j<n; j++)

	{

		feiduijiao<<<band,thread_num>>>(gpurawdata,gpucov,imagesize,j);

	}

//#pragma omp barrier

	cudaMemcpy(buffer[gpuid],gpucov,sizeof(float)*band*band,cudaMemcpyDeviceToHost);
cudaFree(gpucov);
cudaFree(gpurawdata);

// cudaDeviceReset();
gettimeofday(&tv1,&tz1);
printf(“thread %d : %d微妙\n”,gpuid,1000000 * (tv1.tv_sec - tv0.tv_sec) + tv1.tv_usec - tv0.tv_usec);

	if (0==gpuid)

	{

		for (i=0;i<band;i++)

		{

			cov[i*band+i]=buffer[0][i*band+i];

		}

	}

	for (j=gpuid*(band/2/count); j<n; j++)

	{

		for (i=0;i<band;i++)

		{

			if (j<i)

			{

				float temp=buffer[gpuid][i*band+j];

				cov[j*band+i]=temp;

				cov[i*band+j]=temp;

			}

			else if (j>i)

			{

				int i1=band-1-i;

				int j1=band-1-j;

				float temp=buffer[gpuid][i1*band+j1];

				cov[j1*band+i1]=temp;

				cov[i1*band+j1]=temp;

			}

		}

	}

}

for (int i=54;i<59;i++)

{

	for (int j=54;j<59;j++)

	{

		printf("%f\t",cov[i*band+j]);

	}

	printf("\n");

}

/*

float *gpucov;

unsigned char *gpurawdata;

cudaMalloc((void**) &gpurawdata, sizeof(char) *imagesize*band);

cudaMalloc((void**) &gpucov,sizeof(float) *band*band);

cudaMemcpy(gpurawdata,rawdata,sizeof(char)*imagesize*band,cudaMemcpyHostToDevice);

duijiao<<<band,thread_num>>>(gpurawdata,gpucov,imagesize);

int band2=band/2;

for (i=0; i<band2; i++)

{

	feiduijiao<<<band,thread_num>>>(gpurawdata,gpucov,imagesize,i);

}

cudaMemcpy(cov,gpucov,sizeof(float)*band*band,cudaMemcpyDeviceToHost);

*/

/*

for (i=0; i<5; i++)

{

	for (j=0; j<5; j++)

	{

		printf("%f\t",cov[i*band+j]);

	}

	printf("\n");

}

*/

/*

double m,n;

for (j=0; j<band2; j++)

{

	for (k=0; k<band; k++)

	{

		if (j<k)

		{

			double x=0;

			double y=0;

			double mul=0;

			for(i=0;i<imagesize;i++)

			{

				double m=rawdata[k*imagesize+i];

				double n=rawdata[j*imagesize+i];

				x+=m;

				y+=n;

				mul+=m*n;

			}

			cov[k*band+j]=(mul-x*y/imagesize)/(imagesize-1);

			cov[j*band+k]=cov[k*band+j];

		}

		else if(j>k)

		{

			double x=0;

			double y=0;

			double mul=0;

			int k1=band-1-k;

			int j1=band-1-j;

			for(i=0;i<imagesize;i++)

			{

				double m=rawdata[k1*imagesize+i];

				double n=rawdata[j1*imagesize+i];

				x+=m;

				y+=n;

				mul+=m*n;

			}

			cov[k1*band+j1]=(mul-x*y/imagesize)/(imagesize-1);

			cov[j1*band+k1]=cov[k1*band+j1];

		}

	}

}

*/

/*

for(k=band-1;k>=0;k--){

	for(j=0;j<band;j++)

	{

		if(j<=k)

		{

			double x=0;

			double y=0;

			double mul=0;

			for(i=0;i<imagesize;i++)

			{

				m=rawdata[k][i];

				n=rawdata[j][i];

				x+=m;

				y+=n;

				mul+=m*n;

			}

			cov[k*band+j]=(mul-x*y/imagesize)/(imagesize-1);

		}

		else

			cov[k*band+j]=cov[j*band+k];

	}

}

*/

}

楼主您好,这是因为runtime要初始化导致的延时。

建议:为了保证测试的公平性,请2个线程分别在测试前cudaSetDevice(0)和(1)一次,并分别执行一个空kernel一次。

然后请楼主重新观察您自己的结果。

cudaSetDevice的时间很短,两个基本一致的。

测出来是分配显存的时候时间产生差异,大概0.2s左右

楼主依然没能认真看我的回复,

如同我在2#所说,这是初始化所用的时间。

以及我重复一下2#的建议,希望这次你能采纳,
每个线程:
cudaSetDevice(1 or 2);
空kernel<<<>>>();

刚刚测试了,版主说的是对的

恭喜楼主。

并建议楼主以后不假思索的采纳我的建议,因为:
(1)您对我的直接信任可以大幅度的减少您的工作时间!
(2)您对我的直接信任可以大幅度的提高您的工作效率!

感谢您的来访!