为什么第二个 GPU 的耗时比第一个多了近 200 ms?
通过逐个隐藏(注释掉)GPU 操作进行分析,发现这个差异在第一步的 cudaMalloc 中就已经出现了。
求解决方法。
否则总耗时跟只用一个 GPU 时一样,就完全没有加速效果了。
——————————————————
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <omp.h>
// Number of bands: the raw data is indexed as band rows of imagesize bytes,
// and the covariance matrix is band x band floats.
#define band 224
// Threads per block used for the reduction kernels; it also sizes their
// __shared__ scratch arrays. The kernels halve the active thread count each
// step, so this needs to be a power of two.
#define thread_num 256
/*
 * Select the first CUDA device whose compute capability major version is >= 1
 * and make it the current device for the calling host thread.
 *
 * Returns true when a device was found and selected. Returns false when the
 * device count cannot be queried, when there is no device, when no device
 * passes the capability test, or when cudaSetDevice fails.
 */
bool InitCUDA() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    if (count == 0) {
        printf("There is no device.\n");
        return false;
    }
    printf("there are %d GPUs!\n", count);

    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printf("GPU %d : %s\n", i, prop.name);
                break;
            }
        }
    }

    // The loop falls through with i == count when nothing qualified; selecting
    // that ordinal would be an invalid-device error, so report failure instead.
    if (i == count) {
        printf("There is no usable CUDA device.\n");
        return false;
    }

    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        printf("cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
        return false;
    }
    return true;
}
/*
 * Set every element of the band x band matrix gpucov to zero.
 *
 * Launch layout: one block per row and one thread per column, i.e.
 * <<<band, band>>>. Blocks or threads beyond band are ignored.
 */
__global__ void gpucovto0(float *gpucov)
{
    const int bidx = blockIdx.x;
    const int tidx = threadIdx.x;
    // Guard so an oversized launch cannot write outside the matrix.
    if (bidx < band && tidx < band)
    {
        gpucov[bidx * band + tidx] = 0.0f;
    }
}
/*
 * Diagonal kernel: block b clears row b of gpucov and then stores the sample
 * variance of raw-data row b in gpucov[b*band + b].
 *
 * gpurawdata : band rows of imagesize bytes each (row b starts at b*imagesize).
 * gpucov     : band x band float matrix, row-major.
 * imagesize  : number of samples per row; the result divides by (imagesize-1).
 *
 * Launch layout: <<<band, T>>> where T is a power of two and T <= thread_num
 * (the shared scratch arrays hold thread_num floats and the reduction halves
 * the active range each step). No dynamic shared memory is used.
 */
__global__ void duijiao(unsigned char *gpurawdata, float *gpucov, long imagesize)
{
    __shared__ float x[thread_num], mul[thread_num];
    const int bidx = blockIdx.x;
    const int tidx = threadIdx.x;
    const int size = blockDim.x;

    // Clear the whole row; striding by the block size keeps this correct even
    // when the block has fewer than band threads.
    for (int col = tidx; col < band; col += size)
    {
        gpucov[bidx * band + col] = 0.0f;
    }

    // Per-thread partial sums of v and v*v over this block's row.
    // 64-bit offsets so large images cannot overflow the index.
    const unsigned char *row = gpurawdata + (size_t)bidx * (size_t)imagesize;
    float x1 = 0.0f;
    float mul1 = 0.0f;
    for (long idx = tidx; idx < imagesize; idx += size)
    {
        const float v = row[idx];
        x1 += v;
        mul1 += v * v;
    }
    x[tidx] = x1;
    mul[tidx] = mul1;
    __syncthreads();

    // Tree reduction in shared memory; every thread reaches each barrier.
    for (int i = size / 2; i != 0; i /= 2)
    {
        if (tidx < i)
        {
            x[tidx] += x[tidx + i];
            mul[tidx] += mul[tidx + i];
        }
        __syncthreads();
    }

    if (tidx == 0)
    {
        // var = (sum(v^2) - sum(v)^2 / n) / (n - 1)
        const float img = imagesize;
        gpucov[bidx * band + bidx] = (mul[0] - x[0] * x[0] / img) / (img - 1);
    }
}
/*
 * Off-diagonal kernel: each block computes one sample covariance between two
 * raw-data rows and stores it symmetrically in gpucov.
 *
 * For a given p, block b handles
 *   - rows (p, b)                      when p < b,
 *   - rows (band-1-p, band-1-b)        when p > b  (the mirrored pair),
 *   - nothing                          when p == b.
 *
 * gpurawdata : band rows of imagesize bytes each.
 * gpucov     : band x band float matrix, row-major.
 * imagesize  : number of samples per row; the result divides by (imagesize-1).
 * p          : fixed row index for this launch, expected in [0, band).
 *
 * Launch layout: <<<band, T>>> where T is a power of two and T <= thread_num.
 * No dynamic shared memory is used.
 */
__global__ void feiduijiao(unsigned char *gpurawdata, float *gpucov, long imagesize, int p)
{
    __shared__ float x[thread_num], y[thread_num], mul[thread_num];
    const int bidx = blockIdx.x;
    const int tidx = threadIdx.x;
    const int size = blockDim.x;

    // These exits depend only on blockIdx and p, so a whole block leaves
    // together and no thread is left waiting at a barrier below.
    if (p < 0 || p >= band || bidx >= band || p == bidx)
    {
        return;
    }

    // Pick the pair of rows this block is responsible for.
    int rowA, rowB;
    if (p < bidx)
    {
        rowA = p;
        rowB = bidx;
    }
    else
    {
        rowA = band - 1 - p;
        rowB = band - 1 - bidx;
    }

    // 64-bit offsets so large images cannot overflow the index.
    const unsigned char *a = gpurawdata + (size_t)rowA * (size_t)imagesize;
    const unsigned char *b = gpurawdata + (size_t)rowB * (size_t)imagesize;

    // Per-thread partial sums of a, b and a*b.
    float x1 = 0.0f, y1 = 0.0f, mul1 = 0.0f;
    for (long idx = tidx; idx < imagesize; idx += size)
    {
        const float temp0 = a[idx];
        const float temp1 = b[idx];
        x1 += temp0;
        y1 += temp1;
        mul1 += temp0 * temp1;
    }
    x[tidx] = x1;
    y[tidx] = y1;
    mul[tidx] = mul1;
    __syncthreads();

    // Tree reduction in shared memory.
    for (int i = size / 2; i != 0; i /= 2)
    {
        if (tidx < i)
        {
            x[tidx] += x[tidx + i];
            y[tidx] += y[tidx + i];
            mul[tidx] += mul[tidx + i];
        }
        __syncthreads();
    }

    if (tidx == 0)
    {
        // cov = (sum(a*b) - sum(a)*sum(b) / n) / (n - 1), written to both
        // symmetric positions.
        const float img = imagesize;
        const float temp = (mul[0] - x[0] * y[0] / img) / (img - 1);
        gpucov[rowB * band + rowA] = temp;
        gpucov[rowA * band + rowB] = temp;
    }
}
// Print the failing CUDA call site and terminate. exit() is used (rather than
// return) because the macro is also expanded inside an OpenMP parallel region,
// which may not be left with a return statement.
#define GETCOV_CUDA_CHECK(call)                                               \
    do {                                                                      \
        cudaError_t getcov_err_ = (call);                                     \
        if (getcov_err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(getcov_err_));                         \
            exit(1);                                                          \
        }                                                                     \
    } while (0)

/*
 * Compute the band x band sample covariance matrix of rawdata on all visible
 * GPUs (at most 8), one OpenMP host thread per GPU.
 *
 * rawdata   : host buffer of band rows, imagesize bytes each.
 * cov       : host output, band x band floats, row-major.
 * imagesize : samples per row; must be at least 2 because the kernels divide
 *             by (imagesize - 1).
 *
 * Work split: the launch index p of feiduijiao runs over [0, band/2); that
 * range is cut into `count` contiguous chunks and the last GPU also takes the
 * remainder. GPU 0 additionally computes the diagonal with duijiao. Every GPU
 * receives a full copy of rawdata and returns a full band x band matrix, from
 * which only the entries it computed are merged into cov.
 *
 * Timing note: the first CUDA call a host thread makes on a device creates
 * that device's context, which can take on the order of 100 ms and otherwise
 * shows up inside the first cudaMalloc. The context is therefore created
 * before the timer starts, so the printed per-thread time covers allocation,
 * copies and kernels only.
 *
 * A single-GPU variant (duijiao once, then feiduijiao for p in [0, band/2) on
 * one device) and a CPU reference using the same formula,
 *   cov[k][j] = (sum(m*n) - sum(m)*sum(n)/imagesize) / (imagesize - 1),
 * previously lived here as commented-out code.
 */
void getcov(unsigned char *rawdata, float *cov, long imagesize)
{
    printf("GPU并行实现:\n");

    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0)
    {
        printf("There is no device.\n");
        return;
    }
    printf("there are %d GPUs!\n", count);

    if (imagesize < 2)
    {
        printf("error:\timagesize must be at least 2\n");
        return;
    }

    // One host staging buffer per GPU; the array holds at most 8 of them.
    const int max_gpus = 8;
    if (count > max_gpus)
    {
        count = max_gpus;
    }
    printf("num_threads=%d\n", count);

    float *buffer[max_gpus];
    const size_t cov_bytes = sizeof(float) * band * band;
    const size_t raw_bytes = (size_t)imagesize * band;
    for (int g = 0; g < count; g++)
    {
        buffer[g] = (float *)malloc(cov_bytes);
        if (buffer[g] == NULL)
        {
            printf("error:\tout of host memory\n");
            while (g > 0)
            {
                free(buffer[--g]);
            }
            return;
        }
    }

    // Exactly one host thread per GPU; the default team size is usually the
    // core count, which would not match the number of devices.
#pragma omp parallel num_threads(count)
    {
        const int gpuid = omp_get_thread_num();
        if (omp_get_num_threads() != count)
        {
            // Fewer threads than GPUs would silently skip part of the matrix.
            printf("error:\tcpu threads is %d.gpu num is %d", omp_get_num_threads(), count);
            exit(1);
        }

        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, gpuid) == cudaSuccess && prop.major >= 1)
        {
            printf("GPU %d : %s\n", gpuid, prop.name);
        }

        // Bind this thread to its GPU and force context creation now, so the
        // one-time initialisation cost is not charged to the timed section.
        GETCOV_CUDA_CHECK(cudaSetDevice(gpuid));
        GETCOV_CUDA_CHECK(cudaFree(0));

        struct timeval tv0, tv1;
        gettimeofday(&tv0, NULL);

        float *gpucov = NULL;
        unsigned char *gpurawdata = NULL;
        GETCOV_CUDA_CHECK(cudaMalloc((void **)&gpurawdata, raw_bytes));
        GETCOV_CUDA_CHECK(cudaMalloc((void **)&gpucov, cov_bytes));
        GETCOV_CUDA_CHECK(cudaMemcpy(gpurawdata, rawdata, raw_bytes, cudaMemcpyHostToDevice));

        // GPU 0 computes the diagonal (which also clears its matrix); the
        // others just clear theirs.
        if (0 == gpuid)
        {
            duijiao<<<band, thread_num>>>(gpurawdata, gpucov, imagesize);
        }
        else
        {
            gpucovto0<<<band, band>>>(gpucov);
        }
        GETCOV_CUDA_CHECK(cudaGetLastError());

        // This GPU's slice [first, last) of p; the last GPU absorbs the
        // remainder of the integer division.
        const int chunk = band / 2 / count;
        const int first = gpuid * chunk;
        const int last = (gpuid != count - 1) ? (gpuid + 1) * chunk : band / 2;
        printf("thread %d:%d to %d\n", gpuid, first, last);

        for (int p = first; p < last; p++)
        {
            feiduijiao<<<band, thread_num>>>(gpurawdata, gpucov, imagesize, p);
        }
        GETCOV_CUDA_CHECK(cudaGetLastError());

        // Blocking copy: waits for the kernels above and surfaces any
        // execution error they raised.
        GETCOV_CUDA_CHECK(cudaMemcpy(buffer[gpuid], gpucov, cov_bytes, cudaMemcpyDeviceToHost));
        GETCOV_CUDA_CHECK(cudaFree(gpucov));
        GETCOV_CUDA_CHECK(cudaFree(gpurawdata));

        gettimeofday(&tv1, NULL);
        const long usec = 1000000L * (long)(tv1.tv_sec - tv0.tv_sec)
                        + (long)(tv1.tv_usec - tv0.tv_usec);
        printf("thread %d : %ld微秒\n", gpuid, usec);

        // Merge: each thread writes only the entries its own GPU computed, so
        // the threads never touch the same element of cov.
        if (0 == gpuid)
        {
            for (int d = 0; d < band; d++)
            {
                cov[d * band + d] = buffer[0][d * band + d];
            }
        }
        for (int p = first; p < last; p++)
        {
            for (int b = 0; b < band; b++)
            {
                if (p < b)
                {
                    const float temp = buffer[gpuid][b * band + p];
                    cov[p * band + b] = temp;
                    cov[b * band + p] = temp;
                }
                else if (p > b)
                {
                    // Mirrored pair, matching what feiduijiao does for p > b.
                    const int b1 = band - 1 - b;
                    const int p1 = band - 1 - p;
                    const float temp = buffer[gpuid][b1 * band + p1];
                    cov[p1 * band + b1] = temp;
                    cov[b1 * band + p1] = temp;
                }
            }
        }
    }

    // Spot-check print of a 5x5 window of the result.
    for (int i = 54; i < 59; i++)
    {
        for (int j = 54; j < 59; j++)
        {
            printf("%f\t", cov[i * band + j]);
        }
        printf("\n");
    }

    for (int g = 0; g < count; g++)
    {
        free(buffer[g]);
    }
}