囧。。果然还有问题。。哎。。版主厉害啊。。后面用了一个800480和4801200的矩阵相乘,同样把矩阵值设置成1,这次结果倒是480,但是这里在CPU上统计的时间是1141.11ms,而核函数统计的时间是0.0245755ms,两个时间不能相差这么多啊。代码如下:
这个是主函数:
void CDICView::OnMatrixmulGpu()
{
// TODO: Add your command handler code here
//////////////////////////////////////////////////////////////////////////
//给CPU分配空间并赋值
int i,j;
int BLOCK_SIZE=16;
int aw=30*BLOCK_SIZE;
int ah=50*BLOCK_SIZE;
int bw=80*BLOCK_SIZE;
int bh=aw;
int cw=bw;
int ch=ah;
float *Matrix_ha=(float*)malloc(aw*ah*sizeof(float)); //尽量用c语言,这样通用性比较强
float *Matrix_hb=(float*)malloc(bw*bh*sizeof(float));
float *Matrix_hc=(float*)malloc(cw*ch*sizeof(float));
for (i=0;i<ah;i++){ //a
for (j=0;j<aw;j++){
Matrix_ha[i*aw+j]=1/*(float)rand()/RAND_MAX*/;
}
}
for (i=0;i<bh;i++){ //b
for (j=0;j<bw;j++){
Matrix_hb[i*bw+j]=1/*(float)rand()/RAND_MAX*/;
}
}
memset(Matrix_hc,0,cw*ch*sizeof(float)); //c
ofstream cpu_time("cputim.txt");
//计算CPU上的数据和计算时间
_LARGE_INTEGER cpu_start_time;
_LARGE_INTEGER cpu_end_time;
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
double dqFreq=(double)f.QuadPart;
QueryPerformanceCounter(&cpu_start_time);
double s1;
for (i=0;i<ch;i++){
for (j=0;j<cw;j++){
s1=0;
for (int k=0;k<aw;k++){
s1=s1+Matrix_ha[i*aw+k]*Matrix_hb[k*bw+j];
}
Matrix_hc[i*cw+j]=s1;
}
}
QueryPerformanceCounter(&cpu_end_time);
cpu_time<<"计算时间为:"<<(cpu_end_time.QuadPart-cpu_start_time.QuadPart)/dqFreq*1000<<"ms"<<endl;
for (i=0;i<ch;i++){
for (j=0;j<cw;j++){
cpu_time<<Matrix_hc[i*cw+j]<<" ";
}
cpu_time<<endl;
}
AfxMessageBox("CPU计算完成!");
//////////////////////////////////////////////////////////////////////////
//GPU
MatrixMul(Matrix_hc,Matrix_ha,Matrix_hb);
//交出空间
free(Matrix_ha);
free(Matrix_hb);
free(Matrix_hc);
AfxMessageBox("GPU计算完成!");
}
下面是调用函数的:
void MatrixMul(float *Matrix_hc,const float *Matrix_ha,const float *Matrix_hb)
{
int i,j;
int BLOCK_SIZE=16;
int aw=30*BLOCK_SIZE;
int ah=50*BLOCK_SIZE;
int bw=80*BLOCK_SIZE;
int bh=aw;
int cw=bw;
int ch=ah;
float *Matrix_da,*Matrix_db,*Matrix_dc;
ofstream MatrixSub("MatrixSub.txt");
checkCudaErrors(cudaMalloc((void**)&Matrix_da,aw*ah*sizeof(float)));
checkCudaErrors(cudaMalloc((void**)&Matrix_db,bw*bh*sizeof(float)));
checkCudaErrors(cudaMalloc((void**)&Matrix_dc,cw*ch*sizeof(float)));
checkCudaErrors(cudaMemcpy(Matrix_da,Matrix_ha,aw*ah*sizeof(float),cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(Matrix_db,Matrix_hb,bw*bh*sizeof(float),cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(Matrix_dc,0,cw*ch*sizeof(float)));
dim3 BKSize((cw+BLOCK_SIZE-1)/BLOCK_SIZE,(ch+BLOCK_SIZE-1)/BLOCK_SIZE);
dim3 THSize(BLOCK_SIZE,BLOCK_SIZE);
_LARGE_INTEGER gpu_start_time,gpu_end_time;
LARGE_INTEGER f;
QueryPerformanceFrequency(&f);
double dqFreq=(double)f.QuadPart;
QueryPerformanceCounter(&gpu_start_time);
MatrixMul_Kernel<<<BKSize,THSize>>>(Matrix_dc,Matrix_da,Matrix_db,aw,cw);
QueryPerformanceCounter(&gpu_end_time);
MatrixSub<<"计算时间为:"<<(gpu_end_time.QuadPart-gpu_start_time.QuadPart)/dqFreq*1000<<"ms"<<endl;
//传值回来
float* Matrix_hc1=(float*)malloc(cw*ch*sizeof(float));
checkCudaErrors(cudaMemcpy(Matrix_hc1,Matrix_dc,cw*ch*sizeof(float),cudaMemcpyDeviceToHost));
//验证值
MatrixSub<<"CPU计算值为:"<<endl;
for (i=0;i<ch;i++){
for (j=0;j<cw;j++){
MatrixSub<<(Matrix_hc[i*cw+j])<<" ";
}
MatrixSub<<endl;
}
MatrixSub<<"GPU计算值为:"<<endl;
for (i=0;i<ch;i++){
for (j=0;j<cw;j++){
MatrixSub<<(Matrix_hc1[i*cw+j])<<" ";
}
MatrixSub<<endl;
}
MatrixSub<<"差值为:"<<endl;
for (i=0;i<ch;i++){
for (j=0;j<cw;j++){
MatrixSub<<(Matrix_hc1[i*cw+j]-Matrix_hc[i*cw+j])<<" ";
}
MatrixSub<<endl;
}
//释放空间
checkCudaErrors(cudaFree(Matrix_da));
checkCudaErrors(cudaFree(Matrix_db));
checkCudaErrors(cudaFree(Matrix_dc));
free(Matrix_hc1);
}
下面这个是核函数:
__global__ void MatrixMul_Kernel(float *dst_c,const float* src_a,const float* src_b,int width_a,int width_c)
{ //用的是2D的,不然不好分块
int BLOCK_SIZE=16;
int bidx=blockIdx.x;
int bidy=blockIdx.y;
int tidx=threadIdx.x;
int tidy=threadIdx.y;
int i,j;
int n=(width_a+BLOCK_SIZE-1)/BLOCK_SIZE; //循环次数
__shared__ float As[16][16];
__shared__ float Bs[16][16];
for (i=0;i<n;i++){
As[tidy][tidx]=src_a[(bidy*blockDim.y+tidy)*width_a+i*BLOCK_SIZE+tidx]; //y一样,x(0,width)
Bs[tidy][tidx]=src_b[(i*BLOCK_SIZE+tidy)*width_c+bidx*blockDim.x+tidx]; //x一样,y(0,width)
__syncthreads();
for (j=0;j<BLOCK_SIZE;j++){
dst_c[(bidy*blockDim.y+tidy)*width_c+bidx*blockDim.x+tidx]+=As[tidy][j]*Bs[j][tidx];
}
__syncthreads();
}
}
为什么两个统计的时间相差这么多呢?怎么感觉都不是很正常。。谢谢