我仿照例子程序写了一个1024*1024两个矩阵相乘的程序,为了相互比对,我分别在CPU上计算一次,在GPU上计算一次,用GPU的CUBLAS库计算一次.为了避免不必要的麻烦,我都用的FLOAT. 这三者输出应该一样,或者相应不大.
我原来环境为 WINXP VS2010 CPU AMD X4 635 GPU GT240 …结果相差很小,都在小数点后面.
目前环境SLES X64 SERVER CPU I7 3770K GPU GTX660 ,结果相差较大,请问问题出在哪里?
代码如下:
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <assert.h>
#include <pthread.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>
#define TILE_WIDTH 16
#define MAX_DIM 1024
float MatrixA[MAX_DIM][MAX_DIM];
float MatrixB[MAX_DIM][MAX_DIM];
float MatrixC[MAX_DIM][MAX_DIM];
/* 设置矩阵内容 */
void FillMatrix()
{
register int i, j;
srand( ( unsigned int )time( NULL ) );
for ( i = 0; i < MAX_DIM; i ++ )
{
for ( j = 0; j < MAX_DIM; j ++ )
{
MatrixA[i][j] = ( float )rand() * ((float)rand() / 10000) / RAND_MAX;
MatrixB[i][j] = ( float )rand() * ((float)rand() / 10000) / RAND_MAX;
}
}
}
/********************************************************************/
/* 运行在CPU上,最笨的方法 */
void RunOnCPU()
{
float sum;
register int i, j, k;
for ( i = 0; i < MAX_DIM; ++ i )
{
for ( j = 0; j < MAX_DIM; ++ j )
{
sum = 0;
for ( k = 0; k < MAX_DIM; ++ k )
{
sum += MatrixA[i][k] * MatrixB[k][j];
}
MatrixC[i][j] = sum;
}
}
}
/********************************************************************/
/* 运行在GPU上 */
__global__ void Matrix_Mul1( float* c, const float* a, const float* b )
{
unsigned int i, j, bx, by, tx, ty;
float mulResult;
__shared__ float d_m[TILE_WIDTH][TILE_WIDTH];
__shared__ float d_n[TILE_WIDTH][TILE_WIDTH];
bx = blockIdx.x;
by = blockIdx.y;
tx = threadIdx.x;
ty = threadIdx.y;
mulResult = 0.0f;
for ( i = 0; i < gridDim.x; ++i )
{
d_m[ty][tx] = *( a + ( by * blockDim.y + ty ) * MAX_DIM + i * blockDim.x + tx );
d_n[ty][tx] = *( b + ( i * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx );
__syncthreads();
for ( j = 0; j < blockDim.x; ++ j )
{
mulResult += d_m[ty][j] * d_n[j][tx];
}
__syncthreads();
}
c[( by * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx] = mulResult;
}
void MatrixMul1( float* c, const float* a, const float* b )
{
int cnt;
float* dev_a;
float* dev_b;
float* dev_c;
cudaError_t cudaStatus;
// 64 * 64 ====> 16 * 16
dim3 grid( MAX_DIM / TILE_WIDTH, MAX_DIM / TILE_WIDTH );
dim3 blocks( TILE_WIDTH, TILE_WIDTH );
cnt = MAX_DIM * MAX_DIM;
dev_a = NULL;
dev_b = NULL;
dev_c = NULL;
/* 设置显卡,构建上下文 */
cudaStatus = cudaSetDevice( 0 );
assert( cudaStatus == cudaSuccess );
/* 分配显存 */
cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
assert( cudaStatus == cudaSuccess );
cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
assert( cudaStatus == cudaSuccess );
cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
assert( cudaStatus == cudaSuccess );
/* 内存传送数据到显存 */
cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
assert( cudaStatus == cudaSuccess );
cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
assert( cudaStatus == cudaSuccess );
/* 调用显卡 */
Matrix_Mul1 <<< grid, blocks >>> ( dev_c, dev_a, dev_b );
/* 设备同步 */
cudaStatus = cudaDeviceSynchronize();
assert( cudaStatus == cudaSuccess );
/* 结果从显存传送到内存 */
cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
assert( cudaStatus == cudaSuccess );
/* 释放显存 */
cudaFree( dev_c );
cudaFree( dev_a );
cudaFree( dev_b );
/* 重启显卡(上下文) */
cudaDeviceReset();
}
/********************************************************************/
/* 使用CUBLAS库 */
void MatrixMul2( float* c, const float* a, const float* b )
{
int cnt;
float* dev_a;
float* dev_b;
float* dev_c;
cublasHandle_t handle;
cublasStatus_t cuBlasStatus;
cudaError_t cudaStatus;
float alpha;
float beta;
// 64 * 64 ====> 16 * 16
dim3 grid( MAX_DIM / TILE_WIDTH, MAX_DIM / TILE_WIDTH );
dim3 blocks( TILE_WIDTH, TILE_WIDTH );
dev_a = NULL;
dev_b = NULL;
dev_c = NULL;
cnt = MAX_DIM * MAX_DIM;
alpha = 1.0f;
beta = 0.0f;
/* 设置显卡,构建上下文 */
cudaStatus = cudaSetDevice( 0 );
assert( cudaStatus == cudaSuccess );
/* 初始化BLAS库 */
cuBlasStatus = cublasCreate( &handle );
assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );
/* 分配显存 */
cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
assert( cudaStatus == cudaSuccess );
cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
assert( cudaStatus == cudaSuccess );
cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
assert( cudaStatus == cudaSuccess );
/* 内存传送数据到显存 */
cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
assert( cudaStatus == cudaSuccess );
cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
assert( cudaStatus == cudaSuccess );
/* 处理 */
cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, \
MAX_DIM, MAX_DIM, MAX_DIM, &alpha, \
dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );
/* 处理 */
cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, \
MAX_DIM, MAX_DIM, MAX_DIM, &alpha, \
dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );
/* 结果从显存传送到内存 */
cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
assert( cudaStatus == cudaSuccess );
/* 销毁BLAS */
cuBlasStatus = cublasDestroy( handle );
assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );
/* 重启显卡(上下文) */
cudaDeviceReset();
}
/********************************************************************/
int main()
{
struct timeval tm1, tm2;
printf("short:%d\tint:%d\tlong:%d\tfloat:%d\tdouble:%d\n", sizeof(short), sizeof(int), sizeof(long), sizeof(float), sizeof(double));
FillMatrix();
memset( MatrixC, 0, sizeof( MatrixC ) );
gettimeofday(&tm1, NULL);
RunOnCPU();
gettimeofday(&tm2, NULL);
printf( "%f, %f, %f, %f\nTIMES: %f\n\n", MatrixC[0][0], MatrixC[37][71], MatrixC[MAX_DIM - 1][MAX_DIM - 1], MatrixC[217][13],(tm2.tv_sec - tm1.tv_sec) * 1000 + (float)(tm2.tv_usec - tm1.tv_usec) / 1000 );
memset( MatrixC, 0, sizeof( MatrixC ) );
gettimeofday(&tm1, NULL);
MatrixMul1( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
gettimeofday(&tm2, NULL);
printf( "%f, %f, %f, %f\nTIMES: %f\n\n", MatrixC[0][0], MatrixC[37][71], MatrixC[MAX_DIM - 1][MAX_DIM - 1], MatrixC[217][13],(tm2.tv_sec - tm1.tv_sec) * 1000 + (float)(tm2.tv_usec - tm1.tv_usec) / 1000 );
memset( MatrixC, 0, sizeof( MatrixC ) );
gettimeofday(&tm1, NULL);
MatrixMul2( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
gettimeofday(&tm2, NULL);
printf( "%f, %f, %f, %f\nTIMES: %f\n\n", MatrixC[0][0], MatrixC[37][71], MatrixC[MAX_DIM - 1][MAX_DIM - 1], MatrixC[217][13],(tm2.tv_sec - tm1.tv_sec) * 1000 + (float)(tm2.tv_usec - tm1.tv_usec) / 1000 );
return 0;
}
结果如下
short:2 int:4 long:8 float:4 double:8
2921729884160.000000, 2847927435264.000000, 3035846410240.000000, 2896220389376.000000
TIMES: 2813.224121
2921730146304.000000, 2847927435264.000000, 3035846410240.000000, 2896220389376.000000
TIMES: 207.865005
2921730146304.000000, 2847927435264.000000, 3035846410240.000000, 2896220389376.000000
TIMES: 174.639008
请问是什么原因?
谢谢了.