I adapted Ping-Che Chen's code into an OpenMP version, but after running it, it turns out to be slower than the single-threaded version. I don't understand why and hope someone can point me in the right direction.
Run the program with, for example, -n 20 -s 1000 to perform 20 multiplications of size-1000 matrices.
P.S. My test device is a C1070.
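
For reference, one possible way to build and run it (the file name matmul_omp.cu and the exact OpenMP flags are my assumptions here and may differ depending on nvcc's host compiler):

nvcc -Xcompiler -fopenmp matmul_omp.cu -o matmul_omp -lgomp
./matmul_omp -n 20 -s 1000
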
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>          /* for strcmp() */
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

/* globals shared by all CPU threads */
unsigned int cpu_thread_id;
unsigned int num_cpu_threads;
int num_gpus = 0;
int gpu_id = -1;
int count = 0;
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
    /* one BLOCK_SIZE x BLOCK_SIZE tile of A and of B in shared memory */
    __shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
    const int tidc = threadIdx.x;
    const int tidr = threadIdx.y;
    const int bidc = blockIdx.x * BLOCK_SIZE;
    const int bidr = blockIdx.y * BLOCK_SIZE;
    int i, j;
    float results = 0;
    float comp = 0;          /* compensation term for Kahan summation */

    for(j = 0; j < n; j += BLOCK_SIZE) {
        /* load one tile of A and one tile of B */
        matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
        matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];
        __syncthreads();

        /* accumulate the partial dot product with Kahan (compensated) summation */
        for(i = 0; i < BLOCK_SIZE; i++) {
            float t;
            comp -= matA[tidr][i] * matB[i][tidc];
            t = results - comp;
            comp = (t - results) + comp;
            results = t;
        }
        __syncthreads();
    }

    c[(tidr + bidr) * ldc + tidc + bidc] = results;
}
void matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
    /* count is a shared global, so update it atomically */
    #pragma omp atomic
    count++;

    float *ac, *bc, *cc;
    size_t pitch_a, pitch_b, pitch_c;
    /* round n up to a multiple of BLOCK_SIZE and allocate zero-padded device matrices */
    int newn = ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;
    cudaMallocPitch((void**) &ac, &pitch_a, sizeof(float) * newn, newn);
    cudaMallocPitch((void**) &bc, &pitch_b, sizeof(float) * newn, newn);
    cudaMallocPitch((void**) &cc, &pitch_c, sizeof(float) * newn, newn);
    cudaMemset(ac, 0, pitch_a * newn);
    cudaMemset(bc, 0, pitch_b * newn);
    cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda, sizeof(float) * n, n, cudaMemcpyHostToDevice);
    cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb, sizeof(float) * n, n, cudaMemcpyHostToDevice);

    int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 blocks(bx, bx);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    matMultCUDA<<<blocks, threads>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

    cudaMemcpy2D(c, sizeof(float) * ldc, cc, pitch_c, sizeof(float) * n, n, cudaMemcpyDeviceToHost);
    cudaFree(ac);
    cudaFree(bc);
    cudaFree(cc);
}
void matgen(float* a, int lda, int n)
{
    int i, j;
    for(i = 0; i < n; i++) {
        for(j = 0; j < n; j++) {
            /* random value in [0, 1] plus a small refinement; the cast avoids
               signed integer overflow in RAND_MAX * RAND_MAX */
            a[i * lda + j] = (float) rand() / RAND_MAX + (float) rand() / ((float) RAND_MAX * RAND_MAX);
        }
    }
}
int main(int argc, char *argv[])
{
    int size;
    int loop;

    cudaGetDeviceCount(&num_gpus);
    if(num_gpus < 1) {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }

    if(argc == 5 && !strcmp(argv[1], "-n") && !strcmp(argv[3], "-s")) {
        loop = atoi(argv[2]);
        size = atoi(argv[4]);
        omp_set_num_threads(num_gpus);   // create as many CPU threads as there are CUDA devices
        printf("---------------------------\n");
    } else {
        printf("parameter error\nusage: %s -n <loop count> -s <matrix size>\n", argv[0]);
        return 1;
    }

    num_cpu_threads = omp_get_num_threads();   // outside a parallel region this is 1
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);

    int i;
    clock_t start, end;
    start = clock();   // clock() reports processor time for the whole process, not wall-clock time

    // cpu_thread_id and gpu_id are globals, so privatize them to avoid a data race between threads
    #pragma omp parallel for num_threads(num_gpus) private(cpu_thread_id, gpu_id)
    for(i = 0; i < loop; i++) {
        // set and check the CUDA device for this CPU thread
        cpu_thread_id = omp_get_thread_num();
        cudaSetDevice(cpu_thread_id % num_gpus);   // "% num_gpus" allows more CPU threads than GPU devices
        cudaGetDevice(&gpu_id);
        printf("iteration %d: CPU thread %d (of %d) uses CUDA device %d\n", i, omp_get_thread_num(), omp_get_num_threads(), gpu_id);

        int n = size;
        float *a, *b, *c, *d;
        a = (float*) malloc(sizeof(float) * n * n);
        b = (float*) malloc(sizeof(float) * n * n);
        c = (float*) malloc(sizeof(float) * n * n);
        d = (float*) malloc(sizeof(float) * n * n);   // d is allocated but not used here

        srand(0);
        matgen(a, n, n);
        matgen(b, n, n);

        matmultCUDA(a, n, b, n, c, n, n);

        free(a);
        free(b);
        free(d);
        free(c);
    }

    end = clock();
    double sec = (double) (end - start) / CLOCKS_PER_SEC;
    // 2*n^3 floating-point operations per matrix multiplication
    printf("count %i Time used: %.4lf (%.2lf GFLOPS)\n", count, sec, 2.0 * loop * size * size * size / (sec * 1E9));

    cudaThreadExit();
    return 0;
}
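
A note on the timing: clock() typically reports processor time accumulated by the whole process (all threads together), while omp_get_wtime() returns wall-clock time. A minimal sketch of the alternative measurement, reusing loop, num_gpus and size from the program above (the per-iteration body is elided):

double wall_start = omp_get_wtime();       /* wall-clock timestamp in seconds */
#pragma omp parallel for num_threads(num_gpus)
for (int i = 0; i < loop; i++) {
    /* ... same per-iteration work as in main() above ... */
}
double wall_sec = omp_get_wtime() - wall_start;
printf("wall-clock: %.4f s (%.2f GFLOPS)\n",
       wall_sec, 2.0 * loop * size * size * size / (wall_sec * 1E9));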