Worse performance with OpenMP?

I used Ping-Che Chen's code as a reference and changed it into an OpenMP version, but after running it, it is slower than the single-threaded version. I don't understand why, and I hope someone can point me in the right direction.
Run the program with -n 20 -s 1000, for example, to perform 20 multiplications of size-1000 matrices.
p.s. My test device is a C1070
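
For reference, I compile and run it roughly like this (the file name is just what I saved it as; -Xcompiler -fopenmp and -lgomp are the usual flags for mixing OpenMP with nvcc):

   nvcc -Xcompiler -fopenmp matmul_omp.cu -o matmul_omp -lgomp
   ./matmul_omp -n 20 -s 1000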

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>   // for strcmp()
#include <time.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

unsigned int cpu_thread_id;
unsigned int num_cpu_threads;
int num_gpus = 0;
int gpu_id = -1;
int count = 0;
// Tiled matrix multiply: each thread block computes one BLOCK_SIZE x BLOCK_SIZE
// tile of C, staging tiles of A and B in shared memory and accumulating the
// dot product with Kahan (compensated) summation.
__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
   __shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
   __shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
   const int tidc = threadIdx.x;
   const int tidr = threadIdx.y;
   const int bidc = blockIdx.x * BLOCK_SIZE;
   const int bidr = blockIdx.y * BLOCK_SIZE;
   int i, j;

   float results = 0;
   float comp = 0;

   for(j = 0; j < n; j += BLOCK_SIZE) {
      // load one tile of A and one tile of B into shared memory
      matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
      matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];

      __syncthreads();

      // Kahan summation: 'comp' carries the rounding error of each step
      for(i = 0; i < BLOCK_SIZE; i++) {
         float t;
         comp -= matA[tidr][i] * matB[i][tidc];
         t = results - comp;
         comp = (t - results) + comp;
         results = t;
      }

      __syncthreads();
   }

   c[(tidr + bidr) * ldc + tidc + bidc] = results;
}



// Host wrapper: pads the matrices up to a multiple of BLOCK_SIZE, allocates
// pitched device buffers, copies A and B to the device, launches the kernel
// and copies the result back to host memory.
void matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
   #pragma omp atomic   // 'count' is shared between the OpenMP threads
   count++;
   float *ac, *bc, *cc;
   size_t pitch_a, pitch_b, pitch_c;
   int newn = ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;
   cudaMallocPitch((void**) &ac, &pitch_a, sizeof(float) * newn, newn);
   cudaMallocPitch((void**) &bc, &pitch_b, sizeof(float) * newn, newn);
   cudaMallocPitch((void**) &cc, &pitch_c, sizeof(float) * newn, newn);
   cudaMemset(ac, 0, pitch_a * newn);
   cudaMemset(bc, 0, pitch_b * newn);
   cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda, sizeof(float) * n, n, cudaMemcpyHostToDevice);
   cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb, sizeof(float) * n, n, cudaMemcpyHostToDevice);
   int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
   dim3 blocks(bx, bx);
   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
   matMultCUDA<<<blocks, threads>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);
   cudaMemcpy2D(c, sizeof(float) * ldc, cc, pitch_c, sizeof(float) * n, n, cudaMemcpyDeviceToHost);
   cudaFree(ac);
   cudaFree(bc);
   cudaFree(cc);
}

// Fill an n x n matrix (leading dimension lda) with random values in [0, 1]
void matgen(float* a, int lda, int n)
{
   int i, j;
   for(i = 0; i < n; i++) {
      for(j = 0; j < n; j++) {
         // cast avoids int overflow in RAND_MAX * RAND_MAX
         a[i * lda + j] = (float) rand() / RAND_MAX + (float) rand() / ((float) RAND_MAX * RAND_MAX);
      }
   }
}

int main(int argc, char *argv[])
{
   int size;
   int loop;
   cudaGetDeviceCount(&num_gpus);
   if(num_gpus < 1) {
      printf("no CUDA capable devices were detected\n");
      return 1;
   }
   if(argc == 5 && !strcmp(argv[1], "-n") && !strcmp(argv[3], "-s")) {
      loop = atoi(argv[2]);
      size = atoi(argv[4]);
      omp_set_num_threads(num_gpus);  // create as many CPU threads as there are CUDA devices
      printf("---------------------------\n");
   } else {
      printf("parameter error\n");
      return 1;
   }
   num_cpu_threads = omp_get_num_threads();   // note: returns 1 outside a parallel region
   printf("number of host CPUs:\t%d\n", omp_get_num_procs());
   printf("number of CUDA devices:\t%d\n", num_gpus);

   int i;
   clock_t start, end;
   start = clock();
   // cpu_thread_id and gpu_id are globals, so they must be made private
   // to keep the threads from overwriting each other's values
   #pragma omp parallel for num_threads(num_gpus) private(cpu_thread_id, gpu_id)
   for(i = 0; i < loop; i++) {
      // set and check the CUDA device for this CPU thread
      cpu_thread_id = omp_get_thread_num();
      cudaSetDevice(cpu_thread_id % num_gpus);   // "% num_gpus" allows more CPU threads than GPU devices
      cudaGetDevice(&gpu_id);
      printf("The %i time(s) mul, CPU thread %d (of %d) uses CUDA device %d\n", i, omp_get_thread_num(), omp_get_num_threads(), gpu_id);
      int n = size;
      float *a, *b, *c, *d;
      a = (float*) malloc(sizeof(float) * n * n);
      b = (float*) malloc(sizeof(float) * n * n);
      c = (float*) malloc(sizeof(float) * n * n);
      d = (float*) malloc(sizeof(float) * n * n);   // unused in this version
      srand(0);   // note: rand()/srand() use global state shared by all threads
      matgen(a, n, n);
      matgen(b, n, n);
      matmultCUDA(a, n, b, n, c, n, n);
      free(a);
      free(b);
      free(d);
      free(c);
   }
   end = clock();
   double sec = (double) (end - start) / CLOCKS_PER_SEC;
   printf("count %i Time used: %.4lf   (%.2lf GFLOPS)\n", count, sec, 2.0 * loop * size * size * size / (sec * 1E9));
   cudaThreadExit();
   return 0;
}
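
I am not sure whether clock() is the right timer here, since it measures CPU time accumulated over all threads of the process rather than elapsed time. For reference, timing the same loop by wall clock with omp_get_wtime() would look roughly like this (t_wall is just a name I picked; the loop body stays as above):

   double t_wall = omp_get_wtime();   // wall-clock start
   #pragma omp parallel for num_threads(num_gpus) private(cpu_thread_id, gpu_id)
   for(i = 0; i < loop; i++) {
      /* ... same loop body as above ... */
   }
   double sec = omp_get_wtime() - t_wall;   // elapsed wall-clock seconds
   printf("count %i Time used: %.4lf   (%.2lf GFLOPS)\n", count, sec, 2.0 * loop * size * size * size / (sec * 1E9));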

How many GPUs do you have?