最近学习CUDA C的编程,在并行运行一个简单的解调算法的时候,统计时间后发现运行速度越来越慢(但还是运算结果正确的),后来简化到只运行其中一个核函数的时候,就算复杂度下降了,但还是会越跑越慢,尝试过每一轮都进行cudafree和cudamalloc也没用,这是为什么呢?
刚刚还发现了,对于我用过的一些矩阵加法、矩阵点乘的教程例子,加上for循环跑很多很多次,也是会出现这种越来越慢的情况。
环境:
win10 Visual studio2019 community
cuda 10.2
cudnn 7.6.5
用几年前的笔记本进行测试——显卡950M
:'(跪求各路大神救救了
简化后作为测试的代码如下:
#include <stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include "..\common\book.h"
#include <math.h>
#include <time.h>
#define BLOCK_NUM 16
#define THREAD_NUM 16
#define R_SIZE 256 //256
#define M_SIZE R_SIZE*R_SIZE //256*256
#define SNR_LEN 7
#define N 100
#define pi 3.1415926535
double SNR[SNR_LEN] = { 0,1,2,3,4,5,6 };
__global__ void DeModuate(double* ReceivedSignal_R, short int* demodSignal_HD, double sigma)
{
const int row = blockIdx.x * THREAD_NUM + threadIdx.x;
double P_temp0, P_temp1, demodSignal_LLR;
for (int i = 0; i < R_SIZE; i++)
{
demodSignal_LLR = 4 * ReceivedSignal_R[row * R_SIZE + i] / sigma;
demodSignal_HD[row * R_SIZE + i] = (demodSignal_LLR > 0 ? 1 : 0);
}
}
int main(int arc, char* argv[])
{
//GPU declaration
double * Dev_ReceivedSignal_R;
short int* Dev_demodSignal_HD;
//GPU malloc
HANDLE_ERROR(cudaMalloc((void**)&Dev_ReceivedSignal_R, sizeof(double) * M_SIZE));
HANDLE_ERROR(cudaMalloc((void**)&Dev_demodSignal_HD, sizeof(short int) * M_SIZE));
//time_counting declaration
clock_t start, end;
double DeMod_time = 0;
int DeMod_time_temp;
DeMod_time_temp = 0;
for (int s = 0; s < SNR_LEN; s++)
{
err_Uncoded = 0;
for (int frame = 0; frame < N; frame++)
{
start = clock();
DeModuate << <BLOCK_NUM, THREAD_NUM >> > (Dev_ReceivedSignal_R, Dev_demodSignal_HD, sigma);
end = clock();
DeMod_time_temp += (end - start);
DeMod_time = (double)DeMod_time_temp / CLK_TCK;
printf("SNR = %1.1f, %4d/%d sim finished, ori_err = %4d, time: %f\n", SNR[s], frame, N, err_Uncoded_temp, DeMod_time);
}
}
for (int s = 0; s < SNR_LEN; s++)
{
printf("SNR = %1.1f dB,BER_Uncoded= %1.10f;\n", SNR[s], BER_Uncoded[s]);
}
//GPU free
cudaFree(Dev_demodSignal_HD);
cudaFree(Dev_ReceivedSignal_R);
return 0;
}