#include <stdlib.h>
#include <stdio.h>
#include <windows.h>
#include<iostream>
#include <time.h>
#include <math.h>
#include <curand.h>
#include <curand_kernel.h>
#include <cuda_gl_interop.h>
#include <cuda_runtime.h>
using namespace std;
#define FeaturesNum1 5
#define THREDANUM1 32
#define M_PI1 3.14159265358979323846
#define PNum1 24
#define PDim1 17
__constant__ int pso_lanmark_GPU[FeaturesNum1][3];
__constant__ double Featuresinfo_GPU[FeaturesNum1][3];
__constant__ double alpas_GPU[FeaturesNum1][3][10];
__constant__ double pose_Xup_GPU[17];
__constant__ double pose_Xdown_GPU[17];
__constant__ double Vmax_GPU[17];
__device__ double myrand1(unsigned int &x,unsigned int &y,unsigned int &z,unsigned int &c,int tidx)
{
unsigned long long t, A = 698769069ULL;
x = 69069*x+1234+tidx*1000;
y ^= (y<<13+tidx);
y ^= (y>>17+tidx);
y ^= (y<<5+tidx);
t = (A*z + c);
c = (t >> 32);
z = t;
return (double)((x+y+z)/RAND_MAX)/RAND_MAX/4;
}
__global__ void ParticleFly_GPU(double* CudaDat ,double* data_out,int isize,int circulate)
{
__shared__ double Fit[PNum1];
__shared__ int GBestIndex ;
int tidx = threadIdx.x;
unsigned int x;
unsigned int y;
unsigned int z;
unsigned int c;
x=123456789;
y=362436000;
z=521288629;
c=7654321;
__shared__ double test[PNum1];
__shared__ double Wmax ;
__shared__ double Wmin ;
__shared__ double W;
__shared__ double C1;
__shared__ double C2;
__shared__ double optical_center[2];
Wmax = 0.9;
Wmin = 0.4;
W = 1;
C1 = 2;
C2 = 2;
optical_center[0] = 172.766;
optical_center[1] = 306.3046;
for(int k = 0;k<circulate;k++)
{
if(tidx < PNum1)
{
if( k <= 1500)
W = Wmax - k*(Wmax-Wmin)/circulate;
else
W = Wmin;
for(int j=0; j<PDim1; j++) //合并访问优化的问题
{
*(CudaDat + (PDim1 + j)*PNum1 + tidx) = W*(*(CudaDat + (PDim1 + j)*PNum1 + tidx)) +//修改速度
myrand1(x,y,z,c,tidx)*C1*(*(CudaDat + (2*PDim1 + j)*PNum1 + tidx) -(*(CudaDat + j*PNum1 + tidx)) )+
myrand1(x,y,z,c,tidx)*C2*(*(CudaDat + (2*PDim1 + j)*PNum1 + GBestIndex) -(*(CudaDat + j*PNum1 + tidx)));
if(*(CudaDat + (PDim1 + j)*PNum1 + tidx) > Vmax_GPU[j]) *(CudaDat + (PDim1 + j)*PNum1 + tidx) = Vmax_GPU[j];
if(CudaDat[(PDim1 + j)*PNum1 + tidx]<-Vmax_GPU[j]) *(CudaDat + (PDim1 + j)*PNum1 + tidx) = -Vmax_GPU[j];
*(CudaDat + j*PNum1 + tidx) += (*(CudaDat + (PDim1 + j)*PNum1 + tidx)); //修改坐标
if(CudaDat[j*PNum1 + tidx]>pose_Xup_GPU[j])
CudaDat[j*PNum1 + tidx]=pose_Xup_GPU[j];//保护
if(CudaDat[j*PNum1 + tidx]<pose_Xdown_GPU[j])
CudaDat[j*PNum1 + tidx]=pose_Xdown_GPU[j];
}
for(int i = 0;i<17;i++)
test = CudaDat[i*PNum1];
//////////////////////////////////////GetFit////////////////////////////////
double cost = 0;
double angle_cos[3],angle_sin[3];
angle_cos[0] = cos(*(CudaDat + 3*PNum1 + tidx));
angle_cos[1] = cos(*(CudaDat + 4*PNum1 + tidx));
angle_cos[2] = cos(*(CudaDat + 5*PNum1 + tidx));
angle_sin[0] = sin(*(CudaDat + 3*PNum1 + tidx));
angle_sin[1] = sin(*(CudaDat + 4*PNum1 + tidx));
angle_sin[2] = sin(*(CudaDat + 5*PNum1 + tidx));
test[1] = *(CudaDat + 3*PNum1 + tidx);
for(int i=0; i< FeaturesNum1; i++)
{
double alpasx = 0,alpasy =0,alpasz=0;
for(int j=0; j<10; j++)
{
alpasx += *(CudaDat + (j+7)*PNum1 + tidx)*alpas_GPU[0][j];
alpasy += *(CudaDat + (j+7)*PNum1 + tidx)*alpas_GPU[1][j];
alpasz += *(CudaDat + (j+7)*PNum1 + tidx)*alpas_GPU[2][j];
}
double xx = Featuresinfo_GPU[0] + alpasx;
double xy = Featuresinfo_GPU[1] + alpasy;
double xz = Featuresinfo_GPU[2] + alpasz;
/*double xx = Featuresinfo[0];
double xy = Featuresinfo[1];
double xz = Featuresinfo[2];
double wx = xx*angle_cos[2]*angle_cos[1] + xy*(angle_cos[2]*angle_sin[1]*angle_sin[0]-angle_sin[2]*angle_cos[0])
+ xz*(angle_cos[2]*angle_sin[1]*angle_cos[0]+angle_sin[2]*angle_sin[0]) +*(CudaDat+ tidx);
double wy = xx*angle_sin[2]*angle_cos[1] + xy*(angle_sin[2]*angle_sin[1]*angle_sin[0]+angle_cos[2]*angle_cos[0])
+ xz*(angle_sin[2]*angle_sin[1]*angle_cos[0]-angle_cos[2]*angle_sin[0]) + *(CudaDat + 1*PNum1 + tidx);
double wz = -xx*angle_sin[1] + xy*angle_cos[1]*angle_sin[0] +xz*angle_cos[1]*angle_cos[0] + *(CudaDat + 2*PNum1 + tidx);
double px = optical_center[0] + *(CudaDat + 6*PNum1 + tidx)*(wx/wz) - (double)pso_lanmark_GPU[0];
double py = optical_center[1] - *(CudaDat + 6*PNum1 + tidx)*(wy/wz) - (double)pso_lanmark_GPU[1];
cost = cost + px*px + py*py;
//cost = cost + abs(px)+abs(py);
}
Fit[tidx] = -1*cost; //Fit得到全部一样的值
if( Fit[tidx] >= *(CudaDat + (3*PDim1+1)*PNum1 + tidx))
{
*(CudaDat + (3*PDim1+1)*PNum1 + tidx) = Fit[tidx];
for(int j=0; j<PDim1; j++)
*(CudaDat + (2*PDim1+j)*PNum1 + tidx) = *(CudaDat + j*PNum1 + tidx);
}
}
__syncthreads();
if(tidx < PNum1)
{
if(tidx == 0) //用第一个线程计算GBestIndex
{
GBestIndex = 0;
for(int i=0; i<PNum1; i++)
if(*(CudaDat + (3*PDim1+1)*PNum1 + i) >= *(CudaDat + (3*PDim1+1)*PNum1 + GBestIndex) && i!=GBestIndex) GBestIndex = i;
}
}
}
__syncthreads();
if(tidx == 0 )
{
for(int i = 0;i<isize;i++)
{
//data_out = CudaDat[GBestIndex*isize + i];
data_out = *(CudaDat + i*PNum1 + GBestIndex);
}
}
}
extern "C"
clock_t ParticleFly_CPU(double* CudaDat,double* data_out,int isize,int circulate,int pso_lanmark1[5][3])
{
double Featuresinfo1[FeaturesNum1][3]={{-31.1956,34.2044,94.8308},{30.3130,34.2285,94.5579},{0.0072,10.1444,128.9015},{-29.3909,-33.8801, 98.1061},{29.1029,-33.5093,97.8227}};
double alpas1[FeaturesNum1][3][10]=.......................................
double pose_Xup1[] = { 450, 450,1000, M_PI1/2, M_PI1/2, M_PI1/2, 900, 0.002653, 0.0016677, 0.0013104, 0.0009396, 0.0008288, 0.0006270, 0.0005781, 0.0005434, 0.0005332, 0.0004848}; //自变量上界
double pose_Xdown1[] = {-450,-450,1000,-M_PI1/2,-M_PI1/2,-M_PI1/2,-900,-0.002653, -0.0016677,-0.0013104,-0.0009396,-0.0008288,-0.0006270,-0.0005781,-0.0005434,-0.0005332,-0.0004848}; //自变量下界
double Vmax[PDim1];
for(int i=0; i<PDim1; i++)
Vmax = (pose_Xup1-pose_Xdown1)*0.1;
double* CudaDat_GPU;
double* data_out_GPU;
clock_t start = clock();
cudaMalloc((void**) &CudaDat_GPU,sizeof(double)*PNum1*isize);
cudaMalloc((void**) &data_out_GPU,sizeof(double)*isize);
cudaMemset(CudaDat_GPU, 0, sizeof(double)*PNum1*isize);
cudaMemset(data_out, 0, sizeof(double)*isize);
clock_t start1 = clock();
cudaMemcpy(CudaDat_GPU,CudaDat,sizeof(double)*PNum1*isize,cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( pso_lanmark_GPU, pso_lanmark1,
sizeof(int) *5*3);
cudaMemcpyToSymbol( Featuresinfo_GPU, Featuresinfo1,
sizeof(double) * 5*3);
cudaMemcpyToSymbol( alpas_GPU, alpas1,
sizeof(int) *5* 3*10);
cudaMemcpyToSymbol( pose_Xup_GPU, pose_Xup1,
sizeof(double) * 17);
cudaMemcpyToSymbol( pose_Xdown_GPU, pose_Xdown1,
sizeof(double) * 17);
cudaMemcpyToSymbol( Vmax_GPU, Vmax,
sizeof(double) * 17);
clock_t end1 = clock();
ParticleFly_GPU<<<1,THREDANUM1>>>(CudaDat_GPU,data_out_GPU,isize,circulate);
cudaThreadSynchronize();
clock_t end2 = clock();
cudaMemcpy(data_out,data_out_GPU,sizeof(double)*isize,cudaMemcpyDeviceToHost);
clock_t end = clock();
cudaFree(CudaDat_GPU);
cudaFree(data_out_GPU);
cout<<start1 - start<<" ------- "<<end1 - start1<<"------"<<end2-end1<<"------"<<end-end2<<endl;
return end - start;
}
最近在对一个PSO算法进行进行CUDA编程优化,我的配置是pentium dual coreE5300 2.6FHZ,GTX650TI显卡,这个CPU函数调用四次,通过clock()函数测量时间,得到cudaMalloc 部分第一次时间消耗70 毫秒,核函数时间消耗200毫秒左右,对于传入数据,我已经做了一些合并访问的修改,性能提高了2倍左右(以前核函数400毫秒)。
我想问的是-------
对与这个函数,还有哪些地方可以改进的地方,我觉得这个运行速度还有很大的提升空间,自己刚刚学校CUDA不久,跪求指点一二!!!!!