//////////////////////////////////////////////////////////////////////////
// Utilities and system includes
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include "shrUtils.h"
#include "shrQATest.h"
#include "cutil_inline.h"
#include "cutil.h"
#include "LSSVM.h"
static char *sample = "LS-SVM";
#define THREAD_NUM 256 //
#define VECTOR_NUM 8 // number of the vector data
#define VECTOR_WIDTH 6 // width of the vector
float h_A[VECTOR_NUM][VECTOR_WIDTH]; // vector x0-x7
float h_B[VECTOR_WIDTH][VECTOR_NUM]; // vector x1-x8
float h_C[VECTOR_NUM][VECTOR_NUM]; // GPU Matrix RBF
float reference[VECTOR_NUM][VECTOR_NUM]; // CPU Matrix RBF
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void dataInit_A(); // initiate h_A to x0-x7
void dataInit_B(); // initiate h_B to x1-x8
// compare the result between GPU and CPU
shrBOOL compareDiff( float[][8] , float[][8] , const float );
// compute the RBF kernel via CPU
extern "C"
void cpuRBF( float[][8], float[][6], float[][8], unsigned int, unsigned int);
////////////////////////////////////////////////////////////////////////////////
// cudaLSSVM step 1: compute RBF Matrix
////////////////////////////////////////////////////////////////////////////////
// kernel function
__global__ static void RBF_Kernel(float* A, float* B,float* C)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int idx = bid*blockDim.x+tid;
const int row = idx / VECTOR_NUM;
const int col = idx % VECTOR_NUM;
//
if(row < VECTOR_NUM && col < VECTOR_NUM)
{
float t=0;
for(int i=0;i < VECTOR_WIDTH;i++)
{
t+=(A[row * VECTOR_WIDTH + i] - B[i * VECTOR_NUM + col]) * (B[i * VECTOR_NUM + col] - A[row * VECTOR_WIDTH + i]);
}
C[row*VECTOR_NUM+col] = expf(t);
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char* argv[])
{
shrQAStart(argc, argv);
printf("[ %s ]\n", sample);
printf("...Starting LS-SVM test...\n\n");
cutilDeviceInit(argc, argv);
int devID;
cudaDeviceProp props;
// get number of SMs on this GPU
cutilSafeCall(cudaGetDevice(&devID));
cutilSafeCall(cudaGetDeviceProperties(&props, devID));
// display the GPU
printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name, props.major, props.minor);
printf("< Global memory available on device in bytes: %d \n< Shared memory available per block in bytes: %d \n< Maximum number of threads per block: %d \n< Number of multiprocessors on device: %d \n ",
props.totalGlobalMem, props.sharedMemPerBlock, props.maxThreadsPerBlock, props.multiProcessorCount);
//Initiate the vector data
dataInit_A();
dataInit_B();
// allocate device memory
float* d_A, *d_B, *d_C;
size_t pitch_A, pitch_B, pitch_C;
cutilSafeCall(cudaMallocPitch((void**)&d_A,&pitch_A,VECTOR_WIDTH*sizeof(float),VECTOR_NUM));
cutilSafeCall(cudaMallocPitch((void**)&d_B,&pitch_B,VECTOR_NUM*sizeof(float),VECTOR_WIDTH));
cutilSafeCall(cudaMallocPitch((void**)&d_C,&pitch_C,VECTOR_NUM*sizeof(float),VECTOR_NUM));
// copy host memory to device
cutilSafeCall(cudaMemcpy2D(d_A,pitch_A,h_A,sizeof(float)*VECTOR_WIDTH,sizeof(float)*VECTOR_WIDTH,VECTOR_NUM,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy2D(d_B,pitch_B,h_B,sizeof(float)*VECTOR_NUM,sizeof(float)*VECTOR_NUM,VECTOR_WIDTH,cudaMemcpyHostToDevice));
// create and start timer
printf("Runing Kernels...\n\n");
unsigned int timer_LSSVM = 0;
// Start Timing
cutilCheckError(cutCreateTimer(&timer_LSSVM));
cutilCheckError(cutStartTimer(timer_LSSVM));
// setup execution parameters
int BLOCK_NUM=(VECTOR_NUM+THREAD_NUM-1)/THREAD_NUM;
RBF_Kernel<<<BLOCK_NUM*VECTOR_NUM, THREAD_NUM>>>(d_A,d_B,d_C);
// check if kernel execution generated and error
cutilCheckMsg("CUDA LSSVM Kernel execution failed");
cutilDeviceSynchronize();
// stop and destroy timer
cutilCheckError(cutStopTimer(timer_LSSVM));
double dSeconds = cutGetTimerValue(timer_LSSVM)/1000.0;
double dNumOps = 2.0 * (double)VECTOR_WIDTH * (double)VECTOR_NUM * (double)VECTOR_NUM;
double gflops = 1.0e-9 * dNumOps/dSeconds;
//Log througput
printf("> CUDA LSSVM Throughput = %.4f GFlop/s, Time = %.5f s, Size = %.0f Ops, ",
gflops, dSeconds, dNumOps);
cutilCheckError(cutDeleteTimer(timer_LSSVM));
// copy result from device to host
cutilSafeCall(cudaMemcpy2D(h_C,sizeof(float)*VECTOR_NUM,d_C,pitch_C,sizeof(float)*VECTOR_NUM,VECTOR_NUM,cudaMemcpyDeviceToHost));
// compute reference solution
printf("\nComparing GPU results with CPU...\n\n");
cpuRBF(reference, h_A, h_B, VECTOR_NUM, VECTOR_WIDTH);
// check result (compute RBF kernel)
printf("GPU Vs CPU\n");
shrBOOL resCUDA = compareDiff(reference, h_C, 1.0e-6f);
printf("CUDA RBF compares %s\n\n", (shrTRUE == resCUDA) ? "OK" : "FAIL");
// clean up memory
cutilSafeCall(cudaFree(d_A));
cutilSafeCall(cudaFree(d_B));
cutilSafeCall(cudaFree(d_C));
cutilDeviceReset();
shrQAFinishExit(argc, (const char **)argv, (resCUDA == shrTRUE) ? QA_PASSED : QA_FAILED);
return 0;
}
void dataInit_A()
{
FILE *fp1;
fp1=fopen("D:\\CUDASDK_40_win64\\C\\src\\BAO_LSSVM_1\\x0_x7.txt","r");
if (NULL==fp1)
{
printf("Error: can't open the file! \n");
}
// read data from the file
for(int i=0;i<VECTOR_NUM;i++)
for(int j=0;j<VECTOR_WIDTH;j++)
fscanf(fp1,"%f",&h_A[i][j]);
fclose(fp1);
}
void dataInit_B()
{
FILE *fp2;
fp2=fopen("D:\\CUDASDK_40_win64\\C\\src\\BAO_LSSVM_1\\x1_x8.txt","r");
if (NULL==fp2)
{
printf("Error: can't open the file! \n");
}
// read data from the file
for(int j=0;j<VECTOR_NUM;j++)
for(int i=0;i<VECTOR_WIDTH;i++)
fscanf(fp2,"%f",&h_B[i][j]);
fclose(fp2);
}
shrBOOL compareDiff( float reference[][8], float data[][8], const float epsilon )
{
ARGCHECK(epsilon >= 0);
float error = 0;
float ref = 0;
for(unsigned int i=0; i<VECTOR_NUM; i++)
{
for (unsigned int j=0; j<VECTOR_NUM; j++)
{
float diff = reference[i][j] - data[i][j];
error += diff * diff;
ref += reference[i][j] * reference[i][j];
}
}
float normRef = sqrtf(ref);
if (fabs(ref) < 1e-7)
{
#ifdef _DEBUG
std::cerr << "ERROR, reference l2-norm is 0\n";
#endif
return shrFALSE;
}
float normError = sqrtf(error);
error = normError / normRef;
bool result = error < epsilon;
#ifdef _DEBUG
if( ! result)
{
std::cerr << "ERROR, l2-norm error "
<< error << " is greater than epsilon " << epsilon << "\n";
}
#endif
return result ? shrTRUE : shrFALSE;
}