跪求帮助，不确定是内核函数的问题还是cudaMallocPitch&cudaMemcpy2D的使用问题

system · 2012 年2 月 10 日 12:20

1.先说一下我想做什么吧。这部分程序是想计算一个矩阵K，[attach]2668[/attach]x,y 都是6维的输入向量，比如x={1,2,3,4,5,6}；输入向量的个数是10000个(float型),但是我先打算实现9个输入向量(x0,x1,…,x8)的K矩阵；形成之后应该如下：[attach]2669[/attach] 2.然后是我打算的实现方法，我打算仿照矩阵乘法来实现；输入向量存在文件中(附件)，读入两个数组中h_A[8][6],h_B[6][8]; [attach]2670[/attach]X0…X8。都是6维向量。那么求K=h_C[8][8],结构：[attach]2672[/attach]每一个K(xj,xi)都是对应向量的每个元素差的二范数在求指数。 3. 代码主要放在cudaLSSVM.cu文件中LSSVM.cpp是CPU实现K矩阵（或者说RBF函数矩阵），用于对比数据误差；不知道什么问题，结果总是数据不对（与CPU计算结果相差太大）或者是少了一部分结果。代码如下：

没地方了在二楼吧首先是.cu文件，然后是cpp中的

system · 2012 年2 月 10 日 12:22

//////////////////////////////////////////////////////////////////////////
// Utilities and system includes
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

#include "shrUtils.h"
#include "shrQATest.h"
#include "cutil_inline.h"
#include "cutil.h"
#include "LSSVM.h"

static char *sample = "LS-SVM";

#define THREAD_NUM 256	 //

#define VECTOR_NUM   8   // number of the vector data
#define VECTOR_WIDTH 6   // width of the vector 

float h_A[VECTOR_NUM][VECTOR_WIDTH];		// vector x0-x7 
float h_B[VECTOR_WIDTH][VECTOR_NUM];		// vector x1-x8
float h_C[VECTOR_NUM][VECTOR_NUM];			// GPU Matrix RBF 
float reference[VECTOR_NUM][VECTOR_NUM];	// CPU Matrix RBF

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void dataInit_A();		// initiate h_A to x0-x7
void dataInit_B();		// initiate h_B to x1-x8

// compare the result between GPU and CPU
shrBOOL compareDiff( float[][8] , float[][8] , const float );

// compute the RBF kernel via CPU
extern "C"
void cpuRBF( float[][8], float[][6], float[][8], unsigned int, unsigned int);

////////////////////////////////////////////////////////////////////////////////
// cudaLSSVM step 1: compute RBF Matrix
////////////////////////////////////////////////////////////////////////////////

// kernel function
__global__ static void RBF_Kernel(float* A, float* B,float* C)
{
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	const int idx = bid*blockDim.x+tid;

	const int row = idx / VECTOR_NUM;
	const int col = idx % VECTOR_NUM;

// 
	if(row < VECTOR_NUM && col < VECTOR_NUM)
	{
		float t=0;
		for(int i=0;i < VECTOR_WIDTH;i++)
		{
			t+=(A[row * VECTOR_WIDTH + i] - B[i * VECTOR_NUM + col]) * (B[i * VECTOR_NUM + col] - A[row * VECTOR_WIDTH + i]);

		}

		C[row*VECTOR_NUM+col] = expf(t);
	}

}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char* argv[])
{
	shrQAStart(argc, argv);
	printf("[ %s ]\n", sample);

	printf("...Starting LS-SVM test...\n\n");

	cutilDeviceInit(argc, argv);
	int devID;
	cudaDeviceProp props;

	// get number of SMs on this GPU
	cutilSafeCall(cudaGetDevice(&devID));
	cutilSafeCall(cudaGetDeviceProperties(&props, devID));

	// display the GPU
	printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name, props.major, props.minor);
	printf("< Global memory available on device in bytes: %d \n< Shared memory available per block in bytes: %d \n< Maximum number of threads per block: %d \n< Number of multiprocessors on device: %d \n ", 
		props.totalGlobalMem, props.sharedMemPerBlock, props.maxThreadsPerBlock, props.multiProcessorCount);
	
	//Initiate the vector data
	dataInit_A();
	dataInit_B();

	// allocate device memory
	float* d_A, *d_B, *d_C;
	size_t pitch_A, pitch_B, pitch_C;

	cutilSafeCall(cudaMallocPitch((void**)&d_A,&pitch_A,VECTOR_WIDTH*sizeof(float),VECTOR_NUM));
	cutilSafeCall(cudaMallocPitch((void**)&d_B,&pitch_B,VECTOR_NUM*sizeof(float),VECTOR_WIDTH));
	cutilSafeCall(cudaMallocPitch((void**)&d_C,&pitch_C,VECTOR_NUM*sizeof(float),VECTOR_NUM));

	// copy host memory to device
	cutilSafeCall(cudaMemcpy2D(d_A,pitch_A,h_A,sizeof(float)*VECTOR_WIDTH,sizeof(float)*VECTOR_WIDTH,VECTOR_NUM,cudaMemcpyHostToDevice));
	cutilSafeCall(cudaMemcpy2D(d_B,pitch_B,h_B,sizeof(float)*VECTOR_NUM,sizeof(float)*VECTOR_NUM,VECTOR_WIDTH,cudaMemcpyHostToDevice));

	// create and start timer
	printf("Runing Kernels...\n\n");
	unsigned int timer_LSSVM = 0;

	// Start Timing	
	cutilCheckError(cutCreateTimer(&timer_LSSVM));
	cutilCheckError(cutStartTimer(timer_LSSVM));

	// setup execution parameters
	int BLOCK_NUM=(VECTOR_NUM+THREAD_NUM-1)/THREAD_NUM;

	RBF_Kernel<<<BLOCK_NUM*VECTOR_NUM, THREAD_NUM>>>(d_A,d_B,d_C);

	// check if kernel execution generated and error
	cutilCheckMsg("CUDA LSSVM Kernel execution failed");

	cutilDeviceSynchronize();

	// stop and destroy timer
	cutilCheckError(cutStopTimer(timer_LSSVM));

	double dSeconds = cutGetTimerValue(timer_LSSVM)/1000.0;
	double dNumOps = 2.0 * (double)VECTOR_WIDTH * (double)VECTOR_NUM * (double)VECTOR_NUM;
	double gflops = 1.0e-9 * dNumOps/dSeconds;

	//Log througput
	printf("> CUDA LSSVM Throughput = %.4f GFlop/s, Time = %.5f s, Size = %.0f Ops, ", 
		gflops, dSeconds, dNumOps);

	cutilCheckError(cutDeleteTimer(timer_LSSVM));

	// copy result from device to host
	cutilSafeCall(cudaMemcpy2D(h_C,sizeof(float)*VECTOR_NUM,d_C,pitch_C,sizeof(float)*VECTOR_NUM,VECTOR_NUM,cudaMemcpyDeviceToHost));

	// compute reference solution
	printf("\nComparing GPU results with CPU...\n\n");

	cpuRBF(reference, h_A, h_B, VECTOR_NUM, VECTOR_WIDTH);

	// check result (compute RBF kernel)
	printf("GPU Vs CPU\n");
 	shrBOOL resCUDA = compareDiff(reference, h_C, 1.0e-6f);

 	printf("CUDA RBF compares %s\n\n", (shrTRUE == resCUDA) ? "OK" : "FAIL");

	// clean up memory

	cutilSafeCall(cudaFree(d_A));
	cutilSafeCall(cudaFree(d_B));
	cutilSafeCall(cudaFree(d_C));

	cutilDeviceReset();

	shrQAFinishExit(argc, (const char **)argv, (resCUDA == shrTRUE) ? QA_PASSED : QA_FAILED);

	return 0;
}

void dataInit_A()
{
	FILE *fp1;
	fp1=fopen("D:\\CUDASDK_40_win64\\C\\src\\BAO_LSSVM_1\\x0_x7.txt","r");
	if (NULL==fp1)
	{
		printf("Error: can't open the file! \n");
	}

	// read data from the file
	for(int i=0;i<VECTOR_NUM;i++)
		for(int j=0;j<VECTOR_WIDTH;j++)
			fscanf(fp1,"%f",&h_A[i][j]);
	fclose(fp1);
}

void dataInit_B()
{
	FILE *fp2;
	fp2=fopen("D:\\CUDASDK_40_win64\\C\\src\\BAO_LSSVM_1\\x1_x8.txt","r");
	if (NULL==fp2)
	{
		printf("Error: can't open the file! \n");
	}
	// read data from the file
	for(int j=0;j<VECTOR_NUM;j++)
		for(int i=0;i<VECTOR_WIDTH;i++)
			fscanf(fp2,"%f",&h_B[i][j]);
	fclose(fp2);
}


shrBOOL compareDiff( float reference[][8], float data[][8], const float epsilon )
{

	ARGCHECK(epsilon >= 0);

	float error = 0;
	float ref   = 0;

	for(unsigned int i=0; i<VECTOR_NUM; i++)
	{
		for (unsigned int j=0; j<VECTOR_NUM; j++)
			{
				float diff = reference[i][j] - data[i][j];
				error += diff * diff;
				ref += reference[i][j] * reference[i][j];
			}
	}
	
	float normRef = sqrtf(ref);
	if (fabs(ref) < 1e-7)
	{
#ifdef _DEBUG

		std::cerr << "ERROR, reference l2-norm is 0\n";
#endif
		return shrFALSE;
	}

	float normError = sqrtf(error);
	error = normError / normRef;
	bool result = error < epsilon;
#ifdef _DEBUG
	if( ! result) 
	{
		std::cerr << "ERROR, l2-norm error " 
			<< error << " is greater than epsilon " << epsilon << "\n";
	}
#endif
	
	 return result ? shrTRUE : shrFALSE;
}

system · 2012 年2 月 10 日 12:24

////////////////////////////////////////////////////////////////////////////////
// export C interface
extern "C"
void cpuRBF( float[][8], float[][6], float[][8], unsigned int, unsigned int);

////////////////////////////////////////////////////////////////////////////////
void cpuRBF(float cpu_C[][8], float cpu_A[][6], float cpu_B[][8], unsigned int Height, unsigned int Width)
{
	for (unsigned int i = 0; i < Height; i++)
		for (unsigned int j = 0; j < Height; j++)
		{
			double k_RBF = 0;
			for (unsigned int k = 0; k < Width; k++)
			{
				double a = cpu_A[i][k];
				double b = cpu_B[k][j];
				k_RBF += (a - b) * (b - a);
			}
			cpu_C[i][j] = expf((float)k_RBF);
		}
}

system · 2012 年2 月 10 日 12:26

求助各位了，小弟先谢过了。

感觉问题集中在内核函数，不清楚啊~
数据少一部分，或者数组后面的数据重复。。。:Q

system · 2012 年2 月 11 日 12:36

你可以将处理的规模缩小，
调试跟踪一下啊
用cuda-gdb或nsight。

system · 2012 年2 月 13 日 06:30

解决了