cusolver的求解结果与OpenCV中cv::solve不同

求解Ax=B的超定方程,使用cusolver库得到的结果与使用OpenCV中的cv::solve得到的结果不同,具体代码如下:

int linearSolverQR(cusolverDnHandle_t handle,int n,const float *Acopy,int lda,const float *b,float *x)

{

cublasHandle_t cublasHandle = NULL; // used in residual evaluation

int bufferSize = 0;

int bufferSize_geqrf = 0;

int bufferSize_ormqr = 0;

int *info = NULL;

float *buffer = NULL;

float *A = NULL;

float *tau = NULL;

int h_info = 0;

float start, stop;

float time_solve;

const float one = 1.0;

Check_Cuda_Error(cublasCreate(&cublasHandle));

Check_Cuda_Error(cusolverDnSgeqrf_bufferSize(handle, n, n, (float*)Acopy, lda, &bufferSize_geqrf));

Check_Cuda_Error(cusolverDnSormqr_bufferSize(handle,CUBLAS_SIDE_LEFT,CUBLAS_OP_T,n,1,n,A,lda,NULL,x,n,&bufferSize_ormqr));

bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf : bufferSize_ormqr;

Check_Cuda_Error(cudaMalloc(&info, sizeof(int)));

Check_Cuda_Error(cudaMalloc(&buffer, sizeof(float)*bufferSize));

Check_Cuda_Error(cudaMalloc(&A, sizeof(float)ldan));

Check_Cuda_Error(cudaMalloc((void**)&tau, sizeof(float)*n));

// prepare a copy of A because getrf will overwrite A with L

Check_Cuda_Error(cudaMemcpy(A, Acopy, sizeof(float)ldan, cudaMemcpyDeviceToDevice));

Check_Cuda_Error(cudaMemset(info, 0, sizeof(int)));

// compute QR factorization

Check_Cuda_Error(cusolverDnSgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));

Check_Cuda_Error(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));

if (0 != h_info)

fprintf(stderr, “Error: LU factorization failed\n”);

Check_Cuda_Error(cudaMemcpy(x, b, sizeof(float)*n, cudaMemcpyDeviceToDevice));

// compute Q^T*b

Check_Cuda_Error(cusolverDnSormqr(handle,CUBLAS_SIDE_LEFT,CUBLAS_OP_T,n,1,n,A,lda,tau,x,n,buffer,bufferSize,info));

// x = R \ Q^T*b

Check_Cuda_Error(cublasStrsm(cublasHandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,n,1,&one,A,lda,x,n));

Check_Cuda_Error(cudaDeviceSynchronize());

stop = second();

if (cublasHandle) { Check_Cuda_Error(cublasDestroy(cublasHandle)); }

if (info) { Check_Cuda_Error(cudaFree(info)); }

if (buffer) { Check_Cuda_Error(cudaFree(buffer)); }

if (A) { Check_Cuda_Error(cudaFree(A)); }

if (tau) { Check_Cuda_Error(cudaFree(tau)); }

return 0;

}

void main()

{

const int ciValidPointsNum = 30;

cv::FileStorage fs(“…/TestData/MatAB.xml”, cv::FileStorage::READ);

//CPU OpenCV solver

cv::Mat matA = cv::Mat::zeros(4, ciValidPointsNum, CV_32FC1);

cv::Mat matB1 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);

cv::Mat matB2 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);

cv::Mat matB3 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);

cv::Mat matB4 = cv::Mat::ones(ciValidPointsNum, 1, CV_32FC1);

cv::Mat matX1 = cv::Mat::zeros(4, 1, CV_32FC1);

cv::Mat matX2 = cv::Mat::zeros(4, 1, CV_32FC1);

cv::Mat matX3 = cv::Mat::zeros(4, 1, CV_32FC1);

cv::Mat matX4 = cv::Mat::zeros(4, 1, CV_32FC1);

fs[“MatA”] >> matA;

fs[“MatB1”] >> matB1;

fs[“MatB2”] >> matB2;

fs[“MatB3”] >> matB3;

fs.release();

matA.resize(ciValidPointsNum);

matB1.resize(ciValidPointsNum);

matB2.resize(ciValidPointsNum);

matB3.resize(ciValidPointsNum);

std::cout << matA << std::endl << matB1 << std::endl << matB2 << std::endl << matB3 << std::endl;

cv::Mat matACopy = matA.clone();

cv::Mat matB1Copy = matB1.clone();

cv::Mat matB2Copy = matB2.clone();

cv::Mat matB3Copy = matB3.clone();

cv::Mat matB4Copy = matB4.clone();

cusolverDnHandle_t handle = NULL;

cublasHandle_t cublasHandle = NULL;

cudaStream_t stream = NULL;

int rowsA = ciValidPointsNum;

int colsA = 4;

int lda = ciValidPointsNum;

float *d_A = NULL;

float *d_b1 = NULL;

float *d_b2 = NULL;

float *d_b3 = NULL;

float *d_b4 = NULL;

float *d_x1 = NULL;

float *d_x2 = NULL;

float *d_x3 = NULL;

float *d_x4 = NULL;

float* h_A = (float*)malloc(sizeof(float)ldacolsA);

float* h_b1 = (float*)malloc(sizeof(float)*rowsA);

float* h_b2 = (float*)malloc(sizeof(float)*rowsA);

float* h_b3 = (float*)malloc(sizeof(float)*rowsA);

float* h_b4 = (float*)malloc(sizeof(float)*rowsA);

float* h_x1 = (float*)malloc(sizeof(float)*colsA);

float* h_x2 = (float*)malloc(sizeof(float)*colsA);

float* h_x3 = (float*)malloc(sizeof(float)*colsA);

float* h_x4 = (float*)malloc(sizeof(float)*colsA);

matACopy = matACopy.t();

memcpy(h_A, matACopy.data, sizeof(float) * 4 * ciValidPointsNum);

memcpy(h_b1, matB1Copy.data, sizeof(float) * 1 * ciValidPointsNum);

memcpy(h_b2, matB2Copy.data, sizeof(float) * 1 * ciValidPointsNum);

memcpy(h_b3, matB3Copy.data, sizeof(float) * 1 * ciValidPointsNum);

memcpy(h_b4, matB4Copy.data, sizeof(float) * 1 * ciValidPointsNum);

Check_Cuda_Error(cudaMalloc((void **)&d_A, sizeof(float)ldacolsA));

Check_Cuda_Error(cudaMalloc((void **)&d_b1, sizeof(float)*rowsA));

Check_Cuda_Error(cudaMalloc((void **)&d_b2, sizeof(float)*rowsA));

Check_Cuda_Error(cudaMalloc((void **)&d_b3, sizeof(float)*rowsA));

Check_Cuda_Error(cudaMalloc((void **)&d_b4, sizeof(float)*rowsA));

Check_Cuda_Error(cudaMalloc((void **)&d_x1, sizeof(float)*colsA));

Check_Cuda_Error(cudaMalloc((void **)&d_x2, sizeof(float)*colsA));

Check_Cuda_Error(cudaMalloc((void **)&d_x3, sizeof(float)*colsA));

Check_Cuda_Error(cudaMalloc((void **)&d_x4, sizeof(float)*colsA));

Check_Cuda_Error(cudaMemcpy(d_A, h_A, sizeof(float)ldacolsA, cudaMemcpyHostToDevice));

Check_Cuda_Error(cudaMemcpy(d_b1, h_b1, sizeof(float)*rowsA, cudaMemcpyHostToDevice));

Check_Cuda_Error(cudaMemcpy(d_b2, h_b2, sizeof(float)*rowsA, cudaMemcpyHostToDevice));

Check_Cuda_Error(cudaMemcpy(d_b3, h_b3, sizeof(float)*rowsA, cudaMemcpyHostToDevice));

Check_Cuda_Error(cudaMemcpy(d_b4, h_b4, sizeof(float)*rowsA, cudaMemcpyHostToDevice));

Check_Cuda_Error(cusolverDnCreate(&handle));

Check_Cuda_Error(cublasCreate(&cublasHandle));

Check_Cuda_Error(cudaStreamCreate(&stream));

linearSolverQR(handle, colsA, d_A, lda, d_b1, d_x1);

linearSolverQR(handle, colsA, d_A, lda, d_b2, d_x2);

linearSolverQR(handle, colsA, d_A, lda, d_b3, d_x3);

linearSolverQR(handle, colsA, d_A, lda, d_b4, d_x4);

Check_Cuda_Error(cudaMemcpy(h_x1, d_x1, sizeof(float)*colsA, cudaMemcpyDeviceToHost));

Check_Cuda_Error(cudaMemcpy(h_x2, d_x2, sizeof(float)*colsA, cudaMemcpyDeviceToHost));

Check_Cuda_Error(cudaMemcpy(h_x3, d_x3, sizeof(float)*colsA, cudaMemcpyDeviceToHost));

Check_Cuda_Error(cudaMemcpy(h_x4, d_x4, sizeof(float)*colsA, cudaMemcpyDeviceToHost));

cv::Mat TestTi_GPU = cv::Mat::zeros(4, 4, CV_32FC1);

float * ptrTestTi_GPU = (float *)TestTi_GPU.data;

for (int i = 0; i < 4; i++)

{

ptrTestTi_GPU[0 * 4 + i] = static_cast(h_x1[i]);

ptrTestTi_GPU[1 * 4 + i] = static_cast(h_x2[i]);

ptrTestTi_GPU[2 * 4 + i] = static_cast(h_x3[i]);

ptrTestTi_GPU[3 * 4 + i] = static_cast(h_x4[i]);

}

std::cout << TestTi_GPU << std::endl;

}

[attach]18408[/attach][attach]18409[/attach][attach]18410[/attach][attach]18411[/attach]

[attach]18412[/attach]