求解Ax=B的超定方程,使用cusolver库得到的结果与使用OpenCV中的cv::solve得到的结果不同,具体代码如下:
int linearSolverQR(cusolverDnHandle_t handle,int n,const float *Acopy,int lda,const float *b,float *x)
{
cublasHandle_t cublasHandle = NULL; // used in residual evaluation
int bufferSize = 0;
int bufferSize_geqrf = 0;
int bufferSize_ormqr = 0;
int *info = NULL;
float *buffer = NULL;
float *A = NULL;
float *tau = NULL;
int h_info = 0;
float start, stop;
float time_solve;
const float one = 1.0;
Check_Cuda_Error(cublasCreate(&cublasHandle));
Check_Cuda_Error(cusolverDnSgeqrf_bufferSize(handle, n, n, (float*)Acopy, lda, &bufferSize_geqrf));
Check_Cuda_Error(cusolverDnSormqr_bufferSize(handle,CUBLAS_SIDE_LEFT,CUBLAS_OP_T,n,1,n,A,lda,NULL,x,n,&bufferSize_ormqr));
bufferSize = (bufferSize_geqrf > bufferSize_ormqr) ? bufferSize_geqrf : bufferSize_ormqr;
Check_Cuda_Error(cudaMalloc(&info, sizeof(int)));
Check_Cuda_Error(cudaMalloc(&buffer, sizeof(float)*bufferSize));
Check_Cuda_Error(cudaMalloc(&A, sizeof(float)ldan));
Check_Cuda_Error(cudaMalloc((void**)&tau, sizeof(float)*n));
// prepare a copy of A because getrf will overwrite A with L
Check_Cuda_Error(cudaMemcpy(A, Acopy, sizeof(float)ldan, cudaMemcpyDeviceToDevice));
Check_Cuda_Error(cudaMemset(info, 0, sizeof(int)));
// compute QR factorization
Check_Cuda_Error(cusolverDnSgeqrf(handle, n, n, A, lda, tau, buffer, bufferSize, info));
Check_Cuda_Error(cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost));
if (0 != h_info)
fprintf(stderr, “Error: LU factorization failed\n”);
Check_Cuda_Error(cudaMemcpy(x, b, sizeof(float)*n, cudaMemcpyDeviceToDevice));
// compute Q^T*b
Check_Cuda_Error(cusolverDnSormqr(handle,CUBLAS_SIDE_LEFT,CUBLAS_OP_T,n,1,n,A,lda,tau,x,n,buffer,bufferSize,info));
// x = R \ Q^T*b
Check_Cuda_Error(cublasStrsm(cublasHandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,n,1,&one,A,lda,x,n));
Check_Cuda_Error(cudaDeviceSynchronize());
stop = second();
if (cublasHandle) { Check_Cuda_Error(cublasDestroy(cublasHandle)); }
if (info) { Check_Cuda_Error(cudaFree(info)); }
if (buffer) { Check_Cuda_Error(cudaFree(buffer)); }
if (A) { Check_Cuda_Error(cudaFree(A)); }
if (tau) { Check_Cuda_Error(cudaFree(tau)); }
return 0;
}
void main()
{
const int ciValidPointsNum = 30;
cv::FileStorage fs(“…/TestData/MatAB.xml”, cv::FileStorage::READ);
//CPU OpenCV solver
cv::Mat matA = cv::Mat::zeros(4, ciValidPointsNum, CV_32FC1);
cv::Mat matB1 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matB2 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matB3 = cv::Mat::zeros(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matB4 = cv::Mat::ones(ciValidPointsNum, 1, CV_32FC1);
cv::Mat matX1 = cv::Mat::zeros(4, 1, CV_32FC1);
cv::Mat matX2 = cv::Mat::zeros(4, 1, CV_32FC1);
cv::Mat matX3 = cv::Mat::zeros(4, 1, CV_32FC1);
cv::Mat matX4 = cv::Mat::zeros(4, 1, CV_32FC1);
fs[“MatA”] >> matA;
fs[“MatB1”] >> matB1;
fs[“MatB2”] >> matB2;
fs[“MatB3”] >> matB3;
fs.release();
matA.resize(ciValidPointsNum);
matB1.resize(ciValidPointsNum);
matB2.resize(ciValidPointsNum);
matB3.resize(ciValidPointsNum);
std::cout << matA << std::endl << matB1 << std::endl << matB2 << std::endl << matB3 << std::endl;
cv::Mat matACopy = matA.clone();
cv::Mat matB1Copy = matB1.clone();
cv::Mat matB2Copy = matB2.clone();
cv::Mat matB3Copy = matB3.clone();
cv::Mat matB4Copy = matB4.clone();
cusolverDnHandle_t handle = NULL;
cublasHandle_t cublasHandle = NULL;
cudaStream_t stream = NULL;
int rowsA = ciValidPointsNum;
int colsA = 4;
int lda = ciValidPointsNum;
float *d_A = NULL;
float *d_b1 = NULL;
float *d_b2 = NULL;
float *d_b3 = NULL;
float *d_b4 = NULL;
float *d_x1 = NULL;
float *d_x2 = NULL;
float *d_x3 = NULL;
float *d_x4 = NULL;
float* h_A = (float*)malloc(sizeof(float)ldacolsA);
float* h_b1 = (float*)malloc(sizeof(float)*rowsA);
float* h_b2 = (float*)malloc(sizeof(float)*rowsA);
float* h_b3 = (float*)malloc(sizeof(float)*rowsA);
float* h_b4 = (float*)malloc(sizeof(float)*rowsA);
float* h_x1 = (float*)malloc(sizeof(float)*colsA);
float* h_x2 = (float*)malloc(sizeof(float)*colsA);
float* h_x3 = (float*)malloc(sizeof(float)*colsA);
float* h_x4 = (float*)malloc(sizeof(float)*colsA);
matACopy = matACopy.t();
memcpy(h_A, matACopy.data, sizeof(float) * 4 * ciValidPointsNum);
memcpy(h_b1, matB1Copy.data, sizeof(float) * 1 * ciValidPointsNum);
memcpy(h_b2, matB2Copy.data, sizeof(float) * 1 * ciValidPointsNum);
memcpy(h_b3, matB3Copy.data, sizeof(float) * 1 * ciValidPointsNum);
memcpy(h_b4, matB4Copy.data, sizeof(float) * 1 * ciValidPointsNum);
Check_Cuda_Error(cudaMalloc((void **)&d_A, sizeof(float)ldacolsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b1, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b2, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b3, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_b4, sizeof(float)*rowsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x1, sizeof(float)*colsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x2, sizeof(float)*colsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x3, sizeof(float)*colsA));
Check_Cuda_Error(cudaMalloc((void **)&d_x4, sizeof(float)*colsA));
Check_Cuda_Error(cudaMemcpy(d_A, h_A, sizeof(float)ldacolsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b1, h_b1, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b2, h_b2, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b3, h_b3, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cudaMemcpy(d_b4, h_b4, sizeof(float)*rowsA, cudaMemcpyHostToDevice));
Check_Cuda_Error(cusolverDnCreate(&handle));
Check_Cuda_Error(cublasCreate(&cublasHandle));
Check_Cuda_Error(cudaStreamCreate(&stream));
linearSolverQR(handle, colsA, d_A, lda, d_b1, d_x1);
linearSolverQR(handle, colsA, d_A, lda, d_b2, d_x2);
linearSolverQR(handle, colsA, d_A, lda, d_b3, d_x3);
linearSolverQR(handle, colsA, d_A, lda, d_b4, d_x4);
Check_Cuda_Error(cudaMemcpy(h_x1, d_x1, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
Check_Cuda_Error(cudaMemcpy(h_x2, d_x2, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
Check_Cuda_Error(cudaMemcpy(h_x3, d_x3, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
Check_Cuda_Error(cudaMemcpy(h_x4, d_x4, sizeof(float)*colsA, cudaMemcpyDeviceToHost));
cv::Mat TestTi_GPU = cv::Mat::zeros(4, 4, CV_32FC1);
float * ptrTestTi_GPU = (float *)TestTi_GPU.data;
for (int i = 0; i < 4; i++)
{
ptrTestTi_GPU[0 * 4 + i] = static_cast(h_x1[i]);
ptrTestTi_GPU[1 * 4 + i] = static_cast(h_x2[i]);
ptrTestTi_GPU[2 * 4 + i] = static_cast(h_x3[i]);
ptrTestTi_GPU[3 * 4 + i] = static_cast(h_x4[i]);
}
std::cout << TestTi_GPU << std::endl;
}
[attach]18408[/attach][attach]18409[/attach][attach]18410[/attach][attach]18411[/attach]
[attach]18412[/attach]