用cusolver库做QR分解到100w阶矩阵出问题了

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include
#include
#include <cusolverSp.h>
#include <cuda_runtime_api.h>
using namespace std;

#define imin( x, y ) ((x)<(y))? (x) : (y)

int main()
{
cusolverSpHandle_t cusolverH = NULL;

//GPU 批量处理 QR
csrqrInfo_t info = NULL;
cusparseMatDescr_t descrA = NULL;

cusparseStatus_t cusparse_status = CUSPARSE_STATUS_SUCCESS;
cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;

//int n = 4; int nnzA = 7;
int n = 1000000; int nnzA = 2999998;
//int n = 5; int nnzA = 9;

int* h_csrRowPtrA = new int[n + 1];
int* h_csrColIndA = new int[nnzA];
double* h_csrValA = new double[nnzA];
double* h_b = new double[n]; // batchSize * m
double* h_x = new double[n]; // batchSize * m

size_t size_qr = 0;
size_t size_internal = 0;
void* buffer_qr = NULL; // QR 分解的工作空间

/*
//矩阵A的 CSR 格式
h_csrRowPtrA[0] = 1; h_csrRowPtrA[1] = 2; h_csrRowPtrA[2] = 3; h_csrRowPtrA[3] = 4; h_csrRowPtrA[4] = 8;
h_csrColIndA[0] = 1; h_csrColIndA[1] = 2; h_csrColIndA[2] = 3; h_csrColIndA[3] = 1; h_csrColIndA[4] = 2; h_csrColIndA[5] = 3; h_csrColIndA[6] = 4;
h_csrValA[0] = 1.0 ; h_csrValA[1] = 2.0 ; h_csrValA[2] = 3.0 ; h_csrValA[3] = 0.1 ; h_csrValA[4] = 0.1 ; h_csrValA[5] = 0.1 ; h_csrValA[6] = 4.0 ;

//矩阵B的值
h_b[0] = 1.0; h_b[1] = 1.0; h_b[2] = 1.0; h_b[3] = 1.0;
*/

//矩阵A的 CSR 格式(1000000阶)

int* csrRowPtr = new int[n + 1];
ifstream ofs;
ofs.open(“csrRowPtr.txt”, ios::in);
if (!ofs)
{
cerr << “open ofs Fail” << endl;
exit(1);
}
for (int i = 0; i < n + 1; ++i)
{
ofs >> csrRowPtr[i];
h_csrRowPtrA[i] = csrRowPtr[i];
//cout << h_csrRowPtrA[i] << endl;
}

//cout << endl;
int* csrColInd = new int[nnzA];
ifstream ofs1;
ofs1.open(“csrColInd.txt”, ios::in);
if (!ofs1)
{
cerr << “open ofs1 Fail” << endl;
exit(1);
}
for (int i = 0; i < nnzA; ++i)
{
ofs1 >> csrColInd[i];
h_csrColIndA[i] = csrColInd[i];
//cout << h_csrColIndA[i] << endl;
}

//cout << endl;
double* csrVal = new double[nnzA];
ifstream ofs2;
ofs2.open(“csrVal.txt”, ios::in);
if (!ofs2)
{
cerr << “open ofs2 Fail” << endl;
exit(1);
}
for (int i = 0; i < nnzA; ++i)
{
ofs2 >> csrVal[i];
h_csrValA[i] = csrVal[i];
//cout << h_csrValA[i] << endl;
}

//矩阵B的 CSR 格式(1000000阶)
//cout << endl;
double* B = new double[n]; // batchSize * m
ifstream ofs3;
ofs3.open(“B.txt”, ios::in);
if (!ofs3)
{
cerr << “open ofs3 Fail” << endl;
exit(1);
}
for (int i = 0; i < n; ++i)
{
ofs3 >> B[i];
h_b[i] = B[i];
//cout << h_b[i] << endl;
}
ofs.close();
ofs1.close();
ofs2.close();
ofs3.close();

const int batchSize = 30;

double* csrValABatch = (double*)malloc(sizeof(double) * nnzA * batchSize);
double* bBatch = (double*)malloc(sizeof(double) * n * batchSize);
double* xBatch = (double*)malloc(sizeof(double) * n * batchSize);
assert(NULL != csrValABatch);
assert(NULL != bBatch);
assert(NULL != xBatch);

//在 host 上准备 Aj 和 Bj
for (int colidx = 0; colidx < nnzA; colidx++)
{
double Areg = h_csrValA[colidx];
for (int batchId = 0; batchId < batchSize; batchId++)
{
double eps = ((double)((rand() % 100) + 1)) * 1.e-4;
csrValABatch[batchId * nnzA + colidx] = Areg + eps;
}
}
for (int j = 0; j < n; j++)
{
double breg = h_b[j];
for (int batchId = 0; batchId < batchSize; batchId++)
{
double eps = ((double)((rand() % 100) + 1)) * 1.e-4;
bBatch[batchId * n + j] = breg + eps;
}
}

// 创造句柄以及矩阵描述符
cusolver_status = cusolverSpCreate(&cusolverH);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
cusparse_status = cusparseCreateMatDescr(&descrA);
assert(cusparse_status == CUSPARSE_STATUS_SUCCESS);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE); // base-1
cusolver_status = cusolverSpCreateCsrqrInfo(&info);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

// 复制资料进 GPU
int* d_csrRowPtrA = NULL;
int* d_csrColIndA = NULL;
double* d_csrValA = NULL;
double* d_b = NULL; // batchSize * m
double* d_x = NULL;
cudaStat1 = cudaMalloc((void**)&d_csrValA, sizeof(double) * nnzA * batchSize);
cudaStat2 = cudaMalloc((void**)&d_csrColIndA, sizeof(int) * nnzA);
cudaStat3 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int) * (n + 1));
cudaStat4 = cudaMalloc((void**)&d_b, sizeof(double) * n * batchSize);
cudaStat5 = cudaMalloc((void**)&d_x, sizeof(double) * n * batchSize);
assert(cudaStat1 == cudaSuccess);
assert(cudaStat2 == cudaSuccess);
assert(cudaStat3 == cudaSuccess);
assert(cudaStat4 == cudaSuccess);
assert(cudaStat5 == cudaSuccess);

cudaStat1 = cudaMemcpy(d_csrValA, csrValABatch, sizeof(double) * nnzA * batchSize, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_csrColIndA, h_csrColIndA, sizeof(int) * nnzA,cudaMemcpyHostToDevice);
cudaStat3 = cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, sizeof(int) * (n + 1),cudaMemcpyHostToDevice);
cudaStat4 = cudaMemcpy(d_b, bBatch, sizeof(double) * n * batchSize,cudaMemcpyHostToDevice);
assert(cudaStat1 == cudaSuccess);
assert(cudaStat2 == cudaSuccess);
assert(cudaStat3 == cudaSuccess);
assert(cudaStat4 == cudaSuccess);

// 分析
cusolver_status = cusolverSpXcsrqrAnalysisBatched(cusolverH, n, n, nnzA, descrA, d_csrRowPtrA, d_csrColIndA, info);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

// 准备工作空间
cusolver_status = cusolverSpDcsrqrBufferInfoBatched(cusolverH, n, n, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, batchSize,
info, &size_internal, &size_qr);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);
printf(“numerical factorization needs internal data %lld bytes\n”, (long long)size_internal);
printf(“numerical factorization needs working space %lld bytes\n”, (long long)size_qr);

cudaStat1 = cudaMalloc((void**)&buffer_qr, size_qr);
assert(cudaStat1 == cudaSuccess);

// 数值分解
cusolver_status = cusolverSpDcsrqrsvBatched( cusolverH, n, n, nnzA, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, d_b, d_x, batchSize, info, buffer_qr);
assert(cusolver_status == CUSOLVER_STATUS_SUCCESS);

// 检查残差
// xBatch = [x0, x1, x2, …]
cudaStat1 = cudaMemcpy(xBatch, d_x, sizeof(double) * n * batchSize, cudaMemcpyDeviceToHost);
assert(cudaStat1 == cudaSuccess);
const int baseA = (CUSPARSE_INDEX_BASE_ONE == cusparseGetMatIndexBase(descrA)) ? 1 : 0;
for (int batchId = 0; batchId < batchSize; batchId++)
{
// measure |bj - Ajxj|
double
csrValAj = csrValABatch + batchId * nnzA;
double* xj = xBatch + batchId * n;
double* bj = bBatch + batchId * n;
// sup| bj - Ajxj|
double sup_res = 0;
for (int row = 0; row < n; row++)
{
const int start = h_csrRowPtrA[row] - baseA;
const int end = h_csrRowPtrA[row + 1] - baseA;
double Ax = 0.0; // Aj(row,:slight_smile:xj
for (int colidx = start; colidx < end; colidx++)
{
const int col = h_csrColIndA[colidx] - baseA;
const double Areg = csrValAj[colidx];
const double xreg = xj[col];
Ax = Ax + Areg * xreg;
}
double r = bj[row] - Ax;
sup_res = (sup_res > fabs(r)) ? sup_res : fabs(r);
}
printf("batchId %d: sup|bj - Aj
xj| = %E \n", batchId, sup_res);
}
for (int batchId = 0; batchId < batchSize; batchId++)
{
double
xj = xBatch + batchId * n;
for (int row = 0; row < n; row++)
{
printf(“x%d[%d] = %E\n”, batchId, row, xj[row]);
}
printf(“\n”);
}

delete csrRowPtr;
delete csrColInd;
delete csrVal;
delete B;
delete h_csrValA;
delete h_csrRowPtrA;
delete h_csrColIndA;
delete h_b;
delete h_x;
if (csrValABatch)free(csrValABatch);
if (bBatch)free(bBatch);
if (xBatch)free(xBatch);
if(d_csrValA)cudaFree(d_csrValA);
if (d_csrRowPtrA)cudaFree(d_csrRowPtrA);
if (d_csrColIndA)cudaFree(d_csrColIndA);
if (d_b)cudaFree(d_b);
if (d_x)cudaFree(d_x);
cusolverSpDestroy(cusolverH);

return 0;
}