学习Cusparse遇到的问题?写出代码能运行没有想要的输出

#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include
using namespace std;

int main()
{
//初始化 cuSparse library
cusparseHandle_t handle = NULL;
cusparseMatDescr_t descrA, descrB, descrC = NULL;
cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
status = cusparseCreate(&handle);
assert(CUSPARSE_STATUS_SUCCESS == status);

//初始化矩阵描述符
status = cusparseCreateMatDescr(&descrA);
assert(CUSPARSE_STATUS_SUCCESS == status);
status = cusparseCreateMatDescr(&descrB);
assert(CUSPARSE_STATUS_SUCCESS == status);
status = cusparseCreateMatDescr(&descrC);
assert(CUSPARSE_STATUS_SUCCESS == status);

const int n = 3;                    // --- 行数与列数
const int nnzA = 4;                 // --- matrix A 的非零个数
const int nnzB = 4;                 // --- matrix B 的非零个数
 //matrix A 的CSR格式
const int csrRowPtrA[n + 1] = { 1, 2, 4, 5 };
const int csrColIndA[nnzA] = { 1, 2, 3, 3 };
const double csrValA[nnzA] = { 1.0, 2.0, 4.0, 5.0 };

//matrix B 的CSR格式
const int csrRowPtrB[n + 1] = { 1, 1, 1, 1 };
const int csrColIndB[nnzB] = { 1, 2, 2, 3 };
const double csrValB[nnzB] = { 1.0, 3.0, 4.0, 5.0 };

//在Cuda上分配matrix A与matrix B的内存及数据转移
int* d_csrRowPtrA;
cudaStat1 = cudaMalloc(&d_csrRowPtrA, sizeof(int) * (n + 1));
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1), cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
int* d_csrColIndA;
cudaStat2 = cudaMalloc(&d_csrColIndA, sizeof(int) * nnzA);
assert(cudaSuccess == cudaStat2);
cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat2);
double* d_csrValA;
cudaStat3 = cudaMalloc(&d_csrValA, sizeof(double) * nnzA);
assert(cudaSuccess == cudaStat3);
cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat3);

int* d_csrRowPtrB;
cudaStat1 = cudaMalloc(&d_csrRowPtrB, sizeof(int) * (n + 1));
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(d_csrRowPtrB, csrRowPtrB, sizeof(int) * (n + 1), cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
int* d_csrColIndB;
cudaStat2 = cudaMalloc(&d_csrColIndB, sizeof(int) * nnzB);
assert(cudaSuccess == cudaStat2);
cudaStat2 = cudaMemcpy(d_csrColIndB, csrColIndB, sizeof(int) * nnzB, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat2);
double* d_csrValB;
cudaStat3 = cudaMalloc(&d_csrValB, sizeof(double) * nnzB);
assert(cudaSuccess == cudaStat3);
cudaStat3 = cudaMemcpy(d_csrValB, csrValB, sizeof(double) * nnzB, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat3);

//matrix A与matrix B 求和
int baseC, nnzC;
double alpha = 1.f, beta = 1.f;
size_t BufferSizeInBytes;
char* buffer = NULL;
int* nnzTotalDevHostPtr = &nnzC;
cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
int* d_csrRowPtrC = NULL, * d_csrColIndC = NULL; double* d_csrValC = NULL;
cudaStat1 = cudaMalloc(&d_csrRowPtrC, sizeof(int) * (n + 1));
assert(cudaSuccess == cudaStat1);

cusparseDcsrgeam2_bufferSizeExt(handle, n, n,
	&alpha,
	descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
	&beta,
	descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
	descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
	&BufferSizeInBytes);
cudaMalloc((void**)&buffer, sizeof(char) * BufferSizeInBytes);
status = cusparseXcsrgeam2Nnz(handle, n, n, descrA, nnzA, d_csrRowPtrA, d_csrColIndA, descrB, nnzB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC,
	nnzTotalDevHostPtr, buffer);
if (NULL != nnzTotalDevHostPtr)
{
	nnzC = *nnzTotalDevHostPtr;
}
else
{
	cudaMemcpy(&nnzC, d_csrRowPtrC + n, sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
	nnzC -= baseC;
}
cudaMalloc(&d_csrColIndC, sizeof(int) * nnzC);
cudaMalloc(&d_csrValC, sizeof(double) * nnzC);
status = cusparseDcsrgeam2(
	handle, n, n,
	&alpha,
	descrA, nnzA, d_csrValA, d_csrRowPtrA, d_csrColIndA,
	&beta,
	descrB, nnzB, d_csrValB, d_csrRowPtrB, d_csrColIndB,
	descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC,
	buffer);

//将矩阵C的所有资料往CPU上回传
int* csrRowPtrC = NULL;
int* csrColIndC = NULL;
float* csrValC = NULL;
csrRowPtrC = (int*)malloc(sizeof(int) * (n + 1));
csrColIndC = (int*)malloc(sizeof(int) * nnzC);
csrValC = (float*)malloc(sizeof(float) * nnzC);
assert(NULL != csrRowPtrC);
assert(NULL != csrColIndC);
assert(NULL != csrValC);
cudaStat1 = cudaMemcpy(csrRowPtrC, d_csrRowPtrC, sizeof(int) * (n + 1),cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(csrColIndC, d_csrColIndC, sizeof(int) * nnzC,cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(csrValC, d_csrValC, sizeof(float) * nnzC,cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);

cout << *csrValC << ",";
cout << endl;
cout << *csrColIndC << ",";
cout << endl;
cout << *csrRowPtrC << ",";
cout << endl;

return 0;

}

请问下这个输出结果很怪,是哪里错了?