初学遇到疑问

我搭建了一个平台后在github上找了个例子试运行运行不了,我已经加载了cublas和cusparse这两个需要加载的库了.希望有人可以帮忙解答问题,因为我附近的人没人学过这个

我在这个网站上找到的代码想去跑跑,这个代码应该在linux系统中编写的

#include <cuda_runtime.h>
#include <stdio.h>
#include "cusparse.h"
#include "cublas_v2.h"
#include <time.h>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif

#define CLEANUP(s){\
	printf("%s\n", s);\
	if(h_valA) free(h_valA);\
	if(h_csrRowPtrA) free(h_csrRowPtrA);\
	if(h_csrColIndA) free(h_csrColIndA);\
	if(valA) cudaFree(valA);\
	if(csrRowPtrA) cudaFree(csrRowPtrA);\
	if(csrColIndA) cudaFree(csrColIndA);\
	if(h_b) free(h_b);\
	if(b) cudaFree(b);\
	if(r) cudaFree(r);\
	if(p) cudaFree(p);\
	if(Ap) cudaFree(Ap);\
	if(h_x) free(h_x);\
	if(x) cudaFree(x);\
	if(handle1) cublasDestroy(handle1);\
	if(descrA) cusparseDestroyMatDescr(descrA);\
	if(handle2) cusparseDestroy(handle2);\
	cudaDeviceReset();\
}

//用于统计程序时间
double cpuSecond(){
	struct timeval tp;
	gettimeofday(&tp, NULL);
	return ((double)tp.tv_sec + (double)tp.tv_usec * 1e-6);
}

int main(){
	//initialization
	cudaError_t cudaStat1, cudaStat2, cudaStat3;	

	double* h_valA = 0;
	int* h_csrRowPtrA = 0;
	int* h_csrColIndA = 0;
	double* valA = 0;
	int* csrRowPtrA = 0;
	int* csrColIndA = 0;
	double* h_b = 0;
	double* b = 0;
	double* r = 0;
	double* p = 0;
	double* Ap = 0;
	double* h_x = 0;
	double* x = 0;

	cublasStatus_t status1;
	cublasHandle_t handle1 = 0;
	cusparseStatus_t status2;
	cusparseHandle_t handle2 = 0;
	cusparseMatDescr_t descrA = 0;

	int n, nnz;
	double dzero = 0.0;
	double done = 1.0;
	/*
	use CG to solve following linear system:
		Ax = b;
		A = |3 0 2|
			|0 2 0|
			|2 0 1|
		b = |3.5|
			|1.5|
			|2.0|
	correct result x:
		x = |0.50|
			|0.75|
			|1.00|
	   */
	n = 3;
	nnz = 5;
	//which device(gpu) to use
	cudaStat1 = cudaSetDevice(0);
	if(cudaStat1 != cudaSuccess){
		CLEANUP("Device Set failed");
		return EXIT_FAILURE;
	}

	//matrix A(csr format) in host
	h_valA = (double*)malloc(nnz*sizeof(double));
	h_csrRowPtrA = (int*)malloc((n+1)*sizeof(int));
	h_csrColIndA = (int*)malloc(nnz*sizeof(int));
	if(!h_valA||!h_csrRowPtrA||!h_csrColIndA){
		CLEANUP("Host malloc failed(A)");
		return EXIT_FAILURE;
	}

	h_valA[0] = 3.0;
	h_valA[1] = 2.0;
	h_valA[2] = 2.0;
	h_valA[3] = 2.0;
	h_valA[4] = 1.0;
	
	h_csrRowPtrA[0] = 0;
	h_csrRowPtrA[1] = 2;
	h_csrRowPtrA[2] = 3;
	h_csrRowPtrA[3] = 5;
	
	h_csrColIndA[0] = 0;
	h_csrColIndA[1] = 2;
	h_csrColIndA[2] = 1;
	h_csrColIndA[3] = 0;
	h_csrColIndA[4] = 2;

	//matrix A(csr format) in device
	cudaStat1 = cudaMalloc((void**)&valA, nnz*sizeof(double));
	cudaStat2 = cudaMalloc((void**)&csrRowPtrA, (n+1)*sizeof(int));
	cudaStat3 = cudaMalloc((void**)&csrColIndA, nnz*sizeof(int));
	if(cudaStat1 != cudaSuccess || cudaStat2 != cudaSuccess || cudaStat3 != cudaSuccess){
		CLEANUP("device malloc failed(A)");
		return EXIT_FAILURE;
	}
	cudaStat1 = cudaMemcpy(valA, h_valA, (size_t)(nnz*sizeof(double)), cudaMemcpyHostToDevice);
	cudaStat2 = cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (size_t)((n+1)*sizeof(int)), cudaMemcpyHostToDevice);
	cudaStat3 = cudaMemcpy(csrColIndA, h_csrColIndA, (size_t)(nnz*sizeof(int)), cudaMemcpyHostToDevice);
	if(cudaStat1 != cudaSuccess || cudaStat2 != cudaSuccess || cudaStat3 != cudaSuccess){
		CLEANUP("memcpy from host to device failed(A)");
		return EXIT_FAILURE;
	}

	//vector b in host
	h_b = (double*)malloc(n*sizeof(double));
	h_b[0] = 3.5;
	h_b[1] = 1.5;
	h_b[2] = 2.0;

	//vector b in device
	cudaStat1 = cudaMalloc((void**)&b, n*sizeof(double));
	if(cudaStat1 != cudaSuccess){
		CLEANUP("device malloc failed(b)");
		return EXIT_FAILURE;
	}
	cudaStat1 = cudaMemcpy(b, h_b, (size_t)(n*sizeof(double)), cudaMemcpyHostToDevice);
	if(cudaStat1 != cudaSuccess){
		CLEANUP("memcpy from host to device failed(b)");
		return EXIT_FAILURE;
	}

	//vector r in device
	cudaStat1 = cudaMalloc((void**)&r, n*sizeof(double));
	if(cudaStat1 != cudaSuccess){
		CLEANUP("device malloc failed(r)");
		return EXIT_FAILURE;
	}

	//vector p in device
	cudaStat1 = cudaMalloc((void**)&p, n*sizeof(double));
	if(cudaStat1 != cudaSuccess){
		CLEANUP("device malloc failed(p)");
		return EXIT_FAILURE;
	}

	//vector Ap in device
	cudaStat1 = cudaMalloc((void**)&Ap, n*sizeof(double));
	if(cudaStat1 != cudaSuccess){
		CLEANUP("device malloc failed(Ap)");
		return EXIT_FAILURE;
	}

	//vector x in host
	h_x = (double*)malloc(n*sizeof(double));
	for(int i=0; i<n; ++i)
		h_x[i] = 0.0;

	//vector x in device
	cudaStat1 = cudaMalloc((void**)&x, n*sizeof(double));
	if(cudaStat1 != cudaSuccess){
		CLEANUP("device malloc faile(x)");
		return EXIT_FAILURE;
	}
	cudaStat1 = cudaMemcpy(x, h_x, (size_t)(n*sizeof(double)), cudaMemcpyHostToDevice);
	if(cudaStat1 != cudaSuccess){
		CLEANUP("memcpy from host to device failed(x)");
		return EXIT_FAILURE;
	}

	//initialize cublas library
	status1 = cublasCreate(&handle1);
	if(status1 != CUBLAS_STATUS_SUCCESS){
		CLEANUP("CUBLAS initialize failed");
		return EXIT_FAILURE;
	}
	//initialize cusparse library
	status2 = cusparseCreate(&handle2);
	if(status2 != CUSPARSE_STATUS_SUCCESS){
		CLEANUP("CUSPARSE Library initialize failed");
		return EXIT_FAILURE;
	}
	//initialize matrix descriptor
	status2 = cusparseCreateMatDescr(&descrA);
	if(status2 != CUSPARSE_STATUS_SUCCESS){
		CLEANUP("Matrix descriptor initialize failed");
		return EXIT_FAILURE;
	}
	cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
	cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);


	//CG
	//1: 
	//max iteration number = 2000
	//relative residual = 1e-7
	//compute initial residual r = b - Ax_0
	//x_0 = |0|
	//		|0|
	//		|0|
	//initial p = r
	int maxit = 2000;
	double tol = 1e-7;
	double alpha, beta, rhop, rho;
	//r = b
	status1 = cublasDcopy(handle1, n, b, 1, r, 1);
	if(status1 != CUBLAS_STATUS_SUCCESS){
		CLEANUP("Vector copy failed(r)");
		return EXIT_FAILURE;
	}
	//p = b
	status1 = cublasDcopy(handle1, n, b, 1, p, 1);
	if(status1 != CUBLAS_STATUS_SUCCESS){
		CLEANUP("Vector copy failed(p)");
		return EXIT_FAILURE;
	}
	status1 = cublasDnrm2(handle1, n, r, 1, &rho);
	if(status1 != CUBLAS_STATUS_SUCCESS){
		CLEANUP("compute norm2 failed(r)");
		return EXIT_FAILURE;
	}
	rho = rho*rho;

	//2: repeat until convergence
	for(int i=0; i<maxit; ++i){
		status2 = cusparseDcsrmv(handle2, CUSPARSE_OPERATION_NON_TRANSPOSE, n, n, nnz, &done, 
									descrA, valA, csrRowPtrA, csrColIndA, p, &dzero, Ap);
		if(status2 != CUSPARSE_STATUS_SUCCESS){
			CLEANUP("compute mv failed(Ap)");
			return EXIT_FAILURE;
		}
		double tmp;
		status1 = cublasDdot(handle1, n, p, 1, Ap, 1, &tmp);
		if(status1 != CUBLAS_STATUS_SUCCESS){
			CLEANUP("compute dot failed(tmp)");
			return EXIT_FAILURE;
		}
		//alpha = (r^T*r)/(p^T*A*p)
		alpha = rho/tmp;
		//x = x + alpha*p
		status1 = cublasDaxpy(handle1, n, &alpha, p, 1, x, 1);
		if(status1 != CUBLAS_STATUS_SUCCESS){
			CLEANUP("compute axpy failed(x)");
			return EXIT_FAILURE;
		}
		double tmp2 = -alpha;
		//r = r - alpha*A*p
		status1 = cublasDaxpy(handle1, n, &tmp2, Ap, 1, r, 1);
		if(status1 != CUBLAS_STATUS_SUCCESS){
			CLEANUP("compute axpy failed(r)");
			return EXIT_FAILURE;
		}

		rhop = rho;
		status1 = cublasDnrm2(handle1, n, r, 1, &rho);
		if(status1 != CUBLAS_STATUS_SUCCESS){
			CLEANUP("compute norm2 failed(r)");
			return EXIT_FAILURE;
		}
		if(rho < tol){
			break;
		}
		rho = rho*rho;
		
		//beta = rho/rhop
		beta = rho/rhop;
		
		//p = r + beta*p
		status1 = cublasDscal(handle1, n, &beta, p, 1);
		if(status1 != CUBLAS_STATUS_SUCCESS){
			CLEANUP("compute scal failed(p)");
			return EXIT_FAILURE;
		}
		status1 = cublasDaxpy(handle1, n, &done, r, 1, p, 1);
		if(status1 != CUBLAS_STATUS_SUCCESS){
			CLEANUP("compute axpy failed(p)");
			return EXIT_FAILURE;
		}
	}
	
	//copy x to h_x, and output h_x
	cudaStat1 = cudaMemcpy(h_x, x, (size_t)(n*sizeof(double)), cudaMemcpyDeviceToHost);
	if(cudaStat1 != cudaSuccess){
		CLEANUP("memcpy from device to host failed(x)");
		return EXIT_FAILURE;
	}
	for(int i=0; i<n; ++i)
		printf("%f\n", h_x[i]);

	CLEANUP("Success!");
	return 0;
}