[quote=“system”, post: 2, topic: 3278]
LZ您好:
首先您这个用法不是“二维数组”,而是一个一维的数组里面元素是指针,然后每个指针指向一个申请 …[/quote]
多谢版主回答
按照一维缓冲区的做法是不是下面这样,定义一维的,
但是运行出来,好像结果GPU结果不对,是为什么呢
include <stdio.h>
include “cuda_runtime.h”
include “device_launch_parameters.h”
include
define M 2000
define N 3000
global void matrixAdd(float A, float B, float C)
{
int i=threadIdx.x+blockIdx.x32;
int j=threadIdx.y+blockIdx.y32;
C[jN+i]=A[jN+i]+B[jN+i];
}
int main()
{
float a,b,c;
cudaMallocHost((void)&a ,NMsizeof(float));
cudaMallocHost((void**)&b ,NMsizeof(float));
cudaMallocHost((void**)&c ,NM*sizeof(float));
float *A,*B,*C;
cudaMalloc((void **) &A, sizeof(float)MN);
cudaMalloc((void **) &B, sizeof(float)MN);
cudaMalloc((void **) &C, sizeof(float)MN);
for (int j=0;j<M;j++)
{
for (int i=0;i<N;i++)
{
a[jN+i]=i+j;
b[jN+i]=i+2*j;
}
}
cudaMemcpy(A, a, sizeof(float)MN, cudaMemcpyHostToDevice);
cudaMemcpy(B, b, sizeof(float)MN, cudaMemcpyHostToDevice);
dim3 threadsPerBlock(32, 32);
dim3 dimGrid( (N + threadsPerBlock.x - 1)/threadsPerBlock.x ,(M + threadsPerBlock.y - 1)/threadsPerBlock.y ) ;
matrixAdd<<<dimGrid, threadsPerBlock>>>(A,B,C);
cudaMemcpy(c, C, sizeof(float)MN, cudaMemcpyDeviceToHost);
printf(“done!\n”);
for(int r=0;r<M;r++){
for(int co=0;co<N;co++){
if (c[rN+co]!=a[rN+co]+b[r*N+co])
{ printf(“wrong!\n”);
}
}
}
cudaFree(A);
cudaFree(B);
cudaFree(C);
return 0;
}