我简单地写了一个测试例子（数组相加），但是输出却是有问题的：似乎核函数没有运行，还是内存出了问题？
简单的代码:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define N 10
/*
 * Element-wise kernel: for every index i < N, writes c[i] = a[i] + b[i] + 1.
 *
 * a, b : input arrays of at least N ints (must be device-accessible pointers,
 *        since they are dereferenced inside the kernel)
 * c    : output array of at least N ints (device-accessible)
 *
 * Each thread handles one element. The index is the flattened global thread
 * index, so any 1D launch whose total thread count is >= N covers the whole
 * array; surplus threads are filtered out by the bounds check.
 */
__global__ void add(int* a, int* b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
    {
        /* The extra +1 is kept from the original code: with all-zero inputs
           it makes a successful kernel run visible (c becomes 1, not 0). */
        c[tid] = a[tid] + b[tid] + 1;
    }
}
/*
 * Evaluate a CUDA runtime call and abort with file/line and the runtime's
 * error string if it did not return cudaSuccess. Without this, a failed
 * launch or copy goes unnoticed and the program just prints stale data.
 */
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

/*
 * Host driver: allocates three zero-filled host arrays of N ints, runs the
 * `add` kernel on device copies of them, copies the result back and appends
 * one "a…b…c" line per element to text.txt.
 *
 * Returns 0 on success, 1 if a host allocation or the output file fails;
 * any CUDA failure terminates the program through CUDA_CHECK.
 */
int main(void)
{
    const size_t bytes = sizeof(int) * N;

    /* calloc hands back zeroed memory, so no separate memset is needed. */
    int *a = (int *)calloc(N, sizeof(int));
    int *b = (int *)calloc(N, sizeof(int));
    int *c = (int *)calloc(N, sizeof(int));
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }

    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_c, bytes));

    /* Only the inputs have to be uploaded; the kernel overwrites every
       element of dev_c because the launch below uses N threads. */
    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    /* The kernel must receive the DEVICE pointers. Passing the host
       pointers a/b/c makes the kernel fault, which is why the output
       looked as if the kernel had never run. */
    add<<<1, N>>>(dev_a, dev_b, dev_c);
    CUDA_CHECK(cudaGetLastError());       /* launch-configuration errors */
    CUDA_CHECK(cudaDeviceSynchronize());  /* errors raised while running */

    /* a and b are not modified on the device, so only c is copied back. */
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    int status = 0;
    FILE *fp = fopen("text.txt", "a+");
    if (fp == NULL)
    {
        perror("text.txt");
        status = 1;
    }
    else
    {
        for (int i = 0; i < N; i++)
        {
            /* Print element i of each array, not element 0 every time. */
            fprintf(fp, "\n%d…%d…%d\n", a[i], b[i], c[i]);
        }
        fclose(fp);
    }

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    free(a);
    free(b);
    free(c);
    return status;
}