#include <stdio.h>
#include <stdlib.h>
#include
#include <cuda.h>
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
using namespace std;
#define BLOCK_SIZE 32
#define GRID_SIZE 14
// Kernel definition
global void VecAdd(int* A, int* B, int* C) {
int i =threadIdx.x;
int j =blockIdx.x;
int n =32*j+i;
for (int j = 0; j < 100000000 ; j++ ){
C[i] =A[i] + B[i];
}
}
int main() {
size_t size =BLOCK_SIZE*GRID_SIZE*sizeof(int);
int* A =(int*)malloc(size);
int* B =(int*)malloc(size);
int* C =(int*)malloc(size);
for (int i =0 ; i < BLOCK_SIZE*GRID_SIZE ; ++i) {
A[i]=(int)i;
B[i]=(int)i;
}
int* gA;
cudaMalloc((void**)&gA,size);
cudaMemcpy(gA,A,size,cudaMemcpyHostToDevice);
int* gB;
cudaMalloc((void**)&gB,size);
cudaMemcpy(gB,B,size,cudaMemcpyHostToDevice);
int* gC;
cudaMalloc((void**)&gC,size);
dim3grid(GRID_SIZE , 1 , 1);
dim3block(BLOCK_SIZE , 1 , 1);
VecAdd<<<grid, block>>>(gA, gB, gC);
cudaMemcpy(C,gC,size,cudaMemcpyDeviceToHost);
for (intj=0;j<BLOCK_SIZE*GRID_SIZE;++j) {
printf("%d\t%d\n",C[j],j);
}
cudaFree(gA);
cudaFree(gB);
cudaFree(gC);
free(A);
free(B);
free(C);
return 0;
}
以上为程序的源代码,在windows7 的VS2010中编译通过,但是结果不正确:
[attach]3038[/attach]
正确结果应该是每一行的第一个数是第二个数的两倍,求解释??
然后就是,我在linux下编译这段代码的时候,是可以输出正确结果的,但是有个问题就是,系统重启之后,直接运行.out文件的话,输出的结果就变成了全是0,这时候要运行另外一段代码,然后再运行.out文件才能出正确结果,求解释其原理??