下面这段代码用来求0到14的平方,我运行的结果是
0.000000 1.000000 4.000000 9.000000 16.000000 25.000000 36.000000 49.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
后面几组全部变成0了,不知道是为什么
#include<stdlib.h>
#include<string.h>
#include<stdio.h>
#include<cutil.h>
#include <cutil_inline.h>
const int N=15;
float* a;
float* d_a;
__global__ void test(float* vec1, float* res){
extern __shared__ float sh_vec[];
int id=threadIdx.x;
int num=blockDim.x;
int taskPerThread=(N+num-1)/num;
int pos=id*taskPerThread;
for(int i=0;i<taskPerThread;++i){
if(pos+i<N){
sh_vec[pos+i]=vec1[pos+i];
}
}
__syncthreads();
for(int i=0;i<taskPerThread;++i){
if(pos+i<N){
res[pos+i]=sh_vec[pos+i];
}
}
__syncthreads();
return;
}
int main(){
int size=N*sizeof(float);
a=(float*)malloc(size);
for(int i=0;i<N;++i)
a[i]=i*i;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_a,size));
CUDA_SAFE_CALL(cudaMemcpy(d_a,a,size,cudaMemcpyHostToDevice));
dim3 dg(1,1,1);
dim3 db(10,1,1);
test<<<dg,db,0>>>(d_a,d_a);
CUDA_SAFE_CALL(cudaMemcpy(a,d_a,size,cudaMemcpyDeviceToHost));
for(int i=0;i<N;++i){
printf("%f ",a[i]);
}
printf("\n");
free(a);
CUDA_SAFE_CALL(cudaFree(d_a));
return 0;
}