Running the dd_ld_shared example from "CUDA for Engineers" keeps failing? Looking for guidance

While learning CUDA, I ran the dd_ld_shared example from "CUDA for Engineers: An Introduction to High-Performance Parallel Computing" and it keeps failing. The example takes a one-dimensional input array and, for each element, adds and subtracts its two neighboring values (a centered second difference: the kernel computes (u[i-1] - 2*u[i] + u[i+1])/h^2 with u[i] = sinf(x[i]), so if it ran, the result_parallel[i] + u[i] column in the CSV should be close to zero), then writes out the result. The output values are all still the initial values, which suggests the kernel never actually ran. I then wrapped each CUDA call with an error-checking function and found that the copy from device memory back to host memory fails with an illegal memory access, which again indirectly suggests the kernel never did any computation. This example uses dynamically allocated shared memory. The complete code is below:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define TPB 64
#define RAD 1

__global__ void ddKernel(float *d_out, const float *d_in, int size, float h) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= size) return;

    const int s_idx = threadIdx.x + RAD;
    extern __shared__ float s_in[];

    // Regular cells
    s_in[s_idx] = d_in[i];

    // Halo cells
    if (threadIdx.x < RAD) {
        // careful: the two lines below will also access d_in[-1] and d_in[size+1] which
        // are undefined! This bug is fixed in heat_2d (cf. idxClip function)
        s_in[s_idx - RAD] = d_in[i - RAD];
        s_in[s_idx + blockDim.x] = d_in[i + blockDim.x];
    }
    __syncthreads();
    d_out[i] = (s_in[s_idx - 1] - 2.f*s_in[s_idx] + s_in[s_idx + 1])/(h*h);
}

int main(){
    const float PI = 3.1415927;
    const int N = 150;
    const float h = 2*PI/N;

    float x[N] = {0.0};
    float u[N] = {0.0};
    float result_parallel[N] = {0.0};

    for(int i = 0; i < N; i++)
    {
        x[i] = 2*PI*i/N;
        u[i] = sinf(x[i]);
    }

    float *d_in = 0;
    float *d_out = 0;
    HANDLE_ERROR(cudaMalloc(&d_in, N*sizeof(float)));
    HANDLE_ERROR(cudaMalloc(&d_out, N*sizeof(float)));
    HANDLE_ERROR(cudaMemcpy(d_in, &u[0], N*sizeof(float), cudaMemcpyHostToDevice));

    const size_t smemSize = (TPB + 2*RAD)*sizeof(float);

    ddKernel<<<(N + TPB - 1)/TPB, TPB, smemSize>>>(d_out, d_in, N, h);

    HANDLE_ERROR(cudaMemcpy(result_parallel, d_out, N*sizeof(float), cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaFree(d_in));
    HANDLE_ERROR(cudaFree(d_out));

    FILE *outfile = fopen("results.csv", "w");
    for (int i = 1; i < N - 1; ++i) {
        fprintf(outfile, "%f,%f,%f,%f\n", x[i], u[i],
                result_parallel[i], result_parallel[i] + u[i]);
    }
    fclose(outfile);

}
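
One thing I'm unsure about: every cudaMalloc/cudaMemcpy/cudaFree above goes through HANDLE_ERROR, but the kernel launch itself is not checked, because I can't wrap the <<< >>> call directly. If it helps with diagnosing this, I assume the launch would be checked right after the kernel call roughly like this (my own sketch, not code from the book):

    ddKernel<<<(N + TPB - 1)/TPB, TPB, smemSize>>>(d_out, d_in, N, h);
    // catch launch/configuration errors (bad grid size, too much shared memory, ...)
    HANDLE_ERROR(cudaGetLastError());
    // kernel launches are asynchronous, so synchronize here to surface errors
    // from inside the kernel instead of at the next cudaMemcpy
    HANDLE_ERROR(cudaDeviceSynchronize());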

I have tried both an RTX 2080 and a TITAN V, and both CUDA 9.1 and CUDA 10.1. I wonder whether newer GPUs or newer CUDA versions changed some behavior that this example relies on. I'd appreciate any guidance, thanks.
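
For reference, the comment inside the kernel already warns that the two halo loads can read d_in[-1] and past the end of the array, and says this is fixed in heat_2d with an idxClip function. Could that be what triggers the illegal memory access here? A clamped variant I sketched myself (the idxClip helper below is only my guess at what the book does in heat_2d; I haven't verified it) would add a clamp helper and change the halo loads to:

    // hypothetical clamp helper, modeled on the idxClip mentioned in the kernel's comment
    __device__ int idxClip(int idx, int idxMax) {
        return idx > (idxMax - 1) ? (idxMax - 1) : (idx < 0 ? 0 : idx);
    }

    // Halo cells, with the global indices clamped to [0, size-1]
    if (threadIdx.x < RAD) {
        s_in[s_idx - RAD] = d_in[idxClip(i - RAD, size)];
        s_in[s_idx + blockDim.x] = d_in[idxClip(i + blockDim.x, size)];
    }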