linux(ubuntu)+cuda4.0

我照着别人的例子写了一个cuda程序:
driver+toolkit+sdk已经安装,
#include <stdio.h>
#include <cutil_inline.h>
//define kernel function
// Element-wise vector addition: C[i] = A[i] + B[i].
//
// Each thread handles the element selected by threadIdx.x only, so the kernel
// is meant for a single-block launch with one thread per element. There is no
// bounds check: the arrays must hold at least blockDim.x elements.
// A, B and C are dereferenced on the GPU, so they must point to device memory.
__global__ void add(float *A, float *B, float *C)
{
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}
int main()
{
    float a[] = {1, 2, 3, 4, 5};
    float b[] = {5, 4, 3, 2, 1};
    float c[] = {0, 0, 0, 0, 0};

    // Launch the kernel with 1 block of 5 threads (one thread per element).
    // NOTE(review): a, b and c are host arrays. Passing them straight to the
    // kernel is not valid; device buffers (cudaMalloc) and explicit copies
    // (cudaMemcpy) are required, as the complete version later in this thread
    // shows. The call is left as the poster wrote it.
    add<<<1, 5>>>(a, b, c);

    for (int i = 0; i < 5; i++)
        printf("%f\n", c[i]);
}

然后make的时候终端提示错误:

main.cu: In function ‘int main()’:
/usr/bin/ld: cannot find -lcutil_i386
/usr/bin/ld: cannot find -lshrutil_i386
collect2: ld returned 1 exit status
make: *** […/…/bin/linux/release/HELLOCUDA] 错误 1
例子错误也是这样

求解。
[/i][/i][/i][/i]

需要用-L指定这两个链接库的路径。

好东西,顶一下

这个例子其实是不完整的

完整的:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>

// Element-wise vector addition: c[k] = a[k] + b[k] for every k in [0, N).
//
// Uses a grid-stride loop: each thread starts at its global index and then
// advances by the total number of launched threads, so any 1D grid/block
// configuration covers all N elements and never indexes past the end.
//
// a, b : input arrays in device memory, at least N elements each
// c    : output array in device memory, at least N elements
// N    : number of elements to process
__global__ void add(const float *a, const float *b, float *c, int N)
{
    // Total number of threads in the grid; loop-invariant, so computed once.
    const int stride = gridDim.x * blockDim.x;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < N; tid += stride)
    {
        c[tid] = a[tid] + b[tid];
    }
}
// Number of elements in each vector. Kept below the kernel definition because
// the kernel's last parameter is also spelled N.
#define N 100

// Evaluate a CUDA runtime call; on failure print the location and the CUDA
// error string, then terminate the program.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Adds two N-element vectors on the GPU and prints the result, one value per
// line: fill host inputs, copy them to the device, launch the kernel, copy
// the result back, print it, and release the device buffers.
int main()
{
    const int thread_size = 256;
    // Ceiling division so that block_size * thread_size >= N.
    const int block_size = (N + thread_size - 1) / thread_size;
    const size_t bytes = N * sizeof(float);

    float h_a[N];
    float h_b[N];
    float h_c[N];
    for (int i = 0; i < N; i++)
    {
        h_a[i] = (float)i;
        h_b[i] = 2.0f * i;
    }

    float *d_a = NULL;
    float *d_b = NULL;
    float *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    add<<<block_size, thread_size>>>(d_a, d_b, d_c, N);
    // A kernel launch returns no status; query launch errors explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel to finish before reading d_c.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++)
        printf("%f\n", h_c[i]);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    return 0;
}