我照着别人的例子写了一个 CUDA 程序(driver、toolkit 和 SDK 都已经安装好):
#include <stdio.h>
#include <cutil_inline.h>
// Kernel: element-wise vector addition, C[i] = A[i] + B[i].
//
// Each thread handles the single element selected by threadIdx.x, so the
// kernel is meant for a single-block launch with one thread per element
// (e.g. add<<<1, 5>>> for 5 elements). There is no bounds check: the number
// of threads launched must not exceed the length of the arrays.
//
// A, B : input vectors
// C    : output vector
__global__ void add(float *A, float *B, float *C)
{
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}
// Host entry point of the question's example: adds two 5-element vectors
// with the `add` kernel and prints the five results.
//
// NOTE(review): a, b and c are ordinary host arrays, yet they are handed
// directly to the kernel. Device code must not dereference host pointers;
// a working version has to allocate device buffers (cudaMalloc), copy the
// inputs in (cudaMemcpy), launch on the device buffers and copy the result
// back before printing. The structure is kept as the asker wrote it.
int main()
{
    float a[5] = {1, 2, 3, 4, 5};
    float b[5] = {5, 4, 3, 2, 1};
    float c[5] = {0, 0, 0, 0, 0};

    // launch the kernel: one block of 5 threads, one thread per element
    add<<<1, 5>>>(a, b, c);

    for (int i = 0; i < 5; i++)
        printf("%f\n", c[i]);
}
然后make的时候终端提示错误:
main.cu: In function ‘int main()’:
/usr/bin/ld: cannot find -lcutil_i386
/usr/bin/ld: cannot find -lshrutil_i386
collect2: ld returned 1 exit status
make: *** [../../bin/linux/release/HELLOCUDA] 错误 1
编译 SDK 自带的例子时也是同样的错误。
求解。
这个例子其实是不完整的:数组没有用 cudaMalloc 分配到显存,也没有用 cudaMemcpy 在主机和设备之间拷贝数据。
完整的版本如下(不依赖 cutil):
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
// Kernel: element-wise vector addition, c[k] = a[k] + b[k] for 0 <= k < n.
//
// Uses a grid-stride loop: each thread starts at its global index and then
// advances by the total number of threads in the grid, so every element is
// covered for any launch configuration and no thread touches memory at or
// beyond index n.
//
// a, b : input vectors of at least n floats (device pointers)
// c    : output vector of at least n floats (device pointer)
// n    : number of elements to process (lower-case so the name cannot
//        collide with a macro called N)
__global__ void add(const float *a, const float *b, float *c, int n)
{
    const int stride = gridDim.x * blockDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n; tid += stride)
    {
        c[tid] = a[tid] + b[tid];
    }
}
// Number of elements in each vector.
#define N 100

// Evaluates a CUDA runtime call and aborts with file/line and the runtime's
// error string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Host entry point: fills two N-element vectors (h_a[i] = i, h_b[i] = 2*i),
// adds them on the GPU with the `add` kernel and prints the N sums, one per
// line. Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main()
{
    const int thread_size = 256;                                     // threads per block
    const int block_size = (N + thread_size - 1) / thread_size;      // ceil(N / thread_size) blocks
    const size_t bytes = N * sizeof(float);

    float h_a[N];
    float h_b[N];
    float h_c[N];
    for (int i = 0; i < N; i++)
    {
        h_a[i] = (float)i;
        h_b[i] = (float)(2 * i);
    }

    // Device buffers: the kernel can only read and write device memory.
    float *d_a = NULL;
    float *d_b = NULL;
    float *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c, bytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    add<<<block_size, thread_size>>>(d_a, d_b, d_c, N);
    // A kernel launch returns nothing; a bad launch configuration only
    // shows up through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: also waits for the kernel to finish before reading d_c.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++)
        printf("%f\n", h_c[i]);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    return 0;
}