是这样的:我按照CUDA学习书上的示例代码,把一个关于矩阵赋值的问题分成了两个代码文件:主机端代码是example_1.cu,设备端代码是example_1_kernel.cu。
然后在链接中出现这样的错误:
1>------ Build started: Project: maxtrixAssign, Configuration: Debug x64 ------
1>Compiling with CUDA Build Rule…
1>“C:\CUDA\bin64\nvcc.exe” -arch sm_10 -ccbin “c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin” -Xcompiler “/EHsc /W3 /nologo /Od /Zi /RTC1 /MTd " -I"C:\CUDA\include” -I"C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\common\inc" -maxrregcount=32 --compile -o “x64\Debug\example_1_kernel.cu.obj” “e:\CUDA\Projects\maxtrixAssign\maxtrixAssign\example_1_kernel.cu”
1>example_1_kernel.cu
1>tmpxft_00000960_00000000-3_example_1_kernel.cudafe1.gpu
1>tmpxft_00000960_00000000-8_example_1_kernel.cudafe2.gpu
1>tmpxft_00000960_00000000-3_example_1_kernel.cudafe1.cpp
1>tmpxft_00000960_00000000-12_example_1_kernel.ii
1>Compiling with CUDA Build Rule…
1>“C:\CUDA\bin64\nvcc.exe” -arch sm_10 -ccbin “c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin” -Xcompiler “/EHsc /W3 /nologo /Od /Zi /RTC1 /MTd " -I"C:\CUDA\include” -I"C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\common\inc" -maxrregcount=32 --compile -o “x64\Debug\example_1.cu.obj” “e:\CUDA\Projects\maxtrixAssign\maxtrixAssign\example_1.cu”
1>example_1.cu
1>tmpxft_0000141c_00000000-3_example_1.cudafe1.gpu
1>tmpxft_0000141c_00000000-8_example_1.cudafe2.gpu
1>tmpxft_0000141c_00000000-3_example_1.cudafe1.cpp
1>tmpxft_0000141c_00000000-12_example_1.ii
1>Linking…
1>example_1_kernel.cu.obj : error LNK2005: _device_stub__Z10testKernelPfS already defined in example_1.cu.obj
1>example_1_kernel.cu.obj : error LNK2005: “void __cdecl testKernel__entry(float *,float *)” (?testKernel__entry@@YAXPEAM0@Z) already defined in example_1.cu.obj
1>E:\CUDA\Projects\maxtrixAssign\x64\Debug\maxtrixAssign.exe : fatal error LNK1169: one or more multiply defined symbols found
1>Build log was saved at “file://e:\CUDA\Projects\maxtrixAssign\maxtrixAssign\x64\Debug\BuildLog.htm”
意思大概就是出现了多个重复定义。但是我对照头文件去找,并没有发现重复定义的变量或者函数。这到底是怎么回事啊?头都大了!
下面是两个文件的源代码:
example_1.cu
/////////////////////////////////////////////////////////
// Host-side code for the matrix-assignment example.
/////////////////////////////////////////////////////////
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
//using namespace std;
#include <cutil.h>//CUTIL helpers from the CUDA SDK (CUT_*, CUDA_SAFE_CALL)
// NOTE(review): including the kernel .cu file here while the CUDA build rule
// ALSO compiles example_1_kernel.cu as its own translation unit (see the build
// log above: both .cu files are passed to nvcc --compile) defines testKernel
// twice — that is exactly the LNK2005/LNK1169 duplicate-symbol link error.
// The include guard only protects a single translation unit; it cannot help
// across two separate object files. Exclude example_1_kernel.cu from the
// build (or stop including it here) so the kernel is compiled only once.
#include <example_1_kernel.cu>//kernel function, device-side (GPU) code
// Forward declaration.
void runTest(int argc, char **argv);
// Entry point: run the example, then hand control to the CUTIL exit helper.
int main(int argc, char **argv)
{
    runTest(argc, argv);
    CUT_EXIT(argc, argv);  // CUTIL exit macro from cutil.h
}
////////////////////////////////////////////////////////
// Allocates a num_blocks x num_threads float matrix filled with 1.0f,
// launches testKernel so each thread scales one element by its block
// index, then copies the result back and prints it one row per block.
////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    CUT_DEVICE_INIT(argc, argv);   // pick and initialize the CUDA device (CUTIL helper)
    unsigned int num_blocks  = 4;  // thread blocks in the grid
    unsigned int num_threads = 4;  // threads per block
    // One float per thread; also reused as the dynamic shared-memory size below.
    unsigned int mem_size = sizeof(float) * num_threads * num_blocks;
    // Host-side buffers (h_ = host, i = input, o = output).
    float * h_idata = (float * ) malloc(mem_size);
    float * h_odata = (float * ) malloc(mem_size);
    // Device-side buffers (d_ = device).
    // BUG FIX: these must be POINTERS and cudaMalloc takes a void** —
    // the original declared plain floats and cast their address to (void*),
    // which overwrites the stack and hands the kernel a garbage address.
    float * d_idata;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_idata, mem_size));
    float * d_odata;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_odata, mem_size));
    // Initialize the host input.
    for(unsigned int i = 0; i < num_threads * num_blocks; i++)
        h_idata[i] = 1.0f;
    // Copy the input from host memory to device memory.
    CUDA_SAFE_CALL(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
    // Execution configuration: grid shape and block shape.
    dim3 grid(num_blocks, 1, 1);
    dim3 threads(num_threads, 1, 1);
    // Launch the kernel; the third parameter is the dynamic shared-memory
    // size per block consumed by "extern __shared__" in the kernel.
    testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
    // Check that the launch succeeded (kernel faults surface asynchronously).
    // BUG FIX: the string literals below used smart quotes from the web
    // paste, which is not valid C syntax — restored plain double quotes.
    CUT_CHECK_ERROR("Kernel execution failed");
    // Copy the result from device memory back to host memory.
    CUDA_SAFE_CALL(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));
    // Print the result, one row per block.
    for(unsigned int i = 0; i < num_blocks; i++)
    {
        for(unsigned int j = 0; j < num_threads; j++)
        {
            printf("%5.0f", h_odata[i * num_threads + j]);
        }
        printf("\n");
    }
    // Release host and device memory.
    free(h_idata);
    free(h_odata);
    CUDA_SAFE_CALL(cudaFree(d_idata));
    CUDA_SAFE_CALL(cudaFree(d_odata));
}
下面是设备端代码 example_1_kernel.cu:
#ifndef _EXAMPLE_1_KERNEL_H
#define _EXAMPLE_1_KERNEL_H
////////////////////////////////////////////////////////
// Each thread loads one element from global memory into dynamic shared
// memory, multiplies it by its block index, and writes it back.
// Launch requirement: dynamic shared memory of at least
// blockDim.x * sizeof(float) bytes (the host passes mem_size as the
// third <<<>>> launch parameter).
// BUG FIX: the kernel qualifier must be __global__ (the paste dropped
// the underscores, leaving the invalid token "global"), and dynamic
// shared memory must be declared "extern __shared__ float sdata[];"
// — an array, not a scalar float, or sdata[...] below will not compile.
////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata)
{
    // Dynamic shared memory; size supplied by the third launch parameter.
    extern __shared__ float sdata[];
    const unsigned int bid = blockIdx.x;
    const unsigned int tid_in_block = threadIdx.x;
    // Thread position within the whole grid (row-wise task partition).
    const unsigned int tid_in_grid = blockDim.x * blockIdx.x + threadIdx.x;
    // Load from global into shared memory; barrier so every element is
    // in place before any thread computes.
    sdata[tid_in_block] = g_idata[tid_in_grid];
    __syncthreads();
    // Compute; barrier so the updated values are visible to all threads
    // before they are written out.
    sdata[tid_in_block] *= (float)bid;
    __syncthreads();
    // Write the result from shared memory back to global memory.
    g_odata[tid_in_grid] = sdata[tid_in_block];
}
#endif // _EXAMPLE_1_KERNEL_H