以下代码是在 OpenHero 向导(wizard)生成的 sample.cu 基础上直接修改得到的,
编译时出现如下错误,请各位高手帮忙解惑,谢谢!:)
出错的原因应该在 mult_ 函数中,但我没看出具体是哪里有问题。
1>sample.cu
1>tmpxft_00000d28_00000000-3_sample.cudafe1.gpu
1>tmpxft_00000d28_00000000-8_sample.cudafe2.gpu
1>### Assertion failure at line 123 of ../../be/cg/NVISA/expand.cxx:
1>### Compiler Error in file C:\Users\CGILab\AppData\Local\Temp/tmpxft_00000d28_00000000-9_sample.cpp3.i during Code_Expansion phase:
1>### unexpected mtype
1>nvopencc ERROR: D:\CUDA\bin/../open64/lib//be.exe returned non-zero status 1
/********************************************************************
* sample.cu
* This is a example of the CUDA program.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
/************************************************************************/
/* Init CUDA */
/************************************************************************/
#if __DEVICE_EMULATION__
// Emulation build: no device is queried or selected, initialization
// always reports success.
bool InitCUDA(void) { return true; }
#else
/*
 * Selects the first device whose reported compute-capability major version
 * is at least 1 and makes it the current device.
 *
 * Returns true when a device was selected successfully. On any failure a
 * diagnostic is written to stderr and false is returned.
 */
bool InitCUDA(void)
{
    int count = 0;

    // A failing query must not be confused with "zero devices found".
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // Stop at the first device whose properties can be read and whose
    // major version is >= 1; i == count afterwards means none qualified.
    int i = 0;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    // Only report success if the device could actually be selected.
    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
        return false;
    }

    printf("CUDA initialized.\n");
    return true;
}
#endif
// Matrix product: multiplies the m x t matrix x by the t x n matrix y and
// stores the m x n result in r. All three matrices are dense row-major.
//
//   r : output, m * n floats (must not overlap x or y)
//   x : left operand,  m * t floats
//   y : right operand, t * n floats
//
// Callable from both host and device code so it can be checked on the CPU.
__host__ __device__ void mult_ (float *r, const float *x, const float *y, int m, int t, int n)
{
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            // Accumulate locally and store once instead of read-modify-write
            // on r for every term.
            float acc = 0.0f;
            for (int k = 0; k < t; k++)
            {
                // Element (k, j) of a row-major t x n matrix is at k * n + j
                // (the row stride of y is n, not t).
                acc += x[i * t + k] * y[k * n + j];
            }
            r[i * n + j] = acc;
        }
    }
}
/*
 * Returns a depth value of either 1 or 10000.
 *
 * Side effect: li is overwritten with the product of a local 3x3 matrix
 * (only its first element is 1, the rest are 0) and the 3x1 vector lc.
 * lipt and lcpt are accepted but not read or written here.
 *
 * NOTE(review): both direction vectors are all zeros in this version, so it
 * looks like unfinished placeholder logic - confirm the intended inputs.
 */
__device__ float Depth_LineCross3d(float li[3], float lc[3], float lipt[3], float lcpt[3])
{
    const bool sameDirection = true;
    float camDir[3] = {0.0f, 0.0f, 0.0f};
    float imgDir[3] = {0.0f, 0.0f, 0.0f};
    float transform[3][3] = {{1.0f, 0.0f, 0.0f},
                             {0.0f, 0.0f, 0.0f},
                             {0.0f, 0.0f, 0.0f}};

    // li = transform (3x3) * lc (3x1); must run before li is read below.
    mult_(li, &transform[0][0], lc, 3, 3, 1);

    // Dot products of each line vector with its direction vector.
    float camDot = lc[0] * camDir[0] + lc[1] * camDir[1] + lc[2] * camDir[2];
    float imgDot = li[0] * imgDir[0] + li[1] * imgDir[1] + li[2] * imgDir[2];

    // Same-direction case rejects on either sign test; otherwise only the
    // imgDot test applies.
    bool rejected;
    if (sameDirection) {
        rejected = (camDot < 0) || (imgDot > 0);
    } else {
        rejected = (imgDot > 0);
    }

    return rejected ? 10000.0f : 1.0f;
}
/*
 * Each thread computes one value for the element at column x, row y
 * (derived from the 2D block/thread indices) and stores it in
 * output[y * width + x]. Threads outside width x height do nothing.
 *
 *   output  : destination buffer; written at index y * width + x
 *   iImgNum : not used by this kernel
 *   width   : number of columns; non-positive means no work
 *   height  : number of rows; non-positive means no work
 */
__global__ void IBR_Kernel(char *output, int iImgNum, int width, int height)
{
    // Reject non-positive sizes up front: compared against the unsigned
    // indices below, a negative value would turn into a huge bound and let
    // every thread through.
    if (width <= 0 || height <= 0)
        return;

    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    float minDepth = 100000.0f;
    float maxOfMin = 0.0f;
    float li[3] = {0, 0, 0}, lc[3] = {0, 0, 0}, lipt[3] = {0, 0, 0}, lcpt[3] = {0, 0, 0};

    float depth = Depth_LineCross3d(li, lc, lipt, lcpt);
    if (depth < minDepth)
        minDepth = depth;
    if (minDepth > maxOfMin)
        maxOfMin = minDepth;

    // First find the nearest reference image; this is done on the CPU.
    // Map the maxOfMin depth back to the nearest reference image.
    // NOTE(review): the float result is narrowed to char on store; values
    // outside the range of char cannot be represented - confirm the intended
    // output encoding.
    // Index computed in size_t so y * width cannot wrap in 32 bits.
    output[(size_t)y * (size_t)width + x] = (char)maxOfMin;
}
/************************************************************************/
/* HelloCUDA */
/************************************************************************/
/*
 * Program entry point: initializes CUDA, allocates an 11-byte device buffer,
 * runs IBR_Kernel once while timing it, copies the buffer back and prints it
 * as a C string.
 *
 * Returns 0 on success and EXIT_FAILURE when CUDA could not be initialized.
 */
int main(int argc, char* argv[])
{
    if (!InitCUDA()) {
        // Report the failure to the caller instead of exiting with success.
        return EXIT_FAILURE;
    }

    const size_t resultBytes = sizeof(char) * 11;
    char *device_result = 0;
    // One byte larger than the device buffer and zero-filled, so the copied
    // data is always NUL-terminated when printed with %s.
    char host_result[12] = {0};

    cutilSafeCall( cudaMalloc((void**) &device_result, resultBytes));
    // cudaMalloc does not clear memory; zero it so that bytes the kernel
    // does not write are well-defined when copied back and printed.
    cutilSafeCall( cudaMemset(device_result, 0, resultBytes));

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // NOTE(review): width and height are both 0 here, so every thread exits
    // at the kernel's bounds check and nothing is written - confirm the
    // intended dimensions.
    IBR_Kernel<<<1, 1, 0>>>(device_result, 11, 0, 0);
    cutilCheckMsg("Kernel execution failed\n");

    // Wait for the kernel before stopping the timer; errors raised during
    // execution surface here, so the result is checked.
    cutilSafeCall( cudaThreadSynchronize());

    cutilCheckError( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    cutilSafeCall( cudaMemcpy(host_result, device_result, resultBytes, cudaMemcpyDeviceToHost));
    printf("%s\n", host_result);

    cutilSafeCall( cudaFree(device_result));
    return 0;
}
[ 本帖最后由 lyso3 于 2010-3-11 16:11 编辑 ]