请问编译错误 unexpected mtype (附完全代码)

如下代码是直接在openhero的wizard生成的sample.cu上做的更改,
编译时会出现如下错误,请达人解疑,谢谢!:)

出错的原因应该是mult_函数,但我没看出来为啥

1>sample.cu
1>tmpxft_00000d28_00000000-3_sample.cudafe1.gpu
1>tmpxft_00000d28_00000000-8_sample.cudafe2.gpu
1>### Assertion failure at line 123 of ../../be/cg/NVISA/expand.cxx:
1>### Compiler Error in file C:\Users\CGILab\AppData\Local\Temp/tmpxft_00000d28_00000000-9_sample.cpp3.i during Code_Expansion phase:
1>### unexpected mtype
1>nvopencc ERROR: D:\CUDA\bin/../open64/lib//be.exe returned non-zero status 1
/********************************************************************
*  sample.cu
*  This is a example of the CUDA program.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
/************************************************************************/
/* Init CUDA                                                            */
/************************************************************************/
#if __DEVICE_EMULATION__
bool InitCUDA(void){return true;}
#else
bool InitCUDA(void)
{
 int count = 0;
 int i = 0;
 cudaGetDeviceCount(&count);
 if(count == 0) {
  fprintf(stderr, "There is no device.\n");
  return false;
 }
 for(i = 0; i < count; i++) {
  cudaDeviceProp prop;
  if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
   if(prop.major >= 1) {
   break;
   }
  }
 }
 if(i == count) {
  fprintf(stderr, "There is no device supporting CUDA.\n");
  return false;
 }
 cudaSetDevice(i);
 printf("CUDA initialized.\n");
 return true;
}
#endif
//矩阵相乘,m x t 的矩阵 与 t x n 的矩阵相乘,得到m x n 的矩阵
__device__ void mult_ (float *r, const float*x, const float*y, int m, int t, int n)
{
 int i, j, k;
 for(i = 0; i < m; i ++)
 {
  for(j = 0; j < n; j ++)
  {
   r[i * n + j] = 0;
   for(k = 0; k < t; k ++)
   r[i * n + j] += x[i * t + k] * y[j + k * t];
  }
 }
}
__device__ float Depth_LineCross3d(float li[3], float lc[3], float lipt[3], float lcpt[3])
{
 float depth = 1;
 bool bSameDir = true;
 float dirc[3] = {0,}, diri[3] = {0,}, test[3][3] = {1,};
 mult_(li, *test, lc, 3, 3, 1);
 float fc = lc[0] * dirc[0] + lc[1] * dirc[1] + lc[2] * dirc[2];
 float fi = li[0] * diri[0] + li[1] * diri[1] + li[2] * diri[2];
 if(  ( bSameDir  && (fc < 0 || fi > 0 ) )
  || ((!bSameDir) && (fi > 0    ) ))
  depth = 10000;
 return depth;
}
__global__ void IBR_Kernel(char *output, int iImgNum, int width, int height)
{
 //  unsigned int z = 0;
 const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
 const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 if(x >= width || y >= height)
  return;

 float minDepth = 100000,
  maxOfMin = 0;
 float li[3] = {0,0,0}, lc[3] = {0,0,0}, lipt[3] = {0,0,0}, lcpt[3] = {0,0,0};
 float depth = Depth_LineCross3d(li, lc, lipt, lcpt); 
 if(depth < minDepth)
  minDepth = depth;
 if(minDepth > maxOfMin)
  maxOfMin = minDepth;
 //先找到最近的参考图像,这里在CPU中完成了
 //将maxOfMin的深度映射回最近的参考图像
 output[y * width + x] = maxOfMin;// = {200, 0, 0, 0};//tex3D(texRef3D, x, y, z);
}

/************************************************************************/
/* HelloCUDA                                                            */
/************************************************************************/
int main(int argc, char* argv[])
{
 if(!InitCUDA()) {
  return 0;
 }
 char *device_result = 0;
 char host_result[12] ={0};
 cutilSafeCall( cudaMalloc((void**) &device_result, sizeof(char) * 11));
 unsigned int timer = 0;
 cutilCheckError( cutCreateTimer( &timer));
 cutilCheckError( cutStartTimer( timer));
 IBR_Kernel<<<1, 1, 0>>>(device_result, 11, 0, 0);
 cutilCheckMsg("Kernel execution failed\n");
 cudaThreadSynchronize();
 cutilCheckError( cutStopTimer( timer));
 printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
 cutilCheckError( cutDeleteTimer( timer));
 cutilSafeCall( cudaMemcpy(host_result, device_result, sizeof(char) * 11, cudaMemcpyDeviceToHost));
 printf("%s\n", host_result);
 cutilSafeCall( cudaFree(device_result));
 return 0;
}

[ 本帖最后由 lyso3 于 2010-3-11 16:11 编辑 ]