多线程调用 CUDA DLL 库函数的问题

我在 Visual Studio 2005 环境下编写 C# 程序。程序中开了 4 个线程,每个线程都调用 CUDA DLL 中的同一个函数,测得总运算时间为 33 秒;作为对比,我用单线程连续调用该 DLL 中的同一个函数 4 次,用时只有 11 秒多。为什么多线程反而更耗时?(4 核 CPU,GTX 260 显卡)

原代码如下:

namespace ThreadTest0406
{
    using System;
    using System.Diagnostics;
    using System.Runtime.InteropServices;
    using System.Threading;

    /// <summary>
    /// Console benchmark that times four calls to the native <c>SUMVALUE</c> export of
    /// <c>sumtest.dll</c>: first with one call on each of four concurrently running
    /// threads, then with four back-to-back calls on the main thread. The elapsed
    /// wall-clock time of each pass is printed in milliseconds.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Native entry point exported by <c>sumtest.dll</c>. Takes no arguments and
        /// returns an <c>int</c>.
        /// </summary>
        // NOTE(review): no CallingConvention is specified, so the P/Invoke default is
        // used - confirm it matches how the DLL exports SUMVALUE.
        [DllImport("sumtest.dll")]
        static extern int SUMVALUE();

        // One result slot per call; in the threaded pass each slot is written by
        // exactly one worker thread.
        static int result1, result2, result3, result4;

        /// <summary>
        /// Runs the multi-threaded pass, then the single-threaded pass, printing the
        /// elapsed milliseconds of each, and finally waits for console input.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Stopwatch sw = new Stopwatch();
            int usetime;

            Thread thread1 = new Thread(Fun1);
            Thread thread2 = new Thread(Fun2);
            Thread thread3 = new Thread(Fun3);
            Thread thread4 = new Thread(Fun4);

            // NOTE(review): this pass runs first, so any one-time cost of the first
            // SUMVALUE call (the DLL is loaded lazily on first use; presumably the
            // CUDA runtime/context is also initialised there) is charged to the
            // multi-threaded figure. Consider a warm-up call or swapping the order
            // of the two passes before comparing the numbers.
            sw.Reset();
            sw.Start();
            thread1.Start();
            thread2.Start();
            thread3.Start();
            thread4.Start();

            // Block until every worker has finished so the stopwatch covers all four calls.
            thread1.Join();
            thread2.Join();
            thread3.Join();
            thread4.Join();
            sw.Stop();
            usetime = (int)sw.ElapsedMilliseconds;
            Console.WriteLine("MultThread:" + usetime.ToString());

            // Same four calls, issued sequentially on the main thread.
            sw.Reset();
            sw.Start();
            result1 = SUMVALUE();
            result2 = SUMVALUE();
            result3 = SUMVALUE();
            result4 = SUMVALUE();
            sw.Stop();
            usetime = (int)sw.ElapsedMilliseconds;
            Console.WriteLine("SingleThread:" + usetime.ToString());

            // Keep the console window open until a key is entered.
            Console.Read();
        }

        /// <summary>Worker for thread 1: stores one SUMVALUE result in result1.</summary>
        static void Fun1()
        {
            result1 = SUMVALUE();
        }

        /// <summary>Worker for thread 2: stores one SUMVALUE result in result2.</summary>
        static void Fun2()
        {
            result2 = SUMVALUE();
        }

        /// <summary>Worker for thread 3: stores one SUMVALUE result in result3.</summary>
        static void Fun3()
        {
            result3 = SUMVALUE();
        }

        /// <summary>Worker for thread 4: stores one SUMVALUE result in result4.</summary>
        static void Fun4()
        {
            result4 = SUMVALUE();
        }
    }
}