程序的主要结构如下:
// Pseudocode sketch of a two-GPU program: memory is allocated once on device 0
// and device 1; each loop iteration then issues a host-to-device copy, three
// kernels and a device-to-host copy per device. Two host-side clock() timers
// are accumulated: Si2 around the issue phase, Si1 around cudaDeviceSynchronize().
main()
{
// One-time allocation, once with device 0 current and once with device 1 current.
cudaSetDevice(0)
cudaMalloc()
cudaSetDevice(1)
cudaMalloc()
// NOTE(review): i is neither initialized nor incremented anywhere in this sketch.
while (i<100000)
{
some cpu code;// accounts for no more than 5% of the time
// Si2 timer starts: covers every call issued up to end_Si2 below.
start_Si2 = clock();
// Host-to-device copies, one issued with each device current.
// NOTE(review): no stream or host-buffer arguments are shown; whether these
// copies actually return asynchronously depends on pinned host memory and
// the stream used -- confirm against the real code.
cudaSetDevice(0)
cudaMemcpyAsyncHtoD()
cudaSetDevice(1)
cudaMemcpyAsyncHtoD()
// Device 0: three kernel launches followed by a device-to-host copy.
// NOTE(review): the launches show no stream argument; presumably they go to
// the default stream -- confirm.
cudaSetDevice(0)
kernel1<<<N,M>>>
kernel2<<<N,M>>>
kernel3<<<N,M>>>
cudaMemcpyAsyncDtoH()
// Device 1: the same sequence of three kernels and a device-to-host copy.
cudaSetDevice(1)
kernel1<<<N,M>>>
kernel2<<<N,M>>>
kernel3<<<N,M>>>
cudaMemcpyAsyncDtoH()
// Si2 timer stops. NOTE(review): if the calls above return before the GPU work
// finishes, Si2 reflects host-side issue time rather than GPU execution time.
end_Si2=clock()
CPU_Si2=CPU_Si2+(end_Si2-start_Si2);
// Si1 timer: covers only the cudaDeviceSynchronize() call.
// NOTE(review): the last cudaSetDevice above selected device 1, and
// cudaDeviceSynchronize() is documented to wait on the current device only,
// so device 0 is presumably never explicitly synchronized here -- confirm.
start_Si1 = clock();
cudaDeviceSynchronize()
end_Si1=clock()
CPU_Si1=CPU_Si1+(end_Si1-start_Si1);
}
}
最后测得的结果(各部分占总运行时间的百分比)如下:
Si1(cudaDeviceSynchronize() 的等待时间):50-60%
Si2(发起拷贝和 kernel 调用所用的时间):20-30%
运行环境:Win7,VC2010 Express + CUDA 4.0
把所有 cudaSetDevice 调用都改为 cudaSetDevice(0)(即只使用一块 GPU)之后,运行时间与上述双 GPU 代码的运行时间相同。为什么?
求解!!
再次求解啊!感激不尽!