#include <stdio.h>
#include <time.h>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Micro-benchmark for cudaMemcpyAsync enqueue cost.
//
// Reads two integers from stdin:
//   Times - number of host-to-device copies to enqueue on one stream
//   Size  - size of each copy in BYTES (also the size of both buffers)
//
// Prints two lines, both in clock() ticks measured from just before the
// first enqueue:
//   1. time until all cudaMemcpyAsync calls have returned to the host
//   2. time until the stream has finished executing all copies
//
// Returns 0 on success, 1 on bad input or any CUDA error.
int main() {
    int Times = 0, Size = 0;
    // Reject unparsable input and a negative Size, which would otherwise
    // wrap around to a huge size_t when passed to the allocators.
    if (scanf("%d%d", &Times, &Size) != 2 || Size < 0) {
        fprintf(stderr, "expected two integers: <Times> <Size in bytes, >= 0>\n");
        return 1;
    }
    const size_t bytes = (size_t)Size;

    float* hmem = NULL;          // pinned host source buffer (contents irrelevant)
    float* dmem = NULL;          // device destination buffer
    cudaStream_t stream = 0;
    bool haveStream = false;
    int rc = 1;

    // Single-pass loop: `break` jumps to the shared cleanup code below.
    do {
        // Pinned (page-locked) host memory is required for the copy to be
        // truly asynchronous with respect to the host.
        if (!cudaOk(cudaHostAlloc((void**)&hmem, bytes, cudaHostAllocDefault),
                    "cudaHostAlloc")) break;
        if (!cudaOk(cudaMalloc((void**)&dmem, bytes), "cudaMalloc")) break;
        if (!cudaOk(cudaStreamCreate(&stream), "cudaStreamCreate")) break;
        haveStream = true;

        cudaError_t copyErr = cudaSuccess;
        const clock_t start = clock();
        for (int i = 0; i < Times && copyErr == cudaSuccess; i++) {
            // The copy direction must be given explicitly; the stream is the
            // fifth argument, not the fourth.
            copyErr = cudaMemcpyAsync(dmem, hmem, bytes,
                                      cudaMemcpyHostToDevice, stream);
        }
        // NOTE(review): an async enqueue is not guaranteed to return
        // immediately. Presumably the driver's queue of pending work is
        // finite, so once enough copies are outstanding each further call
        // waits for a slot and this interval starts to scale with Times.
        const clock_t asyncEnd = clock();

        // Wait only for the work submitted above (replaces the deprecated
        // cudaThreadSynchronize()).
        const cudaError_t syncErr = cudaStreamSynchronize(stream);
        const clock_t syncEnd = clock();

        if (!cudaOk(copyErr, "cudaMemcpyAsync")) break;
        if (!cudaOk(syncErr, "cudaStreamSynchronize")) break;

        // clock_t has no portable printf conversion; print as long.
        printf("%ld\n%ld\n", (long)(asyncEnd - start), (long)(syncEnd - start));
        rc = 0;
    } while (0);

    // Release everything that was successfully acquired; failures here are
    // reported but do not change the exit status.
    if (haveStream) cudaOk(cudaStreamDestroy(stream), "cudaStreamDestroy");
    if (dmem) cudaOk(cudaFree(dmem), "cudaFree");
    if (hmem) cudaOk(cudaFreeHost(hmem), "cudaFreeHost");
    return rc;
}
当指定的Times较小时(比如1000以下),打印出来的asyncEnd-start时间为0;而当Times大于1030时(在我的机器上是这样,SDK 3.0),打印出来的asyncEnd-start开始随着Times线性增大!
这是为什么?