求助:cudaMemcpyAsync失败了~

想试试cudaMemcpyAsync的分多个流来导数据,编译通过了,但是运行的时候cudaMemcpyAsync函数运行失败了,代码如下:
float* h_signal = (float*)malloc(sizeof(float) * SIGNAL_SIZE);
float* h_signal2 = (float*)malloc(sizeof(float) * SIGNAL_SIZE);
if( cutCheckCmdLineFlag(argc, (const char**)argv, “device”) )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
Complex* d_signal;
cutilSafeCall(cudaMalloc((void**)&d_signal, sizeof(float) *SIGNAL_SIZE));

float* compArray = new float[SIGNAL_SIZE];
for(unsigned int Num = 0;Num<ROUND;Num++)
{
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
float x;
x = rand();
h_signal[i] = x;
compArray[i] = x;
}
cudaStream_t stream;
cudaStreamCreate(&stream);
int size=sizeof(float) * SIGNAL_SIZE;
cutilCheckError(cutStartTimer(timer));
cutilCheckError(cutStartTimer(timeri));
cudaMemcpyAsync(d_signal, h_signal,size,cudaMemcpyHostToDevice,stream);//这句
cudaThreadSynchronize();
cutilCheckError(cutStopTimer(timeri));
cutilCheckError(cutStartTimer(timerp));
cutilCheckError(cutStopTimer(timerp));
cutilCheckError(cutStartTimer(timero));
cudaMemcpyAsync(h_signal2,d_signal,size,cudaMemcpyDeviceToHost,stream);//这句
cudaThreadSynchronize();
cutilCheckError(cutStopTimer(timero));
cutilCheckError(cutStopTimer(timer));
cudaStreamDestroy(stream);

}
printf(“GPU Processing time(I): %f (ms) \n”, cutGetTimerValue(timeri));
printf(“GPU Processing time(P): %f (ms) \n”, cutGetTimerValue(timerp));
printf(“GPU Processing time(O): %f (ms) \n”, cutGetTimerValue(timero));
printf(“GPU Processing time(A): %f (ms) \n”, cutGetTimerValue(timer));

for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
fout<<h_signal2[i]<<" “<<compArray[i]<<”\n";
}

fout.close();
fout.clear();

输出的文件里面h_signal2还是初始值不是h_signal里面的数!!!!!!

求高手帮忙啊~~~~谢谢啦!!!!