我仿照书上的例子写了一个使用多个流(stream)的程序,但是用 profiler 运行后发现并没有出现 overlap(重叠)。不知道是不是我的用法有问题,麻烦大家帮忙看一下。程序如下:
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N (1024*1024)
#define FULL_DATASIZE (N*30)
/*
 * Element-wise test kernel, one thread per output element.
 *
 * For global index g < N it sums a[g] with two more entries of `a` whose
 * indices are (g + 1) % 256 and (g + 2) % 256 -- i.e. always taken from the
 * first 256 elements of the buffer -- does the same for `b`, and stores the
 * mean of the two sums in c[g].  Each three-term sum is computed in int and
 * then converted to float; the float mean is converted back to int on store.
 *
 * Launch layout: 1-D grid of 1-D blocks.  Threads whose global index is
 * >= N do nothing, so the grid may be larger than N.
 */
__global__ void kernel(int *a, int *b, int *c)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the tail of the grid: nothing to do past the end of the chunk.
    if (gid >= N)
    {
        return;
    }

    // The two companion indices wrap inside [0, 255].
    const int wrapped1 = (gid + 1) % 256;
    const int wrapped2 = (gid + 2) % 256;

    const float sumA = (a[gid] + a[wrapped1] + a[wrapped2]);
    const float sumB = (b[gid] + b[wrapped1] + b[wrapped2]);

    c[gid] = (sumA + sumB) / 2;
}
/* Abort with a diagnostic if a CUDA runtime call did not return cudaSuccess. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Multi-stream copy/compute pipeline benchmark.
 *
 * Processes FULL_DATASIZE ints from pinned host buffers a and b in chunks of
 * N elements, spreading consecutive chunks over three CUDA streams.  Each
 * stream does: H2D copy of a, H2D copy of b, kernel, D2H copy of c.  The
 * elapsed GPU time for the whole pipeline is printed in milliseconds.
 *
 * Returns 0 on success or when the device cannot overlap copies with kernels;
 * exits with EXIT_FAILURE if any CUDA call fails.
 *
 * Notes on what overlap to expect (see the CUDA C++ Programming Guide,
 * "Asynchronous Concurrent Execution"):
 *  - copies in the same direction are serviced one after another, so the six
 *    H2D copies per iteration do not overlap each other;
 *  - an H2D copy can only overlap a D2H copy when asyncEngineCount >= 2;
 *  - the kernel here is very short compared with a 4 MiB copy, so the
 *    copy/kernel overlap window in a profiler timeline is correspondingly
 *    small.
 */
int main()
{
    const int kStreams = 3;
    const int kThreads = 256;
    // Ceiling division so the tail of each chunk is covered.  The previous
    // (N - 255) / 256 launched one block too few and left the last 256
    // elements of every chunk uncomputed.
    const int kBlocks = (N + kThreads - 1) / kThreads;
    const size_t chunkBytes = N * sizeof(int);
    const size_t fullBytes = FULL_DATASIZE * sizeof(int);

    cudaDeviceProp prop;
    int dev;
    checkCuda(cudaGetDevice(&dev), "cudaGetDevice");
    checkCuda(cudaGetDeviceProperties(&prop, dev), "cudaGetDeviceProperties");
    // asyncEngineCount replaces the deprecated deviceOverlap flag: 0 means the
    // device cannot copy and execute a kernel at the same time.
    if (prop.asyncEngineCount == 0)
    {
        printf("Device doesn't support overlap\n");
        return 0;
    }

    // Host buffers: page-locked, which cudaMemcpyAsync needs in order to be
    // truly asynchronous.
    int *a, *b, *c;
    checkCuda(cudaHostAlloc((void**)&a, fullBytes, cudaHostAllocDefault), "cudaHostAlloc(a)");
    checkCuda(cudaHostAlloc((void**)&b, fullBytes, cudaHostAllocDefault), "cudaHostAlloc(b)");
    checkCuda(cudaHostAlloc((void**)&c, fullBytes, cudaHostAllocDefault), "cudaHostAlloc(c)");
    for (int i = 0; i < FULL_DATASIZE; i++)
    {
        a[i] = rand();
        b[i] = rand();
    }

    // Device buffers: one private (a, b, c) triple of N ints per stream.
    int *dev_a[kStreams];
    int *dev_b[kStreams];
    int *dev_c[kStreams];
    for (int s = 0; s < kStreams; s++)
    {
        checkCuda(cudaMalloc((void**)&dev_a[s], chunkBytes), "cudaMalloc(dev_a)");
        checkCuda(cudaMalloc((void**)&dev_b[s], chunkBytes), "cudaMalloc(dev_b)");
        checkCuda(cudaMalloc((void**)&dev_c[s], chunkBytes), "cudaMalloc(dev_c)");
    }

    cudaStream_t stream[kStreams];
    for (int s = 0; s < kStreams; s++)
    {
        checkCuda(cudaStreamCreate(&stream[s]), "cudaStreamCreate");
    }

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    // Start the clock only after the streams exist so that host-side setup is
    // not counted as pipeline time.
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    // Each iteration consumes kStreams consecutive chunks; FULL_DATASIZE is
    // expected to be a multiple of kStreams * N (it is 30 * N here).
    for (int i = 0; i < FULL_DATASIZE; i += kStreams * N)
    {
        // Work is issued breadth-first (the same stage for every stream before
        // moving to the next stage).  The alternative, depth-first order
        // (all four stages for stream 0, then stream 1, ...), can serialize
        // the streams on devices that feed all work through a single hardware
        // queue, so the breadth-first order is kept.
        for (int s = 0; s < kStreams; s++)
        {
            checkCuda(cudaMemcpyAsync(dev_a[s], a + i + s * N, chunkBytes,
                                      cudaMemcpyHostToDevice, stream[s]),
                      "cudaMemcpyAsync(a, H2D)");
        }
        for (int s = 0; s < kStreams; s++)
        {
            checkCuda(cudaMemcpyAsync(dev_b[s], b + i + s * N, chunkBytes,
                                      cudaMemcpyHostToDevice, stream[s]),
                      "cudaMemcpyAsync(b, H2D)");
        }
        for (int s = 0; s < kStreams; s++)
        {
            kernel<<<kBlocks, kThreads, 0, stream[s]>>>(dev_a[s], dev_b[s], dev_c[s]);
            // Launch-configuration errors are only reported through
            // cudaGetLastError(), not by the launch statement itself.
            checkCuda(cudaGetLastError(), "kernel launch");
        }
        for (int s = 0; s < kStreams; s++)
        {
            checkCuda(cudaMemcpyAsync(c + i + s * N, dev_c[s], chunkBytes,
                                      cudaMemcpyDeviceToHost, stream[s]),
                      "cudaMemcpyAsync(c, D2H)");
        }
    }

    // Wait for every stream; this is also where asynchronous execution errors
    // from the kernels and copies surface.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float elapsedtime;
    checkCuda(cudaEventElapsedTime(&elapsedtime, start, stop), "cudaEventElapsedTime");
    printf("time used:%3.1fms",elapsedtime);

    for (int s = 0; s < kStreams; s++)
    {
        checkCuda(cudaFree(dev_a[s]), "cudaFree(dev_a)");
        checkCuda(cudaFree(dev_b[s]), "cudaFree(dev_b)");
        checkCuda(cudaFree(dev_c[s]), "cudaFree(dev_c)");
    }
    checkCuda(cudaFreeHost(a), "cudaFreeHost(a)");
    checkCuda(cudaFreeHost(b), "cudaFreeHost(b)");
    checkCuda(cudaFreeHost(c), "cudaFreeHost(c)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    for (int s = 0; s < kStreams; s++)
    {
        checkCuda(cudaStreamDestroy(stream[s]), "cudaStreamDestroy");
    }
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}
profiler运行的截图如下:
[attach]3179[/attach]