/*
 * MyKernel: for each output slot, computes
 *     sqrt( (S[a]-E[a])^2 + (S[b]-E[b])^2 )
 * where, inside one block, a = blockStart + i and b = blockStart + blockDim.x + i
 * (i = threadIdx.x, blockStart = blockIdx.x * blockDim.x * blockDim.y).
 *
 * Launch layout: 1-D grid of 2-D blocks with blockDim.x <= 64 and
 * blockDim.y <= 2 (the static shared array below is sized for exactly that).
 * Each block consumes blockDim.x * blockDim.y consecutive inputs and produces
 * blockDim.x consecutive outputs starting at blockIdx.x * blockDim.x.
 *
 * Shared memory: 2 * 64 floats, statically allocated; no dynamic shared
 * memory is required at launch.
 *
 * arrSd, arrEd : device input arrays, at least n elements each.
 * arrRd        : device output array.
 * n            : number of valid input elements; inputs at index >= n
 *                contribute 0 to the sum, and an output is written only when
 *                its first input (index a) is < n.
 */
__global__ void MyKernel(float *arrSd,float *arrEd,float *arrRd,int n)
{
    const int i = threadIdx.x;
    const int j = threadIdx.y;

    // Output slot base for this block.
    const int bid = blockIdx.x * blockDim.x;

    // Global input index. The block offset must be included, otherwise every
    // block would re-read the first blockDim.x * blockDim.y elements.
    const int id = blockIdx.x * blockDim.x * blockDim.y + j * blockDim.x + i;

    // Indexed [row j][lane i] so that the threads of a warp (consecutive i)
    // touch consecutive shared-memory words.
    __shared__ float mid[2][64];

    float sq = 0.0f;
    if (id < n)
    {
        const float d = arrSd[id] - arrEd[id];
        sq = d * d;
    }
    mid[j][i] = sq;

    // The barrier is outside any data-dependent branch so that every thread
    // of the block reaches it.
    __syncthreads();

    if (j == 0 && id < n)
    {
        arrRd[bid + i] = sqrtf(mid[0][i] + mid[1][i]);
    }
}
/* Abort with a readable message when a CUDA runtime call reports an error. */
static void checkCudaOrDie(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills two 1024-element vectors, splits them into one chunk per
 * stream, and for every stream queues "copy in -> MyKernel -> copy out".
 * The elapsed GPU time (copies + kernels) is measured with CUDA events and
 * printed after the n/2 results.
 *
 * Host buffers are page-locked (cudaMallocHost): cudaMemcpyAsync can only
 * overlap with kernels and with copies in other streams when the host side
 * of the transfer is pinned memory.
 */
int main()
{
    const int n = 1024;
    const int nstreams = 2;
    const int streamSize = n / nstreams;        // input elements per stream
    const int resultsPerStream = streamSize / 2; // two inputs -> one output
    const size_t size = sizeof(float) * n;
    const size_t chunkBytes = sizeof(float) * streamSize;
    const size_t resultChunkBytes = sizeof(float) * resultsPerStream;

    // Pinned host buffers.
    float *arrS = 0, *arrE = 0, *arrR = 0;
    checkCudaOrDie(cudaMallocHost((void **)&arrS, size), "cudaMallocHost(arrS)");
    checkCudaOrDie(cudaMallocHost((void **)&arrE, size), "cudaMallocHost(arrE)");
    checkCudaOrDie(cudaMallocHost((void **)&arrR, size / 2), "cudaMallocHost(arrR)");

    for (int k = 0; k != n; ++k)
    {
        arrS[k] = k + 1;
        arrE[k] = k;
    }
    for (int k = 0; k != n / 2; ++k)
        arrR[k] = 0;

    cudaStream_t stream[nstreams];
    for (int s = 0; s != nstreams; ++s)
        checkCudaOrDie(cudaStreamCreate(&stream[s]), "cudaStreamCreate");

    float *arrSd = 0, *arrEd = 0, *arrRd = 0;
    checkCudaOrDie(cudaMalloc(&arrSd, size), "cudaMalloc(arrSd)");
    checkCudaOrDie(cudaMalloc(&arrEd, size), "cudaMalloc(arrEd)");
    checkCudaOrDie(cudaMalloc(&arrRd, size / 2), "cudaMalloc(arrRd)");

    // One block handles threads.x * threads.y inputs.
    const dim3 threads(64, 2);
    const dim3 blocks(streamSize / (threads.x * threads.y), 1);

    cudaEvent_t sd, ed;
    checkCudaOrDie(cudaEventCreate(&sd), "cudaEventCreate(sd)");
    checkCudaOrDie(cudaEventCreate(&ed), "cudaEventCreate(ed)");
    checkCudaOrDie(cudaEventRecord(sd, 0), "cudaEventRecord(sd)");

    for (int s = 0; s != nstreams; ++s)
    {
        const int inOff = s * streamSize;
        const int outOff = s * resultsPerStream;

        checkCudaOrDie(cudaMemcpyAsync(arrSd + inOff, arrS + inOff, chunkBytes,
                                       cudaMemcpyHostToDevice, stream[s]),
                       "cudaMemcpyAsync(arrS)");
        checkCudaOrDie(cudaMemcpyAsync(arrEd + inOff, arrE + inOff, chunkBytes,
                                       cudaMemcpyHostToDevice, stream[s]),
                       "cudaMemcpyAsync(arrE)");

        // The kernel sees only this stream's chunk, so its element count is
        // streamSize, not n. MyKernel uses static shared memory, hence 0
        // bytes of dynamic shared memory.
        MyKernel<<<blocks, threads, 0, stream[s]>>>(arrSd + inOff, arrEd + inOff,
                                                    arrRd + outOff, streamSize);
        checkCudaOrDie(cudaGetLastError(), "MyKernel launch");

        checkCudaOrDie(cudaMemcpyAsync(arrR + outOff, arrRd + outOff, resultChunkBytes,
                                       cudaMemcpyDeviceToHost, stream[s]),
                       "cudaMemcpyAsync(arrR)");
    }

    // Wait for every stream, then close the timing interval.
    checkCudaOrDie(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    checkCudaOrDie(cudaEventRecord(ed, 0), "cudaEventRecord(ed)");
    checkCudaOrDie(cudaEventSynchronize(ed), "cudaEventSynchronize(ed)");
    float tid = 0.0f; // elapsed GPU time in milliseconds
    checkCudaOrDie(cudaEventElapsedTime(&tid, sd, ed), "cudaEventElapsedTime");

    for (int s = 0; s != nstreams; ++s)
        checkCudaOrDie(cudaStreamDestroy(stream[s]), "cudaStreamDestroy");

    for (int k = 0; k != n / 2; ++k)
    {
        printf("%f ", arrR[k]);
    }
    // NOTE(review): a second value `ti` (presumably a CPU reference time) used
    // to be printed here, but it is not declared anywhere in the visible code,
    // so only the GPU time is reported.
    printf(",%f", tid);

    checkCudaOrDie(cudaEventDestroy(sd), "cudaEventDestroy(sd)");
    checkCudaOrDie(cudaEventDestroy(ed), "cudaEventDestroy(ed)");
    checkCudaOrDie(cudaFree(arrSd), "cudaFree(arrSd)");
    checkCudaOrDie(cudaFree(arrEd), "cudaFree(arrEd)");
    checkCudaOrDie(cudaFree(arrRd), "cudaFree(arrRd)");
    checkCudaOrDie(cudaFreeHost(arrS), "cudaFreeHost(arrS)");
    checkCudaOrDie(cudaFreeHost(arrE), "cudaFreeHost(arrE)");
    checkCudaOrDie(cudaFreeHost(arrR), "cudaFreeHost(arrR)");

    // Keep the console window open until the user types something.
    int pauseInput = 0;
    scanf("%d", &pauseInput);
    return 0;
}
其实,这是我的毕业设计,要求实现向量距离的大规模并行计算。目前做成了上面这样,但是把数据传输的时间算进去之后,无论如何都比 CPU 慢……我已经不知道该怎么办了,请各位达人支支招。
另外还想请教:zero copy(零拷贝)到底该怎么用?我一直没看懂,它可以和流(stream)一起使用吗?除此之外,还有别的优化办法吗?