#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
// Runs two variants of a segmented warp-shuffle accumulation on the same input
// so that their results can be compared on the host.
//
// Template parameter:
//   NUM_THREADS_PER_BLOCK  Number of threads per block; divided by warpSize it
//                          gives the segment width used by the shuffles.
// Parameters:
//   s_num_pts  Input values, read at index warp_id * 4 + lane_id.
//   s_output1  Receives the result of variant 1 at the same index.
//   s_output2  Receives the result of variant 2 at the same index.
//
// Preconditions:
//   - blockDim.x is a multiple of warpSize, so every participating warp is
//     full (the shuffles below use a full-warp mask).
//   - NUM_THREADS_PER_BLOCK / warpSize is a power of two no larger than
//     warpSize, as required for the shuffle width argument.
//   - Rows are addressed with a fixed stride of 4, so every buffer must hold
//     at least 3 * 4 + NUM_THREADS_PER_BLOCK / warpSize elements, and rows
//     only stay disjoint while that quotient is <= 4.
//   - Requires a toolkit/device that provides __shfl_up_sync (CUDA 9+).
template <int NUM_THREADS_PER_BLOCK>
__global__ void test(int *s_num_pts, int *s_output1, int *s_output2)
{
    // The number of warps in a block.
    const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warpSize;
    // Compute the coordinates of the threads in the block.
    const int warp_id = threadIdx.x / warpSize;
    const int lane_id = threadIdx.x % warpSize;
    // The `warp_id < 4` branches are warp-uniform, so every lane of a
    // participating warp reaches the shuffles and a full mask is correct.
    const unsigned FULL_WARP_MASK = 0xffffffffu;

    // Variant 1: accumulate the shuffled values into a separate `sum` while
    // always shuffling the ORIGINAL num_pts. Each lane therefore only sees the
    // unmodified values 1, 2, 4, ... lanes below it, which is not a complete
    // prefix sum once the segment is wider than 2 lanes (for a 4-lane segment,
    // lane 3 yields x1 + x2 + x3 and misses x0).
    if (warp_id < 4)
    {
        int num_pts = lane_id < NUM_WARPS_PER_BLOCK ? s_num_pts[warp_id * 4 + lane_id] : 0;
        int sum = 0;
#pragma unroll
        for (int offset = 1; offset < NUM_WARPS_PER_BLOCK; offset *= 2)
        {
            int n = __shfl_up_sync(FULL_WARP_MASK, num_pts, offset, NUM_WARPS_PER_BLOCK);
            if (lane_id >= offset)
                sum += n;
        }
        if (lane_id < NUM_WARPS_PER_BLOCK)
            s_output1[warp_id * 4 + lane_id] = sum + num_pts;
    }

    // Variant 2: accumulate in place, so each round shuffles the partial sums
    // produced by the previous round. This is the doubling-offset inclusive
    // scan over each NUM_WARPS_PER_BLOCK-wide segment.
    if (warp_id < 4)
    {
        int num_pts = lane_id < NUM_WARPS_PER_BLOCK ? s_num_pts[warp_id * 4 + lane_id] : 0;
#pragma unroll
        for (int offset = 1; offset < NUM_WARPS_PER_BLOCK; offset *= 2)
        {
            int n = __shfl_up_sync(FULL_WARP_MASK, num_pts, offset, NUM_WARPS_PER_BLOCK);
            if (lane_id >= offset)
                num_pts += n;
        }
        if (lane_id < NUM_WARPS_PER_BLOCK)
            s_output2[warp_id * 4 + lane_id] = num_pts;
    }
}
// Aborts the program with a diagnostic when a CUDA runtime call has failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Prints `count` integers on one line, each followed by a single space.
static void printValues(const int *values, int count)
{
    for (int i = 0; i < count; ++i)
    {
        std::cout << values[i] << " ";
    }
    std::cout << std::endl;
}

// Host driver: fills a 16-element input with 1..16, runs the `test` kernel in
// a single block, and prints the input followed by both kernel outputs.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails or the
// device's warp size makes the fixed-size buffers too small for the kernel.
int main()
{
    const int NUM_ELEMENTS = 16;
    const size_t NUM_BYTES = NUM_ELEMENTS * sizeof(int);
    const int NUM_THREADS_PER_BLOCK = 128; // Do not use less than 128 threads.

    checkCuda(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp properties;
    checkCuda(cudaGetDeviceProperties(&properties, 0), "cudaGetDeviceProperties");
    const int warp_size = properties.warpSize;
    const int NUM_WARPS_PER_BLOCK = NUM_THREADS_PER_BLOCK / warp_size;

    // The kernel touches indices up to 3 * 4 + NUM_WARPS_PER_BLOCK - 1, so
    // refuse to launch if that would run past the buffers allocated below.
    if (3 * 4 + NUM_WARPS_PER_BLOCK > NUM_ELEMENTS)
    {
        fprintf(stderr, "Unsupported warp size %d: buffers of %d ints are too small\n",
                warp_size, NUM_ELEMENTS);
        return EXIT_FAILURE;
    }

    int data[NUM_ELEMENTS];
    for (int i = 0; i < NUM_ELEMENTS; ++i)
    {
        data[i] = i + 1;
    }
    printValues(data, NUM_ELEMENTS);

    int dataout1[NUM_ELEMENTS];
    int dataout2[NUM_ELEMENTS];

    int *input = NULL;
    int *output1 = NULL;
    int *output2 = NULL;
    checkCuda(cudaMalloc(&input, NUM_BYTES), "cudaMalloc(input)");
    checkCuda(cudaMalloc(&output1, NUM_BYTES), "cudaMalloc(output1)");
    checkCuda(cudaMalloc(&output2, NUM_BYTES), "cudaMalloc(output2)");

    checkCuda(cudaMemcpy(input, data, NUM_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(input)");
    // Zero the outputs so any slot the kernel does not write prints as 0
    // instead of uninitialized device memory.
    checkCuda(cudaMemset(output1, 0, NUM_BYTES), "cudaMemset(output1)");
    checkCuda(cudaMemset(output2, 0, NUM_BYTES), "cudaMemset(output2)");

    test<NUM_THREADS_PER_BLOCK><<<1, NUM_THREADS_PER_BLOCK>>>(input, output1, output2);
    // Launch-configuration errors only show up via cudaGetLastError();
    // in-kernel faults surface at the next synchronizing call.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaMemcpy(dataout1, output1, NUM_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(output1)");
    printValues(dataout1, NUM_ELEMENTS);

    checkCuda(cudaMemcpy(dataout2, output2, NUM_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(output2)");
    printValues(dataout2, NUM_ELEMENTS);

    checkCuda(cudaFree(input), "cudaFree(input)");
    checkCuda(cudaFree(output1), "cudaFree(output1)");
    checkCuda(cudaFree(output2), "cudaFree(output2)");
    return 0;
}