//Log2 of the warp size: 2^5 = 32 threads per warp on real hardware; under
//device emulation every thread is treated as its own warp (2^0 = 1).
//NOTE(review): nvcc's predefined emulation macro was spelled
//__DEVICE_EMULATION__; the underscores may have been lost when this code was
//pasted - confirm which spelling the build actually defines.
#ifndef DEVICE_EMULATION
#define LOG2_WARP_SIZE 5U
#else
#define LOG2_WARP_SIZE 0
#endif
#define WARP_SIZE (1U << LOG2_WARP_SIZE)

//Both map to single instructions on G8x / G9x / G10x
//__umul24 multiplies only the 24 least significant bits of each operand,
//so both factors must fit in 24 bits for an exact product.
#define UMUL(a, b) __umul24( (a), (b) )
#define UMAD(a, b, c) ( UMUL((a), (b)) + (c) )

//Low 27 bits set, top 5 bits clear (5 == LOG2_WARP_SIZE on real hardware).
//The kernel ANDs every bin with this mask before summing, which drops the
//top bits where the per-thread tag (threadIdx.x shifted left) is placed.
#define TAG_MASK 0x07FFFFFFU

//Warps == subhistograms per threadblock
#define WARP_COUNT 6

//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * WARP_SIZE)

//Shared memory per threadblock
//(counted in histogram elements: one HISTOGRAM256_BIN_COUNT-sized subhistogram per warp)
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
/*
 * Accumulates one partial histogram per thread block.
 *
 * Each warp of the block owns a private subhistogram of HISTOGRAM256_BIN_COUNT
 * entries in shared memory. Threads walk d_Data with a stride of
 * blockDim.x * gridDim.x, feed every word to addWord(), and finally the
 * per-warp subhistograms are summed bin by bin and stored to global memory.
 *
 * d_PartialHistograms  output; block b writes HISTOGRAM256_BIN_COUNT counters
 *                      starting at index b * HISTOGRAM256_BIN_COUNT, so the
 *                      buffer needs gridDim.x * HISTOGRAM256_BIN_COUNT elements
 * d_Data               input words; only indices below dataCount are read
 * dataCount            number of uint words in d_Data
 *
 * Launch assumptions: 1D grid and block. The clear loop and the merge loop
 * stride by HISTOGRAM256_THREADBLOCK_SIZE, so blockDim.x is expected to equal
 * HISTOGRAM256_THREADBLOCK_SIZE; otherwise shared memory is not fully cleared
 * (or is indexed out of range).
 *
 * addWord(), histogram_t, HISTOGRAM256_BIN_COUNT and UINT_BITS are defined
 * elsewhere. NOTE(review): addWord presumably increments the bins selected by
 * the bytes of `data`, using `tag` to resolve intra-warp collisions - confirm
 * against its definition.
 */
__global__ void histogram256Kernel(uint *d_PartialHistograms, uint *d_Data, uint dataCount){
    //Per-warp subhistogram storage
    __shared__ histogram_t s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
    //threadIdx.x >> LOG2_WARP_SIZE is the index of this thread's warp within the block
    histogram_t *s_WarpHist = s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;

    //Clear shared memory storage for current threadblock before processing
    #pragma unroll
    for(uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
        s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;

    //Cycle through the entire data set, update subhistograms for each warp
    //The tag places threadIdx.x in the top LOG2_WARP_SIZE bits of a word;
    //higher bits of threadIdx.x are shifted out.
#ifndef DEVICE_EMULATION
    const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE);
#else
    const uint tag = 0;
#endif

    //Every thread must see the cleared shared memory before any bin is updated
    __syncthreads();
    for(uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount; pos += UMUL(blockDim.x, gridDim.x)){
        uint data = d_Data[pos];
        addWord(s_WarpHist, data, tag);
    }

    //Merge per-warp histograms into per-block and write to global memory
    //All warps must have finished updating before their bins are read
    __syncthreads();
    for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE){
        uint sum = 0;

        //TAG_MASK removes the tag bits so only the counters are added
        for(uint i = 0; i < WARP_COUNT; i++)
            sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;

        d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
    }
}
哪个大侠帮我解释一下这段代码啊。看不懂意思。
主要是几点:一是 block 的个数和大小怎么定才好?warp 和 block 到底是什么关系,怎么映射?
二、对于 shared memory 怎么进行同步,怎么划分并对应到每个 thread?据说 shared memory 会划分成多个 bank,而每个 block 中的 thread 数会远大于 bank 的数量,怎么映射和控制才好?
三、能否解释一下下面这段代码:
for(uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount; pos += UMUL(blockDim.x, gridDim.x)){
uint data = d_Data[pos];
addWord(s_WarpHist, data, tag);
}
四、DEVICE_EMULATION 是什么意思,代表什么。
代码是 CUDA 的示例代码。非常非常感谢。小弟初学 CUDA 才几天,很多不懂,
要是哪位能详细解释一下程序,不胜感激。