版主,你好,昨天看了半天,只能理解CUDA版本的部分代码,请版主讲解一下思路。
版主应该对SDK里的大多数例子都挺熟悉吧?这是SDK里的例子
详细代码可以在 C:\ProgramData\NVIDIA Corporation\CUDA Samples\v5.5\3_Imaging\histogram 目录下找到。
// Data type used for input data fetches: one fetch yields the four 32-bit
// words (.x/.y/.z/.w) that the kernel feeds to addWord().
typedef uint4 data_t;

// Number of shared-memory banks assumed by the thread-index permutation in the
// kernel. May change on future hardware, so it is kept as a parameter rather
// than hard-coded at each use.
#define SHARED_MEMORY_BANKS 16
////////////////////////////////////////////////////////////////////////////////
// Main computation pass: compute gridDim.x partial histograms
////////////////////////////////////////////////////////////////////////////////
// Count one byte-derived bin index into the calling thread's shared-memory
// histogram.
//
//   s_ThreadBase  base address of this thread's byte counters
//   data          bin index (addWord passes values already masked to 6 bits)
//
// The counter for bin `data` lives at offset data * HISTOGRAM64_THREADBLOCK_SIZE
// from s_ThreadBase, i.e. consecutive bins of one thread are one thread-block
// width apart. Counters are uchar, so the increment wraps after 255.
inline __device__ void addByte(uchar *s_ThreadBase, uint data)
{
    s_ThreadBase[UMUL(data, HISTOGRAM64_THREADBLOCK_SIZE)]++;
}
// Count the four bytes packed in one 32-bit word into the calling thread's
// shared-memory histogram.
//
//   s_ThreadBase  base address of this thread's byte counters
//   data          32-bit word holding four input bytes
inline __device__ void addWord(uchar *s_ThreadBase, uint data)
{
    // Only the upper 6 bits of each byte matter, as this is a 64-bin histogram:
    // shifting by 2, 10, 18 and 26 drops the low 2 bits of bytes 0..3, and the
    // 0x3F mask keeps the remaining 6 bits as the bin index.
    //
    // Reader's note (translated from the original post): this is similar to
    // the CPU version - find the bin, then add 1 at the position "bin times
    // the thread-block width". Open question from the poster: why the
    // thread-block width (64) - are the counters of one bin stored along one
    // column?
    addByte(s_ThreadBase, (data >> 2) & 0x3FU);
    addByte(s_ThreadBase, (data >> 10) & 0x3FU);
    addByte(s_ThreadBase, (data >> 18) & 0x3FU);
    addByte(s_ThreadBase, (data >> 26) & 0x3FU);
}
__global__ void histogram64Kernel(uint *d_PartialHistograms, data_t *d_Data, uint dataCount)
{
//Encode thread index in order to avoid bank conflicts in s_Hist access:
//each group of SHARED_MEMORY_BANKS threads accesses consecutive shared memory banks
//and the same bytes [0…3] within the banks
//Because of this permutation block size should be a multiple of 4 * SHARED_MEMORY_BANKS
const uint threadPos =
((threadIdx.x & ~(SHARED_MEMORY_BANKS * 4 - 1)) << 0) |
((threadIdx.x & (SHARED_MEMORY_BANKS - 1)) << 2) |
((threadIdx.x & (SHARED_MEMORY_BANKS * 3)) >> 4);
// 每个线程对应不同的threadPos:线程0对应threadPos=0,线程1对应threadPos=4,线程2对应threadPos=8,应该是为了避免bank冲突。
//Per-thread histogram storage
__shared__ uchar s_Hist[HISTOGRAM64_THREADBLOCK_SIZE * HISTOGRAM64_BIN_COUNT];
uchar *s_ThreadBase = s_Hist + threadPos;
声明为字节型共享内存,并且对应的地址有对应的threadPos。
//Initialize shared memory (writing 32-bit words)
#pragma unroll
for (uint i = 0; i < (HISTOGRAM64_BIN_COUNT / 4); i++)
{
((uint *)s_Hist)[threadIdx.x + i * HISTOGRAM64_THREADBLOCK_SIZE] = 0;
} // 将声明的字节型共享内存转为uint指针,按4字节(32位字)来初始化;当i=0时,threadIdx.x取0-63,初始化了64个4字节的字。
//Read data from global memory and submit to the shared-memory histogram
//Since histogram counters are byte-sized, every single thread can’t do more than 255 submission
__syncthreads();
for (uint pos = UMAD(blockIdx.x, blockDim.x, threadIdx.x); pos < dataCount; pos += UMUL(blockDim.x, gridDim.x))
{ 这里应该是读数据进共享内存了,但不怎么明白。
data_t data = d_Data[pos];
addWord(s_ThreadBase, data.x);
addWord(s_ThreadBase, data.y);
addWord(s_ThreadBase, data.z);
addWord(s_ThreadBase, data.w);
}
这下面应该是将共享内存里的数据进行统计吧?
//Accumulate per-thread histograms into per-block and write to global memory
__syncthreads();