本人优化了一个用c语言写的直方图均衡化算法,c代码如下:
/*************************************************************************
*
- 函数名称:
- InteEqualize()
- 参数:
- LPSTR lpDIBBits - 指向源DIB图像指针
- LONG lWidth - 源图像宽度(象素数)
- LONG lHeight - 源图像高度(象素数)
- 返回值:
- BOOL - 成功返回TRUE,否则返回FALSE。
- 说明:
- 该函数用来对图像进行直方图均衡。
***********************************************************************/
bool InteEqualize(unsigned char pSrc, int lWidth, int lHeight)
{
int lLineBytes = lWidth; // 图像每行的字节数
unsigned char *srctmp=pSrc;
long lTemp = 0; // 临时变量
int i = 0; // 循环变量
int j = 0;
BYTE bMap[256];// 灰度映射表
long lCount[256];// 灰度计数表
// lLineBytes = WIDTHBYTES(lWidth * 8);
for (i = 0; i < 256; i ++) // 重置计数为0
{
lCount[i] = 0;
}
for (i = 0; i < lHeight; i ++)// 计算各个灰度值的计数
{
for (j = 0; j < lWidth; j ++)
{
pSrc = srctmp + lLineBytes * i + j;
lCount[*(pSrc)]++;
}
}
for (i = 0; i < 256; i++)// 计算灰度映射表
{
lTemp = 0; // 初始为0
for (j = 0; j <= i ; j++)
{
lTemp += lCount[j];
}
bMap[i] = (BYTE) (lTemp * 255 / lHeight / lWidth); // 计算对应的新灰度值
}
for(i = 0; i < lHeight; i++)// 每行
{
for(j = 0; j < lWidth; j++) // 每列
{
pSrc = srctmp + lLineBytes * (lHeight - 1 - i) + j; // 指向DIB第i行,第j个象素的指针
*pSrc = bMap[*pSrc]; // 计算新的灰度值
}
}
return true;
}
本人用CUDA优化后的代码如下:
#define THREAD_N 128
#define LOOP_N 8
global void hist(unsigned char* pSrc, int * pDst_tmp)
{
const unsigned long tid = threadIdx.x;
const unsigned long bid = blockIdx.x;
unsigned long offset = __umul24((__umul24(bid, THREAD_N) + tid), LOOP_N); // (bid * THREAD_N + tid) * LOOP_N
//shared unsigned char bMap[256];// 灰度映射表
volatile shared int lCount[256];// 灰度计数表
//bMap[tid]=bMap[tid+128]=0;
lCount[tid]=lCount[tid+128]=0;
pDst_tmp[bid256+tid]=pDst_tmp[bid256+tid + 128]=0;
__syncthreads();
// 每个线程块有THREAD_N(128)个线程,每个线程处理LOOP_N(8)个点,统计结果存储在每个线程块的lCount[256]中
for(int i = 0; i < LOOP_N; i ++) {
int p;
p =(int)pSrc[offset];
offset ++;
atomicAdd((int*)&lCount[p], 1);
}
__syncthreads();
// 线程块统计计算完成后,把结果从lCount[256]复制到global memory中
// 128字交替访存,以满足各线程的合并访问要求以及防止shared memory的bank conflict,提高效率
atomicAdd((int )&pDst_tmp[bid256+tid], lCount[tid]);
atomicAdd((int )&pDst_tmp[bid256+tid + 128], lCount[tid + 128]);
/pDst_tmp[bid<<8+tid] = lCount[tid];
pDst_tmp[bid<<8+tid + 128] =lCount[tid + 128];/
__syncthreads();
}
global void histCollectKernel(int indata, int blockcnt, unsigned char out,int lHeight,int lWidth )
{
int i;
const unsigned long tid = threadIdx.x;
unsigned long count = 0;
int outtmp[256]={0};
// 汇总亮度为tid的统计数据
for(i = 0; i < blockcnt; i ++)
count += indata[(i << 8) + tid];
outtmp[tid]=count;
int lTemp = 0; // 初始为0
for (int j = 0; j <= tid ; j++)
{
lTemp += outtmp[j];
}
out[tid] = (unsigned char) (lTemp * 255 / lHeight / lWidth); // 计算对应的新灰度值
}
global void InteEqualize(unsigned char temp, unsigned char * pSrc,int lWidth, int lHeight)
{
int x=blockDim.xblockIdx.x+threadIdx.x;
int y=blockDim.yblockIdx.y+threadIdx.y;
(pSrc+lWidth * (lHeight - 1 - y) + x)=temp[(pSrc+lWidth * (lHeight - 1 - y) + x)];
__syncthreads();
//(pSrc+lWidth * y + x)=temp[*(pSrc+lWidth *y + x)];
}
extern “C”
void run_InteEqualize(unsigned char* pSrc,unsigned int lWidth, int lHeight)
{
int tmp;
cudaMalloc((void*)&tmp,lWidth * lHeight / THREAD_N / LOOP_N256sizeof(int));
// 各线程块并行统计,每个线程块处理THREAD_NLOOP_N个点
hist<<<lWidth * lHeight / THREAD_N / LOOP_N, THREAD_N>>>(pSrc, tmp);
// 汇总各线程块的统计结果,256个线程,每个亮度的汇总占一个线程
unsigned char tmp1;
cudaMalloc((void*)&tmp1,256sizeof(unsigned char));
histCollectKernel<<<1, 256>>>(tmp, lWidth * lHeight / THREAD_N / LOOP_N, tmp1,lWidth,lHeight);
dim3 dimGrid(lWidth/128, lHeight/8,1);
dim3 dimBlock(128,8,1);
// execute the kernel
InteEqualize<<<dimGrid,dimBlock>>>(tmp1,pSrc,lWidth,lHeight);
cudaFree(tmp);
cudaFree(tmp1);
}
CUDA运行的结果跟C运行的结果是不样的,模糊了很多,请问我的程序那离有问题那?找了好久找不到啊。