本人在做1920x1080图像的直方图均衡,代码如下:
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_math.h>
#include <float.h>
#include <highgui.h>
#include <cv.h>
#define THREAD_N 128
#define LOOP_N 8
global void hist(unsigned char* pSrc, unsigned char * pDst_tmp)
{
const unsigned long tid = threadIdx.x;
const unsigned long bid = blockIdx.x;
unsigned long offset = __umul24((__umul24(bid, THREAD_N) + tid), LOOP_N); // (bid * THREAD_N + tid) * LOOP_N
//shared unsigned char bMap[256];// 灰度映射表
shared unsigned char lCount[256];// 灰度计数表
//bMap[tid]=bMap[tid+128]=0;
lCount[tid]=lCount[tid+128]=0;
__syncthreads();
// 每个线程块有THREAD_N(128)个线程,每个线程处理LOOP_N(8)个点,统计结果存储在每个线程块的lCount[256]中
for(int i = 0; i < LOOP_N; i ++) {
unsigned char p;
p = pSrc[offset];
offset ++;
atomicAdd((int *)&lCount[p], 1);
}
__syncthreads();
// 线程块统计计算完成后,把结果从lCount[256]复制到global memory中
// 128字交替访存,以满足各线程的合并访问要求以及防止shared memory的bank conflict,提高效率
/atomicAdd((int )&pDst_tmp[tid], lCount[tid]);
atomicAdd((int )&pDst_tmp[tid + 128], lCount[tid + 128]);/
pDst_tmp[bid256+tid] = lCount[tid];
pDst_tmp[bid256+tid + 128] = lCount[tid + 128];
}
global void histCollectKernel(unsigned char *in, int blockcnt, unsigned char out){
int i;
const unsigned long tid = threadIdx.x;
unsigned long count = 0;
// 汇总亮度为tid的统计数据
for(i = 0; i < blockcnt; i ++)
count += in[(i << 8) + tid];
out[tid] = count;
}
device void computeMap(unsigned char * in, unsigned char * out,int lWidth, int lHeight)
{
for (int i = 0; i < 256; i++)// 计算灰度映射表
{
int lTemp = 0; // 初始为0
for (int j = 0; j <= i ; j++)
{
lTemp += in[j];
}
out[i] = (unsigned char) (lTemp * 255 / lHeight / lWidth); // 计算对应的新灰度值
}
}
global void InteEqualize(unsigned char temp, unsigned char * pSrc,int lWidth, int lHeight)
{
int x=blockDim.xblockIdx.x+threadIdx.x;
int y=blockDim.yblockIdx.y+threadIdx.y;
unsigned char temp2[256];
computeMap(temp,temp2,lWidth,lHeight);
pSrc=pSrc+lWidth * (lHeight - 1 - y) + x; // 指向DIB第i行,第j个象素的指针
*pSrc = temp2[*pSrc]; // 计算新的灰度值
}
extern “C”
void run_InteEqualize(unsigned char* pSrc,unsigned int lWidth, int lHeight)
{
unsigned char tmp[256];
// 各线程块并行统计,每个线程块处理THREAD_N*LOOP_N个点
hist<<<lWidth * lHeight / THREAD_N / LOOP_N, THREAD_N>>>(pSrc, tmp);
// 汇总各线程块的统计结果,256个线程,每个亮度的汇总占一个线程
unsigned char tmp1[256];
histCollectKernel<<<1, 256>>>(tmp, lWidth * lHeight / THREAD_N / LOOP_N, tmp1);
dim3 dimGrid(lWidth/128, lHeight/8,1);
dim3 dimBlock(128,8,1);
// execute the kernel
InteEqualize<<<dimGrid,dimBlock>>>(tmp1,pSrc,lWidth,lHeight);
}
在用Nisght调试的时候老是出现下面的错误:
CUDA Memory Checker detected 67 threads caused an access violation:
Launch Parameters
CUcontext = 0ad28978
CUstream = 0e4d7d30
CUmodule = 1028b1f0
CUfunction = 16a8f088
FunctionName = Z4histPhS
GridId = 43
gridDim = {2025,1,1}
blockDim = {128,1,1}
sharedSize = 256
Parameters:
pSrc = 0x060a0000 0 ’
Memory Checker detected 67 access violations.
error = misaligned load (shared memory)
gridid = 43
blockIdx = {0,0,0}
threadIdx = {4,0,0}
address = 0x00000062
accessSize = 4
错误停留在:
static forceinline
int __iAtomicAdd(int *p, int val)
{
return __nvvm_atom_add_gen_i((volatile int *)p, val);
}
就是那个原子操作是出问题了,但是哪里出问题了那?我找不到啊,求高手指点。