做直方图均衡的时候老是报错

system · 2013 年7 月 27 日 04:24

本人在做1920x1080图像的直方图均衡，代码如下：
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_math.h>
#include <float.h>
#include <highgui.h>
#include <cv.h>

#define THREAD_N 128
#define LOOP_N 8

global void hist(unsigned char* pSrc, unsigned char * pDst_tmp)
{
const unsigned long tid = threadIdx.x;
const unsigned long bid = blockIdx.x;
unsigned long offset = __umul24((__umul24(bid, THREAD_N) + tid), LOOP_N); // (bid * THREAD_N + tid) * LOOP_N

//shared unsigned char bMap[256];// 灰度映射表
shared unsigned char lCount[256];// 灰度计数表

//bMap[tid]=bMap[tid+128]=0;
lCount[tid]=lCount[tid+128]=0;
__syncthreads();
// 每个线程块有THREAD_N(128)个线程，每个线程处理LOOP_N(8)个点，统计结果存储在每个线程块的lCount[256]中
for(int i = 0; i < LOOP_N; i ++) {
unsigned char p;
p = pSrc[offset];
offset ++;
atomicAdd((int *)&lCount[p], 1);
}
__syncthreads();
// 线程块统计计算完成后，把结果从lCount[256]复制到global memory中
// 128字交替访存，以满足各线程的合并访问要求以及防止shared memory的bank conflict，提高效率
/atomicAdd((int )&pDst_tmp[tid], lCount[tid]);
atomicAdd((int )&pDst_tmp[tid + 128], lCount[tid + 128]);/
pDst_tmp[bid256+tid] = lCount[tid];
pDst_tmp[bid256+tid + 128] = lCount[tid + 128];

}
global void histCollectKernel(unsigned char *in, int blockcnt, unsigned char out){
int i;
const unsigned long tid = threadIdx.x;
unsigned long count = 0;
// 汇总亮度为tid的统计数据
for(i = 0; i < blockcnt; i ++)
count += in[(i << 8) + tid];
out[tid] = count;
}
device void computeMap(unsigned char * in, unsigned char * out,int lWidth, int lHeight)
{
for (int i = 0; i < 256; i++)// 计算灰度映射表
{
int lTemp = 0; // 初始为0
for (int j = 0; j <= i ; j++)
{
lTemp += in[j];
}
out[i] = (unsigned char) (lTemp * 255 / lHeight / lWidth); // 计算对应的新灰度值
}
}
global void InteEqualize(unsigned char temp, unsigned char * pSrc,int lWidth, int lHeight)
{
int x=blockDim.xblockIdx.x+threadIdx.x;
int y=blockDim.yblockIdx.y+threadIdx.y;
unsigned char temp2[256];
computeMap(temp,temp2,lWidth,lHeight);
pSrc=pSrc+lWidth * (lHeight - 1 - y) + x; // 指向DIB第i行，第j个象素的指针
*pSrc = temp2[*pSrc]; // 计算新的灰度值
}

extern “C”
void run_InteEqualize(unsigned char* pSrc,unsigned int lWidth, int lHeight)
{
unsigned char tmp[256];
// 各线程块并行统计，每个线程块处理THREAD_N*LOOP_N个点
hist<<<lWidth * lHeight / THREAD_N / LOOP_N, THREAD_N>>>(pSrc, tmp);
// 汇总各线程块的统计结果，256个线程，每个亮度的汇总占一个线程
unsigned char tmp1[256];
histCollectKernel<<<1, 256>>>(tmp, lWidth * lHeight / THREAD_N / LOOP_N, tmp1);

dim3 dimGrid(lWidth/128, lHeight/8,1);
dim3 dimBlock(128,8,1);
// execute the kernel
InteEqualize<<<dimGrid,dimBlock>>>(tmp1,pSrc,lWidth,lHeight);
}
在用Nisght调试的时候老是出现下面的错误：
CUDA Memory Checker detected 67 threads caused an access violation:
Launch Parameters
CUcontext = 0ad28978
CUstream = 0e4d7d30
CUmodule = 1028b1f0
CUfunction = 16a8f088
FunctionName = Z4histPhS
GridId = 43
gridDim = {2025,1,1}
blockDim = {128,1,1}
sharedSize = 256
Parameters:
pSrc = 0x060a0000 0 ’
Memory Checker detected 67 access violations.
error = misaligned load (shared memory)
gridid = 43
blockIdx = {0,0,0}
threadIdx = {4,0,0}
address = 0x00000062
accessSize = 4
错误停留在：
static forceinline
int __iAtomicAdd(int *p, int val)
{
return __nvvm_atom_add_gen_i((volatile int *)p, val);
}
就是那个原子操作是出问题了，但是哪里出问题了那？我找不到啊，求高手指点。

system · 2013 年7 月 27 日 04:56

LZ您好：

根据您提供的调试信息，错误为：

Memory Checker detected 67 access violations.
error = misaligned load (shared memory)

您出错的代码应该为：
atomicAdd((int *)&lCount[p], 1);

同是注意到您定义的shared memory数组为 shared unsigned char lCount[256];

您定义的数组为char数组，每个元素是1B的，读写char类型的变量，对齐要求也为1B。
您在使用atomicAdd的时候，为了照顾参数类型，进行了类型转换，但此时&lCount[p]这个地址只满足1B对齐，这不符合int类型读写要求的4B对齐，所以“error = misaligned load (shared memory)”。

我觉得您可以考虑改用int类型的lCount数组，请您在您的算法框架下考虑这一建议是否可行。

大致如此，祝您编码顺利~

system · 2013 年7 月 27 日 08:50

谢谢版主的详细解答，这个问题顺利解决，但是新的问题又来了。
pDst_tmp[bid256+tid] = lCount[tid];
pDst_tmp[bid256+tid + 128] =lCount[tid + 128];
这两句话也报错，是想把share memory中的数据拷贝到global 里，报错如下：
Memory Checker detected 256 access violations.
error = access violation on store (global memory)
gridid = 43
blockIdx = {0,0,0}
threadIdx = {0,0,0}
address = 0x0da0f154
accessSize = 1

Stepper Failed: Trying to step invalid warp.
Stepper Failed: Trying to step invalid warp.
CUDA grid launch failed: CUcontext: 177506600 CUmodule: 286913464 Function: Z4histPhS
CUDA grid launch failed: CUcontext: 177506600 CUmodule: 286913464 Function: Z4histPhS
CUDA grid launch failed: CUcontext: 177506600 CUmodule: 286913464 Function: Z17histCollectKernelPhiS
CUDA grid launch failed: CUcontext: 177506600 CUmodule: 286913464 Function: Z4histPhS
CUDA grid launch failed: CUcontext: 177506600 CUmodule: 286913464 Function: Z17histCollectKernelPhiS
CUDA grid launch failed: CUcontext: 177506600 CUmodule: 286913464 Function: _Z12InteEqualizePhS_ii
A CUDA context was created using API version 3010. Debugging requires API version 3020. Update the attached process to use a newer version of the CUDA API.
其中pDst_tmp我从原来的unsigned char 改成了int，为了跟share memoney 的类型保持一致。我改变思路，用原子操作：
atomicAdd((int )&pDst_tmp[bid256+tid], lCount[tid]);
atomicAdd((int )&pDst_tmp[bid256+tid + 128], lCount[tid + 128]);
其中pDst_tmp和lCount都是int类型，报错如下：
CUDA Memory Checker detected 32 threads caused an access violation:
Launch Parameters
CUcontext = 00e88920
CUstream = 0fdd7d20
CUmodule = 177f6640
CUfunction = 178a82e0
FunctionName = _Z4histPhPi
GridId = 43
gridDim = {2025,1,1}
blockDim = {128,1,1}
sharedSize = 1024
Parameters:
pSrc = 0x060a0000 0 ’
Memory Checker detected 32 access violations.
error = access violation on atomic (global memory)
gridid = 43
blockIdx = {0,0,0}
threadIdx = {0,0,0}
address = 0x0d81ee8c
accessSize = 4
错误又停留在了
static forceinline
int __iAtomicAdd(int *p, int val)
{
return __nvvm_atom_add_gen_i((volatile int *)p, val);
}
说明原子操作又错了，怎么回事那？我都改成了int了啊，怎么实现share mem 到global 的拷贝那？

system · 2013 年7 月 27 日 11:14

LZ您好：

1：根据您提供的报错信息：一次是Memory Checker detected 256 access violations.
error = access violation on store (global memory)，一次是Memory Checker detected 32 access violations.error = access violation on atomic (global memory)。这基本上说明您是访存越界了。第一次是存储，第二次是原子操作。
请您仔细检查pDst_tmp的定义，分配空间和使用时的指针情况。

2：要实现shared memory到global memory的copy，其实和其他copy相比并无特别之处。如果是写往global memory的不同位置，那么直接赋值即可。如果是累加等写往同一位置的操作，需要使用原子操作。
您上述出错不在于使用了原子操作，而是您指针跑飞了。您可以检查一下给pDst_tmp分配空间的时候，写的字节数是否乘上了sizeof(int)。以及您在使用的时候，各个指针指向情况如何。

祝您调试顺利~