我在和函数中使用了数组锁,参照的是cuda by example中的最后的散列表的实现。最后用到了cudaThreadSynchronize()。但在运行中HANDLE_ERROR(cudaThreadSynchronize())提示我有未知错误。请问,有可能的问题是什么呢?
一个简单的代码如下:
struct Lock
{
int *mutex;
Lock()
{
int state = 0;
cudaMalloc(&mutex, sizeof(int));
cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
}
~Lock()
{
cudaFree(mutex);
}
device void lock()
{
while(atomicCAS(mutex, 0, 1) != 0);
}
device void unlock()
{
atomicExch(mutex, 0);
}
};
global void test(int *t, int size, Lock *lock)
{
int tid = threadIdx.x;
for(int i = 0; i < 32; i++)
if ((tid % 32) == i)
{
lock[tid].lock();
t[tid] = tid;
lock[tid].unlock();
}
}
int main()
{
Lock *lock = (Lock *)malloc(sizeof(Lock) * 10);
Lock *dev_lock;
cudaMalloc(&dev_lock, sizeof(Lock) * 10);
cudaMemcpy(dev_lock, lock, sizeof(Lock) * 10, cudaMemcpyHostToDevice);
int *t;
cudaMalloc(&t, sizeof(int) * 10);
test<<<1, 10>>>(t, 10, dev_lock);
cudaThreadSynchronize();
//HANDLE_ERROR(cudaThreadSynchronize());
int h_t[10];
cudaMemcpy(h_t, t, sizeof(int) * 10, cudaMemcpyDeviceToHost);
for(int i = 0; i < 10; i++)
cout << h_t[i] << endl;
return 0;
}
在最后的输出中,h_t所打印的为何并非所期望的