这是一个比较简单的练习程序,用来统计给定数组中每个值的个数,当用一个线程执行时结果正确,每个值的个数的和(sum)等于值的总个数(DATA_SIZE)。但是,当用多个线程执行时,最终统计的值个数小于值的总个数。帮忙看看程序如何改进能在多线程时得出正确的结果,谢谢!
C/C++ code
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define DATA_SIZE 1048576
#define DATA_MAX 16 //输入数据值的范围
#define BLOCK_NUM 32 //block数量
#define THREAD_NUM 128 //thread 数量
int h_in[ DATA_SIZE ];
int h_out[ DATA_MAX ];
//
global void BlendTest( int* d_in, int* d_out);
//
//初始化数据
void InitData( int* data, int dataSize )
{
for( int i=0; i<dataSize; i++ )
{
data[i] = rand()%DATA_MAX;
}
}
int main()
{
InitData( h_in, DATA_SIZE );
int d_in, d_out;
cudaMalloc( (void) &d_in, sizeof(int) * DATA_SIZE);
cudaMalloc( (void**) &d_out, sizeof(int) * DATA_MAX);
cudaMemcpy( d_in, h_in, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
BlendTest<<<BLOCK_NUM, THREAD_NUM, 0>>>( d_in, d_out );
cudaMemcpy( h_out, d_out, sizeof(int) * DATA_MAX, cudaMemcpyDeviceToHost);
int sum = 0;
for( int i=0; i<DATA_MAX; i++)
{
printf( “h_out[ %d ] = %d ;\r\n”, i, h_out[i] );
sum += h_out[i];
}
//sum的值应该等于DATA_SIZE,但是在多个线程时错误,小于DATA_SIZE
printf( " sum = %d ;\r\n", sum );
}
global void BlendTest( int* d_in, int* d_out)
{
int tid = threadIdx.x+blockIdx.x * THREAD_NUM;
for(int i = tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
{
d_out[ d_in[i] ] += 1; //计数输入的值
}
}