Emulation 结果和release 结果不一样(附hash function 源码)

有哪位高手可以帮忙看看, 不胜感激

I am trying to port the MurmurHash2 hash function to run on the GPU. The result is correct in emulation mode, but it is wrong when I run in release mode.
It seems to be related to thread concurrency. Currently I use just 1 block with 256 threads, so the setup is very simple.

输入:asldflsdasldflsdfsaldkjfslkj8i22sadljfl-90jjjjlk-0js24askl
输出结果,emulation是正确的,release 就不对了
Value of emulation: Value of release:
3436920429 3436920429
3004684928 3436920429
2677509444 3436920429
3664452983 3436920429
3183232068 3183232068
913038785 3183232068
3039185496 3183232068

//Original MurmurHash2 is to calculate hash value of a string, which is invented by MIT
//The function is to calculate hash value of sub strings in string p_date and then save them in array h
//plen: length of a sub string
// *totalLen is the length is the string p_data

global static void MurmurHash2( const char * p_data, int* plen, int* totalLen, unsigned int *h){

//int index = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int m = 0x5bd1e995;
const int r = 24;

// Initialize the hash to a 'random' value
unsigned int seed = 0;

// Mix 4 bytes at a time into the hash

int i,index,len;
const int tid = threadIdx.x;
int debIdex=tid*15;
for(i = tid; i < *totalLen - (*plen-1); i += THREAD_NUM ) {

index = i;
len =*plen;

h[index] = seed ^ len;

const unsigned char * data = (const unsigned char *)&p_data[i];

	while(len >= 4)
	{
		unsigned int k = *(unsigned int *) data;

		k *= m; 
		k ^= k >> r; 
		k *= m; 
		
		h[index] *= m;  // emulation 模式是正确的,release 的错误从这里开始发生
		h[index] ^= k;
		
		data += 4;
		len -= 4;
	}
	
	
	// Handle the last few bytes of the input array

	switch(len)
	{
	case 3: (h[index]) ^=  data[2] << 16;
	case 2: (h[index]) ^=  data[1] << 8;
	case 1: (h[index]) ^=  data[0];
	        (h[index]) *= m;
	};

	// Do a few final mixes of the hash to ensure the last few
	// bytes are well-incorporated.

	h[index] ^= h[index] >> 13;
	h[index] *= m;
	h[index] ^= h[index] >> 15;

}

}