cuda 找大素数

CPU端程序:
[attach]2839[/attach]
[attach]2840[/attach]
---------------------------------------------------------------分割线--------------------------------------------------------------------------------
Cuda:
当输入的素数上限较小时,程序输出和CPU端一样,当上限超过10000时,输出的素数总数大于CPU端的输出数。
有一些以5结尾的数也被输出来了。而且跑两遍程序得到的素数总数居然不一样。Debug两天了也没搞出来。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>
#define FALSE (0)
#define TRUE (~FALSE)
#define BLOCKNUM 64
#define THREADNUM 256

global void generatePrimes1( int *a, int *b )
{
int tid = blockIdx.x * BLOCKNUM + threadIdx.x;

int x, y, z;

int upper;
int limit;

x = tid + 1;

upper = *b;
limit = ceil( sqrt( 1.0 * upper ) );

while( x <= limit)
{		
for( y = 1; y <= limit; y++)
	{
		z = 4 * x * x + y * y;
		if( ( z <= upper ) && ( ( z % 12 == 1 ) || ( z % 12 == 5 ) ) )
			a[ z ] = ~a[ z ]; 
		z = 3 * x * x + y * y;
		if ( ( z <= upper ) && ( z % 12 == 7 ) )
			a[ z ] = ~a[ z ];
		z = ( 3 * x * x ) - ( y * y );
		if ( ( x > y ) && ( z <= upper ) && ( z % 12 == 11 ) )
			a[ z ] = ~a[ z ];
	}
__syncthreads();
x += blockDim.x * gridDim.x;
}

}

global void generatePrimes2( int *a, int *b )
{
int tid = blockIdx.x * BLOCKNUM + threadIdx.x;

int upper, limit;
int m,n;

upper = *b;
limit = ceil( sqrt( 1.0 * upper ) );

m = tid + 5;

while( m <= limit )
{
	if( a[ m ] )
	{
		for( n = m * m; n <= upper; n += m * m)
		{
			a[ n ] = FALSE;
		}
	}
m += blockDim.x * gridDim.x;
}

}

int main( void )
{
int upper;
int *cpuTemp;
int *cpuResult;
int i;

int *result;
int *temp;

int count;
int total = 0;//记录有多少个素数	

printf( "Please input an upper bound(integer):\n" );
scanf( "%d", &upper );
getchar();

cpuTemp = &upper;

//FALSE indicates the number is a composite
cpuResult = (int *) malloc( 1 + sizeof( int ) * upper );
for( i = 0; i <= upper; i++)
	cpuResult[ i ] = FALSE;



cudaMalloc( ( void ** )&result, ( upper + 1 ) * sizeof( int ) );
cudaMalloc( ( void ** )&temp, sizeof( int ) );


cudaMemcpy( result, cpuResult, 1 + sizeof( int ) * upper, cudaMemcpyHostToDevice );
cudaMemcpy( temp, cpuTemp, sizeof( int ), cudaMemcpyHostToDevice );


generatePrimes1<<< BLOCKNUM, THREADNUM >>>( result, temp );

generatePrimes2<<< BLOCKNUM, THREADNUM >>>( result, temp );


cudaMemcpy( cpuResult, result, 1 + sizeof( int ) * upper, cudaMemcpyDeviceToHost );

cudaFree( result );
cudaFree( temp );

cpuResult[ 2 ] = TRUE;
cpuResult[ 3 ] = TRUE;

//输出质数
for( count = 2; count <= upper; count++)
{
	if( cpuResult[ count ] )
	{
		total ++;
		printf( "%d\t", count );
	}
}


printf( "\nThe total number of primes in the range is %d\n", total );
puts( "" );
system( "pause" );
return 0;

}
急求帮助!万分感谢!

抱歉,
CPU端程序如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define FALSE (0)
#define TRUE (~FALSE)

int main( void )
{
long upper;
long limit;
long *result, i;
long x, y, z;
long m, n;
int count;
int total = 0;//记录有多少个素数

//user input
printf( “Please input an upper bound(integer):\n” );
scanf( “%ld”, &upper );
getchar();

//FALSE indicates the number is a composite
result = (long *) malloc( 1 + sizeof( long ) * upper );
for( i = 0; i <= upper; i++)
result[ i ] = FALSE;

limit = ceil( sqrt( 1.0 * upper ) );//double->int???

for( x = 1; x <= limit; x++)
{
for( y = 1; y <= limit; y++)
{
z = 4 * x * x + y * y;
if( ( z <= upper ) && ( ( z % 12 == 1 ) || ( z % 12 == 5 ) ) )
result[ z ] = ~result[ z ]; //TRUE表示此数为素数
z = 3 * x * x + y * y;
if ( ( z <= upper ) && ( z % 12 == 7 ) )
result[ z ] = ~result[ z ];
z = ( 3 * x * x ) - ( y * y );
if ( ( x > y ) && ( z <= upper ) && ( z % 12 == 11 ) )
result[ z ] = ~result[ z ];
}
}

for( m = 5; m <= limit; m++)
{
if( result[ m ] )
{
for( n = m * m; n <= upper; n += m * m)
{
result[ n ] = FALSE;
}
}
}

if( upper >= 2 )
{
result[ 2 ] = TRUE;
result[ 3 ] = TRUE;
}

//输出质数
for( count = 2 ; count <= upper; count++)
{
if( result[ count ] )
printf( “%d\t”, count );
}

puts( “” );
system( “pause” );
return 0;
}