CPU端程序:
[attach]2839[/attach]
[attach]2840[/attach]
---------------------------------------------------------------分割线--------------------------------------------------------------------------------
Cuda:
当输入的素数上限较小时,程序输出和CPU端一样,当上限超过10000时,输出的素数总数大于CPU端的输出数。
有一些以5结尾的数也被输出来了。而且跑两遍程序得到的素数总数居然不一样。Debug两天了也没搞出来。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>
#define FALSE (0)
#define TRUE (~FALSE)
#define BLOCKNUM 64
#define THREADNUM 256
global void generatePrimes1( int *a, int *b )
{
int tid = blockIdx.x * BLOCKNUM + threadIdx.x;
int x, y, z;
int upper;
int limit;
x = tid + 1;
upper = *b;
limit = ceil( sqrt( 1.0 * upper ) );
while( x <= limit)
{
for( y = 1; y <= limit; y++)
{
z = 4 * x * x + y * y;
if( ( z <= upper ) && ( ( z % 12 == 1 ) || ( z % 12 == 5 ) ) )
a[ z ] = ~a[ z ];
z = 3 * x * x + y * y;
if ( ( z <= upper ) && ( z % 12 == 7 ) )
a[ z ] = ~a[ z ];
z = ( 3 * x * x ) - ( y * y );
if ( ( x > y ) && ( z <= upper ) && ( z % 12 == 11 ) )
a[ z ] = ~a[ z ];
}
__syncthreads();
x += blockDim.x * gridDim.x;
}
}
global void generatePrimes2( int *a, int *b )
{
int tid = blockIdx.x * BLOCKNUM + threadIdx.x;
int upper, limit;
int m,n;
upper = *b;
limit = ceil( sqrt( 1.0 * upper ) );
m = tid + 5;
while( m <= limit )
{
if( a[ m ] )
{
for( n = m * m; n <= upper; n += m * m)
{
a[ n ] = FALSE;
}
}
m += blockDim.x * gridDim.x;
}
}
int main( void )
{
int upper;
int *cpuTemp;
int *cpuResult;
int i;
int *result;
int *temp;
int count;
int total = 0;//记录有多少个素数
printf( "Please input an upper bound(integer):\n" );
scanf( "%d", &upper );
getchar();
cpuTemp = &upper;
//FALSE indicates the number is a composite
cpuResult = (int *) malloc( 1 + sizeof( int ) * upper );
for( i = 0; i <= upper; i++)
cpuResult[ i ] = FALSE;
cudaMalloc( ( void ** )&result, ( upper + 1 ) * sizeof( int ) );
cudaMalloc( ( void ** )&temp, sizeof( int ) );
cudaMemcpy( result, cpuResult, 1 + sizeof( int ) * upper, cudaMemcpyHostToDevice );
cudaMemcpy( temp, cpuTemp, sizeof( int ), cudaMemcpyHostToDevice );
generatePrimes1<<< BLOCKNUM, THREADNUM >>>( result, temp );
generatePrimes2<<< BLOCKNUM, THREADNUM >>>( result, temp );
cudaMemcpy( cpuResult, result, 1 + sizeof( int ) * upper, cudaMemcpyDeviceToHost );
cudaFree( result );
cudaFree( temp );
cpuResult[ 2 ] = TRUE;
cpuResult[ 3 ] = TRUE;
//输出质数
for( count = 2; count <= upper; count++)
{
if( cpuResult[ count ] )
{
total ++;
printf( "%d\t", count );
}
}
printf( "\nThe total number of primes in the range is %d\n", total );
puts( "" );
system( "pause" );
return 0;
}
急求帮助!万分感谢!