关于矩阵乘报错,求助各位大侠

大家好,新人求助:
我刚接触CUDA,自己按照《高性能运算之CUDA》,编了一个最简单的矩阵乘法,发现两个问题:
1.如果矩阵规模小的话,比如两个500×500的矩阵相乘,结果没问题,但是把block分块改成不是32的倍数,比如(30,4)的话,就会显卡报错、黑屏一下。
2.如果block选择(16,16)分块,规模小没问题,规模大一点,比如两个2000×2000的矩阵相乘,也会出现显卡报错、黑屏。还试了960×960、970×970、980×980、990×990、1000×1000、1024×1024都没问题,但是试了不是10的倍数的如999×999,还有上1000的1020×1020也会出错,也是显卡报错,黑屏一下再跳出来。不知道是不是代码写的有问题。大家遇到过这种问题没有?

我用的是dell的工作站,显卡是FX1800。

代码如下(为了方便,选择的都是方阵,直接写的数字。)
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>

//#include <omp.h>
//#define BLOCK_SIZE 16;

// Naive matrix multiply C = A * B for square 999x999 row-major matrices.
// Expects a 2D launch whose grid covers at least 999x999 threads; each
// thread computes one element C[row][col].
//
// Bug fix (the cause of the reported crash/blank-screen): the original
// guard sat inside the dot-product loop but the final store to C was
// unguarded. With a 16x16 block and N=999 the grid rounds up to 63x63
// blocks = 1008x1008 threads, so threads with row/col >= 999 wrote past
// the end of C (out-of-bounds global write -> display driver reset).
// That is exactly why sizes that are exact multiples of the block
// dimensions (960, 1024, ...) worked and 999/1020 did not.
__global__ void MatMulKernel(float* A, float* B, float* C)
{
    const unsigned int A_width = 999;   // columns of A == rows of B
    const unsigned int B_width = 999;   // columns of B
    const unsigned int C_width = 999;   // columns of C

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds check BEFORE any memory access: excess threads from the
    // rounded-up grid must do nothing at all.
    if (row >= 999 || col >= 999)
        return;

    float Cvalue = 0.0f;
    for (int e = 0; e < A_width; ++e)
        Cvalue += A[row * A_width + e] * B[e * B_width + col];

    C[row * C_width + col] = Cvalue;
    // Note: no __syncthreads() needed here — each thread writes only its
    // own element and no shared memory is involved. (The original one was
    // a no-op at best; after an early return it would also be divergent.)
}

// Fill data[0..size-1] with pseudo-random integer values in [0, 9).
//
// Bug fix: the original body read `data = rand() % 9;`, which reassigns
// the local pointer instead of writing the element (the `[i]` was almost
// certainly eaten as BBCode italics when the code was posted), leaving
// the whole buffer uninitialized.
void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = (float)(rand() % 9);
}

// Host driver: builds two random 999x999 matrices, multiplies them on the
// GPU (timed), then recomputes the product on the CPU (timed) and prints
// the bottom-right 10x10 corner of both results for visual comparison.
//
// Fixes versus the posted version: stray `[/i]` BBCode artifact removed,
// `char* argv` corrected to `char** argv`, all "smart quotes" in printf
// format strings replaced with plain ASCII quotes (they would not
// compile), the repeated literal 999 replaced by the named dimension
// constants, and the kernel's asynchronous errors are now checked via
// the synchronize call instead of being silently dropped.
int main(int argc, char** argv)
{
    // Matrix dimensions (square matrices; must match the constants
    // hard-coded in MatMulKernel).
    unsigned int A_width  = 999;
    unsigned int A_height = 999;
    unsigned int B_width  = 999;
    unsigned int B_height = 999;
    unsigned int C_width  = 999;
    unsigned int C_height = 999;

    // Allocate and randomly initialize the host input matrices.
    unsigned int size_A = A_width * A_height;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*)malloc(mem_size_A);

    unsigned int size_B = B_width * B_height;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*)malloc(mem_size_B);

    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // Print the top-left 10x10 corner of each input as a sanity check.
    for (int i = 0; i < 10; i++)
    {
        for (int j = 0; j < 10; j++)
            printf("%6.2f ", h_A[A_width * i + j]);
        printf("\n");
    }
    printf("\n");

    for (int i = 0; i < 10; i++)
    {
        for (int j = 0; j < 10; j++)
            printf("%6.2f ", h_B[B_width * i + j]);
        printf("\n");
    }
    printf("\n");

    // Allocate device memory and upload the inputs.
    float* d_A;
    cutilSafeCall(cudaMalloc((void**) &d_A, mem_size_A));
    float* d_B;
    cutilSafeCall(cudaMalloc((void**) &d_B, mem_size_B));

    cutilSafeCall(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));

    // Device and host buffers for the result.
    unsigned int size_C = C_width * C_height;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* d_C;
    cutilSafeCall(cudaMalloc((void**) &d_C, mem_size_C));
    float* h_C = (float*)malloc(mem_size_C);

    // Launch configuration: 16x16 blocks, grid rounded UP so the whole
    // matrix is covered (the kernel bounds-checks the excess threads).
    dim3 dimBlock(16, 16);
    dim3 dimGrid((C_width  + dimBlock.x - 1) / dimBlock.x,
                 (C_height + dimBlock.y - 1) / dimBlock.y);

    unsigned int timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    // Launch the kernel; the synchronize both completes the timing and
    // surfaces any asynchronous execution error from the launch.
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cutilSafeCall(cudaThreadSynchronize());

    // Download the GPU result.
    cutilSafeCall(cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost));

    cutilCheckError( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    // Print the bottom-right 10x10 corner of the GPU result.
    for (int i = C_height - 10; i < (int)C_height; i++)
    {
        for (int j = C_width - 10; j < (int)C_width; j++)
            printf("%6.2f ", h_C[C_width * i + j]);
        printf("\n");
    }
    printf("\n");

    // CPU reference computation (timed the same way).
    timer = 0;
    cutilCheckError( cutCreateTimer( &timer));
    cutilCheckError( cutStartTimer( timer));

    float* D = (float*)malloc(mem_size_C);

    //#pragma omp parallel for
    for (int i = 0; i < (int)C_height; i++)
    {
        for (int j = 0; j < (int)C_width; j++)
        {
            float Dvalue = 0.0f;
            for (int k = 0; k < (int)A_width; k++)
                Dvalue += h_A[i * A_width + k] * h_B[k * B_width + j];
            D[i * C_width + j] = Dvalue;
        }
    }

    cutilCheckError( cutStopTimer( timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    cutilCheckError( cutDeleteTimer( timer));

    // Print the same corner of the CPU result for comparison.
    for (int i = C_height - 10; i < (int)C_height; i++)
    {
        for (int j = C_width - 10; j < (int)C_width; j++)
            printf("%6.2f ", D[C_width * i + j]);
        printf("\n");
    }

    // Release all host and device resources.
    free(h_A);
    free(h_B);
    free(h_C);
    free(D);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    CUT_EXIT(argc, argv);
    return 0;
}