矩阵乘法出错

大家好,新人求助:
我按《深入浅出CUDA》一文中最简单的方法实现了矩阵乘法,发现一个问题:如果矩阵规模小的话,比如两个500500的矩阵相乘,结果没问题,如果规模大一点,比如两个10001000的矩阵相乘,结果就会出错,结果矩阵中刚开始的元素全是0,而后面的元素全是-431602080.000000
不知道大家遇到过这个问题没?怎么解决呢?我的代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

#define NUM_THREADS 256

bool InitCUDA(void)
{
int count = 0;
int i = 0;

cudaGetDeviceCount(&count);
if(count == 0) {
	fprintf(stderr, "无支持设备.\n");
	return false;
}

for(i = 0; i < count; i++) {
	cudaDeviceProp prop;
	if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
		if(prop.major >= 1) {
			break;
		}
	}
}
if(i == count) {
	fprintf(stderr, "无设备支持CUDA.\n");
	return false;
}
cudaSetDevice(i);

printf("CUDA初始化成功.\n");
return true;

}

global static void KernelMatrixMultiply(const float* a, int a_width, int a_height, const float* b, int b_width, int b_height, float* c)
{
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int idx = bid * blockDim.x + tid;
const int row = idx / b_width;
const int column = idx % b_width;
int i;

if(row < a_height && column < b_width) {
	float t = 0;
	for(i = 0; i < a_width; i++) {
		t += a[row * a_width + i] * b[i * b_width + column];
	}
	c[row * b_width + column] = t;
}

}
void GenerateMatrix(float* a, int width, int height)
{
int i, j;

for(i = 0; i < height; i++) {
	for(j = 0; j < width; j++) {
					a[i * width + j] = 1.2;
	}
}

}

void MatrixMultiply(const float* a, int a_width, int a_height,const float* b, int b_width,int b_height, float* c)
{
int i, j, k;

for(i = 0; i < a_height; i++) {
	for(j = 0; j < b_width; j++) {
		float t = 0;
		for(k = 0; k < a_width; k++) {
			t += a[i * a_width + k] * b[k * b_width + j];
		}
		c[i * a_width + j] = t;
	}
}

}

void CompareMatrix(const float* a, int width, int height, const float* b)
{
float max_err = 0;
float average_err = 0;
int i, j;

for(i = 0; i < height; i++) {
	for(j = 0; j < width; j++) {
		if(b[i * width + j] != 0) {
			float err = fabs((a[i * width + j] - b[i * width + j]) / b[i * width + j]);
			if(max_err < err) max_err = err;
			average_err += err;
		}
	}
}

printf("最大误差: %g 平均误差: %g\n", max_err, average_err / (width * height));

}

clock_t MatrixMultiplyByCUDA(const float* a, int a_width, int a_height, const float* b, int b_width, int b_height, float* c)
{
float *ac, *bc, *cc;
clock_t start, end;

start = clock();
cudaMalloc((void**) &ac, sizeof(float) * a_height * a_width);
cudaMalloc((void**) &bc, sizeof(float) * b_height * b_width);
cudaMalloc((void**) &cc, sizeof(float) * a_height * b_width);

cudaMemcpy2D(ac, sizeof(float) * a_width, a, sizeof(float) * a_width, sizeof(float) * a_width, a_height, cudaMemcpyHostToDevice);
cudaMemcpy2D(bc, sizeof(float) * b_width, b, sizeof(float) * b_width, sizeof(float) * b_width, b_height, cudaMemcpyHostToDevice);

int blocks = (b_width + NUM_THREADS - 1) / NUM_THREADS;
KernelMatrixMultiply<<<blocks * a_height, NUM_THREADS>>>(ac, a_width, a_height, bc, b_width, b_height, cc);

cudaMemcpy2D(c, sizeof(float) * b_width, cc, sizeof(float) * b_width,sizeof(float) * b_width, b_width, cudaMemcpyDeviceToHost);

cudaFree(ac);
cudaFree(bc);
cudaFree(cc);

end = clock();

return end - start;

}

int main()
{
float *a, *b, *c, *d;
int n = 1000;
int i;
if(!InitCUDA()) return 0;

a = (float*) malloc(sizeof(float) * n * n);
b = (float*) malloc(sizeof(float) * n * n);
c = (float*) malloc(sizeof(float) * n * n);
d = (float*) malloc(sizeof(float) * n * n);

srand(0);

GenerateMatrix(a, n, n);
GenerateMatrix(b, n, n);

clock_t time = MatrixMultiplyByCUDA(a,n,n,b,n,n,c);

double sec = (double) time / CLOCKS_PER_SEC;
printf("GPU计算耗时: %.2f秒.\n", sec, 2.0 * n * n * n / (sec * 1E9));
clock_t start = clock();
MatrixMultiply(a,n,n,b,n,n,d);
clock_t end = clock();
printf("CPU计算耗时: %.2f秒.\n", (double)(end-start)/CLOCKS_PER_SEC);
CompareMatrix(c,n,n,d);
int gap = n * n / 10;
for(i = 0; i< n * n; i += gap)
	printf("a[%d]=%f,b[%d]=%f,c[%d]=%f,d[%d]=%f\n", i,a[i],i,b[i],i,c[i],i,d[i]);
free(a);
free(b);
free(c);
free(d);
return 0;

}

运行结果如下(正确的结果应该是数组c和数组d的对应项相等):
CUDA初始化成功.
GPU计算耗时: 13.42秒.
CPU计算耗时: 16.75秒.
最大误差: 299728 平均误差: 220491
a[0]=1.200000,b[0]=1.200000,c[0]=0.000000,d[0]=1439.984619
a[100000]=1.200000,b[100000]=1.200000,c[100000]=0.000000,d[100000]=1439.984619
a[200000]=1.200000,b[200000]=1.200000,c[200000]=0.000000,d[200000]=1439.984619
a[300000]=1.200000,b[300000]=1.200000,c[300000]=-431602080.000000,d[300000]=1439.984619
a[400000]=1.200000,b[400000]=1.200000,c[400000]=-431602080.000000,d[400000]=1439.984619
a[500000]=1.200000,b[500000]=1.200000,c[500000]=-431602080.000000,d[500000]=1439.984619
a[600000]=1.200000,b[600000]=1.200000,c[600000]=-431602080.000000,d[600000]=1439.984619
a[700000]=1.200000,b[700000]=1.200000,c[700000]=-431602080.000000,d[700000]=1439.984619
a[800000]=1.200000,b[800000]=1.200000,c[800000]=-431602080.000000,d[800000]=1439.984619
a[900000]=1.200000,b[900000]=1.200000,c[900000]=-431602080.000000,d[900000]=1439.984619

为什么会出现这个问题呢?我的机器配置如下:
CPU:DualCore Intel Pentium D 930, 3000 MHz (15 x 200)
内存:1G
显卡:NVIDIA GeForce 9300 GE 显存512MB

[ 本帖最后由 jimmy19850511 于 2010-3-15 20:56 编辑 ]