2048×2048 的矩阵相乘在 GPU 上运行不出结果

为什么我的程序在 GPU 上运行不出结果？
我用的显卡是 GeForce 9400 GT，
线程块（block）的大小是 16×16。

你把程序贴出来看看吧。

include <cutil_inline.h>
define BLOCK_SIZE 16
define DATA_SIZE 512
global void Kernel(float Ad, float Bd, floatCd)
{
int bx=blockIdx.x;
int by=blockIdx.y;
int tx=threadIdx.x;
int ty=threadIdx.y;
int aBegin=DATA_SIZE
BLOCK_SIZEby;
int aEnd=aBegin+DATA_SIZE-1;
int bBegin=BLOCK_SIZE
bx;
float sub=0;
for(int a=aBegin,b=bBegin;a<=aEnd;a+=BLOCK_SIZE,b+=BLOCK_SIZEDATA_SIZE)
{
shared float A[BLOCK_SIZE][BLOCK_SIZE];
shared float B[BLOCK_SIZE][BLOCK_SIZE];
if(a+tx<aBegin+DATA_SIZE && by
BLOCK_SIZE+ty<DATA_SIZE) //对A,B的限制是不同的
{
A[tx][ty]=Ad[a+DATA_SIZE*ty+tx];
}
else
{
A[tx][ty]=0;

}
if( b+tyDATA_SIZE<bBegin+DATA_SIZEDATA_SIZE && bx*BLOCK_SIZE+tx<DATA_SIZE )
{
B[tx][ty]=Bd[b+DATA_SIZE*ty+tx];
}
else
{
B[tx][ty]=0;

}
__syncthreads();

for(int k=0;k<BLOCK_SIZE;k++)
{
sub+=A[k][ty]*B[tx][k];
}
__syncthreads();

}
if(bxBLOCK_SIZE+tx<DATA_SIZE && byBLOCK_SIZE+ty<DATA_SIZE)
{
int c=DATA_SIZEBLOCK_SIZEby+BLOCK_SIZEbx;
Cd[c+DATA_SIZE
ty+tx]=sub;

}