下面的程序,为什么只能做16倍数的矩阵?谢谢
global static void mat_MultCUDA_2(const float *a,int aRow,int aCol,
const float *b,int bRow,int bCol,float *C)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int nResult = (bx*blockDim.x+tx)aRow+byblockDim.y+ty;
shared float as[BLOCK_SIZE][BLOCK_SIZE];
shared float bs[BLOCK_SIZE][BLOCK_SIZE];
float result = 0.0;
int aBegin = byBLOCK_SIZE;
int aStep = BLOCK_SIZEaRow;
int bBegin = bxBLOCK_SIZEbRow;
int bStep = BLOCK_SIZE;
int bEnd = bBegin+bRow-1;
for (int i=aBegin,j=bBegin;j<=bEnd;i+=aStep,j+=bStep)
{
if(tx+bxBLOCK_SIZE<aCol && ty+byBLOCK_SIZE<aRow)
{
as[ty][tx] = a[i+aRow*tx+ty];
}
else
{
as[ty][tx] = 0;
}
if(tx+bxBLOCK_SIZE<bCol && ty+byBLOCK_SIZE<bRow)
{
bs[ty][tx] = b[j+bRowtx+ty];
}
else
bs[ty][tx]=0;
/
as[ty][tx] = a[i + aRow * tx + ty];
bs[ty][tx] = b[j + bRow * tx + ty];*/
__syncthreads();
//if(nResult<aRow*bCol)
for (int k = 0; k < BLOCK_SIZE; ++k)
result += as[ty][k] * bs[k][tx];
__syncthreads();
}
C[nResult] = result;
}
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid((M+BLOCK_SIZE-1)/BLOCK_SIZE,(L+BLOCK_SIZE-1)/BLOCK_SIZE);
mat_MultCUDA_2<<<dimGrid,dimBlock>>>(d_a,L,N,d_b,N,M,d_result);