// Kernel function
/*
 * matrixInv: one pivot step (pivot index i) of an in-place Gauss-Jordan
 * solve of A * X = B on row-major wA x wA matrices.
 *
 * Usage: launch once for every i = 0, 1, ..., wA-1, in that order, on a single
 * stream. If B holds the identity before step 0, B holds the inverse of A
 * after the last step.
 *
 * Parameters:
 *   A   device pointer, wA*wA floats, row-major. Used as workspace.
 *   B   device pointer, wA*wA floats, row-major. Right-hand side in, result out.
 *   i   pivot index of this step; out-of-range values make the kernel a no-op.
 *   wA  matrix dimension.
 *
 * Launch layout: a 2D grid of 2D blocks of any size, as long as the grid
 * spans at least wA threads in x (columns) and in y (rows). No shared memory
 * and no barriers are used.
 *
 * Why the step looks unusual: a single launch gives no ordering between
 * blocks, so a thread must never write a location that another thread reads
 * during the same step. The textbook step breaks that rule, because it
 * normalises pivot row i and zeroes column i while every other thread reads
 * them. Here:
 *   - columns 0..i of A are never written, so the elimination factors
 *     A[row][i] stay readable for the whole step;
 *   - pivot row i is never written during step i. Row i+1 is instead divided
 *     by its upcoming pivot while it is eliminated at step i (one step early),
 *     and A[i+1][i+1] is left untouched and treated as 1 from then on;
 *   - row 0 has no earlier step, so step 0 divides by A[0][0] on the fly and
 *     row 0 itself is normalised one step late, while eliminated at step 1.
 *
 * No row exchange is performed: a zero (or tiny) pivot yields inf/NaN.
 *
 * NOTE(review): A is left holding intermediate values, not the identity.
 * Confirm that no caller reads A after the final step.
 */
__global__ void matrixInv(float* A, float* B, int i, int wA)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads outside the matrix (grid tail) and invalid pivot indices do nothing.
    if (i < 0 || i >= wA || row >= wA || col >= wA)
        return;

    // Offsets are formed in size_t so row * wA cannot overflow int.
    const size_t n        = static_cast<size_t>(wA);
    const size_t pivotRow = static_cast<size_t>(i) * n;
    const size_t ownRow   = static_cast<size_t>(row) * n;

    if (row == i) {
        // The pivot row is read by every other row, so it is left alone.
        // A 1x1 system has no other step that could normalise it.
        if (wA == 1)
            B[0] = B[0] / A[0];
        return;
    }

    // Rows >= 1 were normalised one step early, so their pivot is implicitly 1.
    // Only row 0 still carries its real pivot value in A[0][0].
    const float pivot = (i == 0) ? A[0] : 1.0f;

    // Deferred normalisation of row 0, applied while it is eliminated at step 1.
    const float rowScale = (row == 0 && i == 1) ? 1.0f / A[0] : 1.0f;

    const float factor = rowScale * A[ownRow + i];
    const float pivotA = A[pivotRow + col] / pivot;
    const float pivotB = B[pivotRow + col] / pivot;

    float newA = rowScale * A[ownRow + col] - factor * pivotA;
    float newB = rowScale * B[ownRow + col] - factor * pivotB;

    const bool nextPivotRow = (row == i + 1);
    if (nextPivotRow) {
        // Value the diagonal entry of this row takes after elimination; every
        // thread of the row recomputes it from locations nobody writes now.
        const float nextPivot =
            A[ownRow + row] - factor * (A[pivotRow + row] / pivot);
        newA /= nextPivot;
        newB /= nextPivot;
    }

    B[ownRow + col] = newB;

    // Columns <= i of A are dead and must stay intact for concurrent readers;
    // A[i+1][i+1] is still being read by the other threads of row i+1.
    if (col > i && !(nextPivotRow && col == row))
        A[ownRow + col] = newA;
}
// Host side: launching the kernel
// Launch geometry: square TILE_WIDTH x TILE_WIDTH thread blocks, and enough
// blocks per side (rounded up) for the grid to span wA threads in x and in y.
const unsigned int tilesPerSide = (wA + TILE_WIDTH - 1) / TILE_WIDTH;
dim3 myblock(TILE_WIDTH, TILE_WIDTH);
dim3 mygrid(tilesPerSide, tilesPerSide);

start = clock();

// One launch per pivot index, in increasing order.
// NOTE(review): kernel launches return before the kernel has finished, so the
// matching clock() read (not visible here) should come after a synchronising
// call. No launch error is checked here either; confirm a later
// cudaGetLastError() covers these launches.
for (int i = 0; i < wA; ++i)
    matrixInv<<<mygrid, myblock>>>(d_A, d_B, i, wA);