matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
// Block index
size_type bx = blockIdx.x;
size_type by = blockIdx.y;
// Thread index
size_type tx = threadIdx.x;
size_type ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
size_type aBegin = wA * block_size * by;
// Index of the last sub-matrix of A processed by the block
size_type aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
size_type aStep = block_size;
// Index of the first sub-matrix of B processed by the block
size_type bBegin = block_size * bx;
// Step size used to iterate through the sub-matrices of B
size_type bStep = block_size * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (size_type a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep)
{ // printf("the WB is %d\n",wB);
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[block_size][block_size];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[block_size][block_size];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Matrix dimensions for this example, each expressed as a whole multiple of
// block_size (block_size itself is defined outside this excerpt).
// B's height is tied to A's width, and C takes its width from B and its
// height from A -- the shape constraints of a matrix product of A and B.
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
今天在学习这个矩阵相乘的例子,
但是实在没有看懂。
我也在纸上画了画矩阵,还是没有看懂。
第一个 for 循环(也就是遍历 A 和 B 的各个子矩阵的那个循环)是什么作用呢?
这种按子矩阵(分块)遍历矩阵的方法,在矩阵相乘的计算中经常用到吗?
作为初学者,我还有一个问题:如何更好地学习 CUDA 编程?本人编程基础一般,多谢版主指点一下。