Two ways of copying memory give different computation results

//A 32-bit address must be used here, because it is stored in a register
     uint32_t As_address = __cvta_generic_to_shared(&As[0]) + ((innerRowA + offset) * BK + innerColA * 4) * sizeof(float);
     float4* A_address = (float4*)(A + (((innerRowA + offset) * K + innerColA * 4) * sizeof(float)));
     CP_ASYNC_CG(As_address, A_address, COPY_ASYNC_A_SIZE);
     //Direct (synchronous) copy of the same 16 bytes, for comparison
     reinterpret_cast<float4 *>(
       &As[(innerRowA + offset) * BK + innerColA * 4])[0] =
       reinterpret_cast<const float4 *>(
           &A[((innerRowA + offset) * K + innerColA * 4)])[0];

One version copies with cp.async (the CP macro), the other is a plain direct copy; after synchronizing, the computed results are different (a sketch of how I compare the outputs is at the end of the post).
Here is the code for the CP macro:
#define CP_ASYNC_CG(dst, src, Bytes) \
  asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n" ::"r"(dst), "l"(src), "n"(Bytes))
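
For reference, the CP_ASYNC_COMMIT_GROUP and CP_ASYNC_WAIT_GROUP macros used in the kernel below are thin wrappers around the corresponding PTX instructions; their definitions are not shown in this post, so here is a sketch of what they look like (reconstructed, treat it as an approximation rather than the exact header contents):

#define CP_ASYNC_COMMIT_GROUP() \
  asm volatile("cp.async.commit_group;\n" ::)
#define CP_ASYNC_WAIT_GROUP(N) \
  asm volatile("cp.async.wait_group %0;\n" ::"n"(N))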

Here is the complete code:

#include "sgemm_maxwell.cuh"
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
typedef unsigned int uint;
const int WARPSIZE = 32;
//ldmatrix: using tensor cores requires the shared-memory tile size to match the memory size the tensor core instruction operates on

//One async copy transfers 16 bytes = 128 bits
#define COPY_ASYNC_A_SIZE 16
#define COPY_ASYNC_B_SIZE 16

namespace wt {
  //Loads data into As and Bs; independent of the warp layout
  template <const int BM, const int BN, const int BK, const int rowStrideA,
            const int rowStrideB>
  __device__ void loadFromGmem(int N, int K, const float *A, const float *B,
                               float *As, float *Bs, int innerRowA, int innerColA,
                               int innerRowB, int innerColB) {
    //In one step the block only loads BK * rowStrideA elements
    for (uint offset = 0; offset + rowStrideA <= BM; offset += rowStrideA) {
      //A 32-bit address must be used here, because it is stored in a register
      uint32_t As_address = __cvta_generic_to_shared(&As[0]) + ((innerRowA + offset) * BK + innerColA * 4) * sizeof(float);
      float4* A_address = (float4*)(A + (((innerRowA + offset) * K + innerColA * 4) * sizeof(float)));
      CP_ASYNC_CG(As_address, A_address, COPY_ASYNC_A_SIZE);
      //Direct (synchronous) copy of the same elements, for comparison
      reinterpret_cast<float4 *>(
        &As[(innerRowA + offset) * BK + innerColA * 4])[0] =
        reinterpret_cast<const float4 *>(
            &A[((innerRowA + offset) * K + innerColA * 4)])[0];
    }
    for (uint offset = 0; offset + rowStrideB <= BK; offset += rowStrideB) {
      //A 32-bit address must be used here, because it is stored in a register
      uint32_t Bs_address = __cvta_generic_to_shared(&Bs[0]) + ((innerRowB + offset) * BN + innerColB * 4) * sizeof(float);
      float4* B_address = (float4*)(B + (((innerRowB + offset) * N + innerColB * 4) * sizeof(float)));
      CP_ASYNC_CG(Bs_address, B_address, COPY_ASYNC_B_SIZE);
    }
  }

  //Processes the data in SMEM (the dot products); depends on the warp layout
  //All of these template parameters refer to the C matrix (except BK, BN, BM)
  template <const int BM, const int BN, const int BK, const int WM, const int WN,
            const int WMITER, const int WNITER, const int WSUBM, const int WSUBN,
            const int TM, const int TN>
  __device__ void
  processFromSmem(float *regM, float *regN, float *threadResults, const float *As,
                  const float *Bs, const uint warpRow, const uint warpCol,
                  const uint threadRowInWarp, const uint threadColInWarp) {
    //dotIdx walks over the BK elements of the K-slice: each result needs BK multiply-adds
    //dotIdx ensures a thread visits every element it has to process
    for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
      // populate registers for whole warptile
      // regM can hold register elements for several warp subtiles: load WMITER x TM elements
      for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
        //Load TM elements into registers
        for (uint i = 0; i < TM; ++i) { //load TM elements
          // No transpose here: As is BM x BK (row-major)
          // - dotIdx: column inside the current K-slice
          // - warpRow * WM: row offset of the warp inside the blocktile
          // - wSubRowIdx * WSUBM: row offset of the current warp subtile
          // - threadRowInWarp * TM: row offset of the thread inside the warp subtile
          // - i: index of the element this thread handles
          // - each dotIdx step reads one column of the As tile
          regM[wSubRowIdx * TM + i] = As[dotIdx + (warpRow * WM + wSubRowIdx * WSUBM + threadRowInWarp * TM + i) * BK];
        }
      }

      for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
        // regN can hold register elements for several warp subtiles: load WNITER x TN elements
        for (uint i = 0; i < TN; ++i) {
          // - dotIdx * BN: start of the current row of Bs (walks over all rows of B)
          // - warpCol * WN: column offset of the warp inside the blocktile -> selects the warpCol-th warp
          // - wSubColIdx * WSUBN: column offset of the current warp subtile
          // - threadColInWarp * TN: column offset of the thread inside the warp
          // - i: index of the element this thread handles
          // - warpCol * WN + wSubColIdx * WSUBN + threadColInWarp * TN + i: the corresponding column of C
          regN[wSubColIdx * TN + i] = Bs[(dotIdx * BN) + warpCol * WN + wSubColIdx * WSUBN + threadColInWarp * TN + i];
        }
      }

      // execute warptile matmul
      for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
        for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
          // calculate per-thread results
          for (uint resIdxM = 0; resIdxM < TM; ++resIdxM) {
            for (uint resIdxN = 0; resIdxN < TN; ++resIdxN) {
              //wSubRowIdx * TM + resIdxM: row
              //wSubColIdx * TN + resIdxN: column
              threadResults[(wSubRowIdx * TM + resIdxM) * (WNITER * TN) +
                            (wSubColIdx * TN) + resIdxN] +=
                  regM[wSubRowIdx * TM + resIdxM] *
                  regN[wSubColIdx * TN + resIdxN];
            }
          }
        }
      }
    }
  }
  
} // namespace wt


template <const int BM, const int BN, const int BK, const int WM, const int WN,
          const int WNITER, const int TM, const int TN, const int NUM_THREADS>
__global__ void __launch_bounds__(NUM_THREADS)
sgemm_tensorcore_kernel(int M, int N, int K, float alpha, float *A, float *B,
                    float beta, float *C) {
  const uint cRow = blockIdx.y;
  const uint cCol = blockIdx.x;

  // Placement of the warp in the threadblock tile
  // BN / WN = number of warps needed in the x direction to cover one blocktile
  const uint warpIdx = threadIdx.x / WARPSIZE; // the warp this thread is in
  const uint warpCol = warpIdx % (BN / WN);
  const uint warpRow = warpIdx / (BN / WN);

  // size of the warp subtile
  // WM * WN = WARPSIZE * TM * TN * WNITER * WMITER
  constexpr uint WMITER = (WM * WN) / (WARPSIZE * TM * TN * WNITER);
  // WSUBM, WSUBN: amount of data the warp handles along the M and N dimensions per iteration
  constexpr uint WSUBM = WM / WMITER; // e.g. 64 / 1 = 64 with the launch parameters below
  constexpr uint WSUBN = WN / WNITER; // e.g. 64 / 4 = 16

  // Placement of the thread in the warp subtile
  const uint threadIdxInWarp = threadIdx.x % WARPSIZE;         // [0, 31]
  // WSUBN / TN = number of threads needed in the x direction to cover one warp subtile
  // thread coordinates within a single warp subtile
  const uint threadColInWarp = threadIdxInWarp % (WSUBN / TN); // i%(16/4)
  const uint threadRowInWarp = threadIdxInWarp / (WSUBN / TN); // i/4

  // allocate space for the current blocktile in SMEM
  __shared__ float As[BM * BK];
  __shared__ float Bs[BK * BN];
  
  // Move blocktile to beginning of A's row and B's column
  A += cRow * BM * K;
  B += cCol * BN;
  // Move C_ptr to warp's output tile
  C += cRow * BM * N + cCol * BN + warpRow * WM * N + warpCol * WN;

  // calculating the indices that this thread will load into SMEM
  // we'll load 128bit / 32bit = 4 elements per thread at each step
  const uint innerRowA = threadIdx.x / (BK / 4);
  const uint innerColA = threadIdx.x % (BK / 4);
  // Number of rows the block loads per step (the stride); note NUM_THREADS * 4 is not necessarily the total number of elements the block has to load
  // BM = rowStrideA * t (t an integer)
  constexpr uint rowStrideA = (NUM_THREADS * 4) / BK;
  const uint innerRowB = threadIdx.x / (BN / 4);
  const uint innerColB = threadIdx.x % (BN / 4);
  // BK = rowStrideB * t (t an integer)
  constexpr uint rowStrideB = (NUM_THREADS * 4) / BN;

  // allocate thread-local cache for results in registerfile
  // WMITER, WNITER: number of warp iterations along the corresponding dimension
  // Number of result elements a single thread in the warp computes
  float threadResults[WMITER * TM * WNITER * TN] = {0.0};
  // we cache into registers on the warptile level
  float regM[WMITER * TM] = {0.0};
  float regN[WNITER * TN] = {0.0};

  // outer-most loop over block tiles
  for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
    wt::loadFromGmem<BM, BN, BK, rowStrideA, rowStrideB>(
        N, K, A, B, As, Bs, innerRowA, innerColA, innerRowB, innerColB);
    //Commit the async copy group and wait for it to finish
    CP_ASYNC_COMMIT_GROUP();
    CP_ASYNC_WAIT_GROUP(0);
    __syncthreads();
    wt::processFromSmem<BM, BN, BK, WM, WN, WMITER, WNITER, WSUBM, WSUBN, TM,
                        TN>(regM, regN, threadResults, As, Bs, warpRow, warpCol,
                            threadRowInWarp, threadColInWarp);
    A += BK;     // move BK columns to right
    B += BK * N; // move BK rows down
    __syncthreads();
  }

  // write out the results
  for (uint wSubRowIdx = 0; wSubRowIdx < WMITER; ++wSubRowIdx) {
    for (uint wSubColIdx = 0; wSubColIdx < WNITER; ++wSubColIdx) {
      // move C pointer to current warp subtile
      // Locate the C subtile that this warp-subtile iteration is responsible for
      // WSUBM / WSUBN are the row / column sizes of each warp subtile
      float *C_interim = C + (wSubRowIdx * WSUBM) * N + wSubColIdx * WSUBN;
      // Iterate over the TM row elements this thread owns
      for (uint resIdxM = 0; resIdxM < TM; resIdxM += 1) {
        // Process 4 elements per step (vectorized float4 load/store)
        for (uint resIdxN = 0; resIdxN < TN; resIdxN += 4) {
          // load C vector into registers
          float4 tmp = reinterpret_cast<float4 *>(
              &C_interim[(threadRowInWarp * TM + resIdxM) * N +
                          threadColInWarp * TN + resIdxN])[0];
          // perform GEMM update in reg
          const int i = (wSubRowIdx * TM + resIdxM) * (WNITER * TN) +
                        wSubColIdx * TN + resIdxN;
          tmp.x = alpha * threadResults[i + 0] + beta * tmp.x;
          tmp.y = alpha * threadResults[i + 1] + beta * tmp.y;
          tmp.z = alpha * threadResults[i + 2] + beta * tmp.z;
          tmp.w = alpha * threadResults[i + 3] + beta * tmp.w;
          // write back
          // Write the 4 updated elements back to global memory
          // Same address computation as the load, opposite direction (registers -> global memory)
          reinterpret_cast<float4 *>(
              &C_interim[(threadRowInWarp * TM + resIdxM) * N +
                          threadColInWarp * TN + resIdxN])[0] = tmp;
        }
      }
    }
  }
}

extern "C" void sgemm_tensorcore(fg::Tensor<float> &A, fg::Tensor<float> &B, fg::Tensor<float> &C){
  // Set device pointers and allocate device memory for A, B and C
  A.to_device_ptr();
  B.to_device_ptr();
  C.to_device_ptr();
  int m = A.layout_->get_shape()[0];
  int k = A.layout_->get_shape()[1];
  int n = B.layout_->get_shape()[1];
  int M = m, K = k, N = n;
  float alpha = 1, beta = 0;
  // Copy the A and B matrices to the device
  fg::checkCudaErrors(cudaMemcpy(A.get_device_ptr(), A.get_data(), A.get_element_num() * sizeof(float), cudaMemcpyHostToDevice));
  fg::checkCudaErrors(cudaMemcpy(B.get_device_ptr(), B.get_data(), B.get_element_num() * sizeof(float), cudaMemcpyHostToDevice));
  const uint K10_NUM_THREADS = 128;
  const uint K10_BN = 128;
  const uint K10_BM = 128;
  const uint K10_BK = 16;
  const uint K10_WN = 64;
  const uint K10_WM = 64;
  const uint K10_WNITER = 4;
  const uint K10_TN = 4;
  const uint K10_TM = 8;
  dim3 blockDim(K10_NUM_THREADS);
  dim3 gridDim(CEIL_DIV(N, K10_BN), CEIL_DIV(M, K10_BM));
  sgemm_tensorcore_kernel<K10_BM, K10_BN, K10_BK, K10_WM, K10_WN, K10_WNITER, K10_TM, K10_TN, K10_NUM_THREADS>
    <<<gridDim, blockDim >>> (M, N, K, alpha, A.get_device_ptr(), B.get_device_ptr(), beta, C.get_device_ptr());
  // Copy the result back into C
  fg::checkCudaErrors(cudaMemcpy(C.get_data(), C.get_device_ptr(), C.get_element_num() * sizeof(float), cudaMemcpyDeviceToHost));
}
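
As mentioned above, this is roughly how I compare the GPU output against a CPU reference. It is a simplified, hypothetical sketch of my test harness (the helper name check_against_cpu and the use of std::vector are not part of the code above); alpha = 1 and beta = 0 in my test, so the old contents of C drop out:

#include <cmath>
#include <cstdio>
#include <vector>

// Naive CPU reference: C_ref = A * B, then compare element-wise with the GPU result.
static void check_against_cpu(int M, int N, int K,
                              const std::vector<float> &A,
                              const std::vector<float> &B,
                              const std::vector<float> &C_gpu) {
  std::vector<float> C_ref(static_cast<size_t>(M) * N, 0.0f);
  for (int m = 0; m < M; ++m)
    for (int k = 0; k < K; ++k)
      for (int n = 0; n < N; ++n)
        C_ref[m * N + n] += A[m * K + k] * B[k * N + n];

  float max_err = 0.0f;
  for (int i = 0; i < M * N; ++i)
    max_err = std::fmax(max_err, std::fabs(C_ref[i] - C_gpu[i]));
  std::printf("max abs error vs CPU reference: %f\n", max_err);
}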

Can anyone tell me why? Thanks.