From 2b810bbbc452a0eabe6d1f769805752e4ba490b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B1=9F=E4=BF=8A=E6=88=90?=
Date: Mon, 16 Dec 2024 17:05:20 +0800
Subject: [PATCH] partial output support

---
 impl/matmul/matmul_impl.h                    | 133 +++++++++++++-----
 impl/matmul/matmul_macro_v220_impl.h         |  38 +++--
 .../copy_cube_in/batch/batch_copy_cube_in.h  |   2 +-
 lib/matmul/matmul_config.h                   |   2 +
 4 files changed, 123 insertions(+), 52 deletions(-)

diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h
index cf631ac8..efb5c105 100644
--- a/impl/matmul/matmul_impl.h
+++ b/impl/matmul/matmul_impl.h
@@ -3608,56 +3608,107 @@ __aicore__ inline void MatmulImplBase
 #if __CCE_AICORE__ >= 220
     LocalTensor bias;
-    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
-        if (unlikely(k == 0 && var.enableBias_)) {
-            if constexpr (A_TYPE::layout == LayoutMode::NONE ||
-                ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
-                bias = var.qidBias_.template DeQue();
+    if constexpr (ToMatmulConfig(MM_CFG).isPartialOutput) {
+        using C_T = typename C_TYPE::T;
+        GlobalTensor cacheWorkspace;
+        cacheWorkspace.SetGlobalBuffer(reinterpret_cast<__gm__ C_T*>(var.cacheWorkspaceAddr));
+        uint64_t kLoop = MatmulInstr::sMadK_ / MatmulInstr::sMad0K_;
+        uint64_t kTail = MatmulInstr::sMadK_ - kLoop * MatmulInstr::sMad0K_;
+        uint64_t l0cOffsetSize = var.baseUseM_ * var.baseUseN_;
+        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
+            if (unlikely(k == 0 && var.enableBias_)) {
+                MatmulInstr::biasType_ = IsSameType::value ? 2 : 1; // 2:f32, 1:f16
+                MatmulInstr::sL1BiasOffset_ = 0;
+                if constexpr (A_TYPE::layout == LayoutMode::NONE ||
+                    ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
+                    bias = var.qidBias_.template DeQue();
+                } else {
+                    bias.SetAddr(var.inputBias_);
+                    bias = bias[var.curN_ * tilingBaseN];
+                }
             } else {
-                bias.SetAddr(var.inputBias_.address_);
-                bias = bias[var.curN_ * tilingBaseN];
+                MatmulInstr::biasType_ = 0;
             }
-            MatmulInstr::biasType_ = IsSameType::value ? 2 : 1; // 2:f32, 1:f16
-            MatmulInstr::sL1BiasOffset_ = 0;
-            if constexpr (IsL0Cache()) {
-                uint32_t singlePosA = var.curM_ * var.kIter_;
-                uint32_t singlePosB = var.curN_ * var.kIter_;
-                MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias,
-                    0, 0, 0, 0, singlePosA, singlePosB);
-            } else {
-                MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
+        }
+        for (uint64_t k_inner = 0; k_inner < kLoop; k_inner++) {
+            MatmulInstr::template Compute(a1, b1,
+                MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, 0, 0, 0, 0, 0, 0, k_inner, kLoop);
+            MATMUL_MODULE(CopyCubeOut)
+                ->template Copy(cacheWorkspace, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), var.curM_,
+                    var.curN_, var.baseUseM_, var.baseUseN_, var.blockUseM_, var.blockUseN_, true);
+            cacheWorkspace = cacheWorkspace[l0cOffsetSize];
+        }
+        if (kTail != 0) {
+            MatmulInstr::template Compute(a1, b1,
+                MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, 0, 0, 0, 0, 0, 0, 0, kLoop);
+            MATMUL_MODULE(CopyCubeOut)
+                ->template Copy(cacheWorkspace, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), var.curM_,
+                    var.curN_, var.baseUseM_, var.baseUseN_, var.blockUseM_, var.blockUseN_, true);
+            cacheWorkspace = cacheWorkspace[l0cOffsetSize];
+        }
+        var.cacheWorkspaceAddr = (__gm__ uint8_t*)(cacheWorkspace.GetPhyAddr());
+        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
+            if (unlikely(k == 0 && var.enableBias_)) {
+                if constexpr (A_TYPE::layout == LayoutMode::NONE ||
+                    ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
+                    var.qidBias_.FreeTensor(bias);
+                }
             }
-            if constexpr (A_TYPE::layout == LayoutMode::NONE ||
-                ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
-                var.qidBias_.FreeTensor(bias);
+        }
+    } else {
+        if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
+            if (unlikely(k == 0 && var.enableBias_)) {
+                if constexpr (A_TYPE::layout == LayoutMode::NONE ||
+                    ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
+                    bias = var.qidBias_.template DeQue();
+                } else {
+                    bias.SetAddr(var.inputBias_.address_);
+                    bias = bias[var.curN_ * tilingBaseN];
+                }
+                MatmulInstr::biasType_ = IsSameType::value ? 2 : 1; // 2:f32, 1:f16
+                MatmulInstr::sL1BiasOffset_ = 0;
+                if constexpr (IsL0Cache()) {
+                    uint32_t singlePosA = var.curM_ * var.kIter_;
+                    uint32_t singlePosB = var.curN_ * var.kIter_;
+                    MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias,
+                        0, 0, 0, 0, singlePosA, singlePosB);
+                } else {
+                    MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
+                }
+                if constexpr (A_TYPE::layout == LayoutMode::NONE ||
+                    ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) {
+                    var.qidBias_.FreeTensor(bias);
+                }
+            } else {
+                MatmulInstr::biasType_ = 0;
+                if constexpr (IsL0Cache()) {
+                    uint32_t singlePosA = var.curM_ * var.kIter_ + k * var.tiling_.GetStepKa();
+                    uint32_t singlePosB = var.curN_ * var.kIter_+ k * var.tiling_.GetStepKb();
+                    MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias,
+                        0, 0, 0, 0, singlePosA, singlePosB);
+                } else {
+                    MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
+                }
             }
         } else {
-            MatmulInstr::biasType_ = 0;
             if constexpr (IsL0Cache()) {
                 uint32_t singlePosA = var.curM_ * var.kIter_ + k * var.tiling_.GetStepKa();
                 uint32_t singlePosB = var.curN_ * var.kIter_+ k * var.tiling_.GetStepKb();
-                MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias,
-                    0, 0, 0, 0, singlePosA, singlePosB);
+                MatmulInstr::template Compute(a1, b1,
+                    MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, 0, 0, singlePosA, singlePosB);
             } else {
-                MatmulInstr::template Compute(a1, b1, MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
+                MatmulInstr::template Compute(a1, b1,
+                    MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
             }
         }
-    } else {
-        if constexpr (IsL0Cache()) {
-            uint32_t singlePosA = var.curM_ * var.kIter_ + k * var.tiling_.GetStepKa();
-            uint32_t singlePosB = var.curN_ * var.kIter_+ k * var.tiling_.GetStepKb();
-            MatmulInstr::template Compute(a1, b1,
-                MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias, 0, 0, 0, 0, singlePosA, singlePosB);
-        } else {
-            MatmulInstr::template Compute(a1, b1,
-                MATMUL_MODULE(CubeOutBuffer)->GetTensor(), bias);
-        }
     }
 #elif __CCE_AICORE__ == 200
     if (var.enableBias_) {
@@ -3695,6 +3746,10 @@ __aicore__ inline void MatmulImplBase
 Reset();
         }
     }
+    if constexpr (ToMatmulConfig(MM_CFG).isPartialOutput) {
+        auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor();
+        MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local);
+    }
 }
diff --git a/impl/matmul/matmul_macro_v220_impl.h b/impl/matmul/matmul_macro_v220_impl.h
     template
+        bool isNormOuter = false, bool isPartialOutput = false>
     inline __aicore__ void Compute(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix,
-        const LocalTensor &cMatrix, const LocalTensor &bias,
-        int64_t offsetb = 0, uint8_t subIdx = 0, uint16_t sMadMStep = 0, uint16_t sMadNStep = 0,
-        uint32_t posA = 0, uint32_t posB = 0, uint16_t sBaseM = 0, uint16_t sBaseN = 0);
+        const LocalTensor &cMatrix, const LocalTensor &bias, int64_t offsetb = 0, uint8_t subIdx = 0,
+        uint16_t sMadMStep = 0, uint16_t sMadNStep = 0, uint32_t posA = 0, uint32_t posB = 0, uint16_t sBaseM = 0,
+        uint16_t sBaseN = 0, uint32_t kPartial = 0, uint32_t kPartialLoop = 0);
     template
     inline __aicore__ void ComputeWithMdb(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix,
         const LocalTensor &cMatrix, const LocalTensor &bias, uint64_t kC0Tail, uint64_t kTail,
@@ -552,21 +552,26 @@ inline __aicore__ void MacroMatmul
-template
+template
 inline __aicore__ void MacroMatmul>::Compute(
     const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix,
     const LocalTensor &cMatrix, const LocalTensor &bias, int64_t offsetb, uint8_t subIdx, uint16_t sMadMStep, uint16_t sMadNStep,
-    uint32_t posA, uint32_t posB, uint16_t sBaseM, uint16_t sBaseN)
+    uint32_t posA, uint32_t posB, uint16_t sBaseM, uint16_t sBaseN, uint32_t kPartial, uint32_t kPartialLoop)
 {
     uint16_t madKC0 = CeilDiv(sMadK_, GetHwK0());
     uint32_t nFraC0 = CeilDiv(sMadN_, HW_N0);
    uint64_t kC0 = sMad0K_ / GetHwK0();
     uint64_t kLoop;
-    if constexpr (noTail) {
-        kLoop = 1;
+    if constexpr (isPartialOutput) {
+        kLoop = kPartialLoop;
     } else {
-        kLoop = sMadK_ / sMad0K_; // loop times of sMad0K_
+        if constexpr (noTail) {
+            kLoop = 1;
+        } else {
+            kLoop = sMadK_ / sMad0K_; // loop times of sMad0K_
+        }
     }
     uint64_t kC0Tail = madKC0 - kLoop * kC0; // tail block loop times, unit is 16
     uint64_t kTail;
@@ -650,12 +655,18 @@ inline __aicore__ void MacroMatmul
 (ssAl0PingPongFlag_ & 0x1);
+        uint16_t kIdx;
+        if constexpr (isPartialOutput) {
+            kIdx = kPartial;
+        } else {
+            kIdx = k_inner;
+        }
         // load L0A
-        uint64_t aPoskPtr = k_inner * kC0 * GetHwK0() + sAL1KOffset_;
-        LoadL12L0A(k_inner, aPoskPtr, sMad0K_, l1AMatrix, l0a);
+        uint64_t aPoskPtr = kIdx * kC0 * GetHwK0() + sAL1KOffset_;
+        LoadL12L0A(kIdx, aPoskPtr, sMad0K_, l1AMatrix, l0a);
         // load L0B
         if constexpr(!intraBlockPartSum) {
-            LoadL12L0B(k_inner, sMad0K_, l1BMatrix, l0b);
+            LoadL12L0B(kIdx, sMad0K_, l1BMatrix, l0b);
         } else {
             l0b = l0b[offsetb];
         }
@@ -691,6 +702,9 @@ inline __aicore__ void MacroMatmul
 ::IsNeedUB()) && GetCopyCubeInType() == CopyCubeInType::BMM &&
     INPUT_TYPE::format == CubeFormat::NZ &&
-    INPUT_TYPE::layout == LayoutMode::NORMAL>>
+    ((INPUT_TYPE::layout == LayoutMode::NORMAL) || (INPUT_TYPE::layout == LayoutMode::BNGS1S2))>>
 {
     MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG);
     MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG);
diff --git a/lib/matmul/matmul_config.h b/lib/matmul/matmul_config.h
index 69e38a98..a1403f1e 100644
--- a/lib/matmul/matmul_config.h
+++ b/lib/matmul/matmul_config.h
@@ -128,6 +128,7 @@ struct MatmulConfig {
     bool enableDoubleCache;
     bool isBiasBatch = true;
     bool enableStaticPadZeros = false;
+    bool isPartialOutput = false;
 };

 enum class MatmulConfigMode {
@@ -167,6 +168,7 @@ struct MatmulFuncParams {
     ScheduleType scheduleType;
     bool enableReuse = true;
     bool enableUBReuse;
+    bool isPartialOutput = false;
 };

 struct MatrixOffset {
--
Gitee
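Usage note (not part of the patch itself): a minimal sketch of how the new isPartialOutput
switch could be turned on from kernel code. It assumes the existing GetNormalConfig() helper
in lib/matmul/matmul_config.h as the starting point; the helper name GetPartialOutputCfg and
the constant CFG_PARTIAL_OUTPUT below are hypothetical names used only for illustration.

    // Hypothetical sketch: start from a default MatmulConfig and opt in to the new
    // partial-output path, so each k-step L0C result is copied out to the cache
    // workspace instead of being accumulated into a single output block.
    constexpr MatmulConfig GetPartialOutputCfg()
    {
        MatmulConfig cfg = GetNormalConfig(); // assumed existing config helper
        cfg.isPartialOutput = true;           // field added by this patch
        return cfg;
    }
    constexpr MatmulConfig CFG_PARTIAL_OUTPUT = GetPartialOutputCfg();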