From bf120042d512a2ab35c48411c9fd01b4e4976536 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=98=AE=E8=B5=A2=E6=B4=8B?=
Date: Tue, 28 Oct 2025 15:21:54 +0800
Subject: [PATCH] change bmmOutMode datatype

Replace the bool flag isNBatchOut in MatmulConfig and MatmulBatchParams
with a BatchOutMode enum field bmmOutMode (SINGLE_BATCH, MULTI_BATCH,
DYNAMIC), and switch every isNBatchOut check to
bmmOutMode != BatchOutMode::SINGLE_BATCH. The IterateAll overloads
additionally assert that PartialOutput is not enabled.
---
 impl/matmul/batch_matmul_impl.h                 |  2 +-
 impl/matmul/kfc/matmul_server_aux.h             |  2 ++
 impl/matmul/matmul_impl.h                       |  4 ++++
 impl/matmul/scheduler/batch/batch_scheduler.h   | 10 +++++-----
 .../scheduler/batch/batch_scheduler_base.h      |  9 ++++++---
 .../stage/copy_cube_out/copy_cube_out_fixpipe.h |  4 ++--
 impl/matmul/utils/matmul_config_impl.h          |  2 +-
 lib/matmul/matmul_client.h                      |  7 +++++--
 lib/matmul/matmul_config.h                      | 10 ++++++++--
 lib/matmul/tiling.h                             | 16 ++++++++--------
 10 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/impl/matmul/batch_matmul_impl.h b/impl/matmul/batch_matmul_impl.h
index 3e3a114b..10e355c2 100644
--- a/impl/matmul/batch_matmul_impl.h
+++ b/impl/matmul/batch_matmul_impl.h
@@ -128,7 +128,7 @@ public:
     __aicore__ inline void SetNBatchOutNum(int32_t nBatchOutNumIn)
     {
         int32_t nBatchOutNum = 1;
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             nBatchOutNum = nBatchOutNumIn;
         }
         MATMUL_MODULE(BatchScheduler)->SetNBatchOutNum(nBatchOutNum);
diff --git a/impl/matmul/kfc/matmul_server_aux.h b/impl/matmul/kfc/matmul_server_aux.h
index 66bf5ac5..0199cdea 100644
--- a/impl/matmul/kfc/matmul_server_aux.h
+++ b/impl/matmul/kfc/matmul_server_aux.h
@@ -260,6 +260,7 @@ public:
     __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false,
         bool waitIterateAll = false, bool fakeMsg = false)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
         if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) {
 #if defined(__DAV_C310__)
             WaitAB();
@@ -282,6 +283,7 @@ public:
     __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0,
         bool enSequentialWrite = false, bool waitIterateAll = false)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
         if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) {
 #if (__CCE_AICORE__ == 220)
             ASSERT("IterateAll localTensor not support when enableMixDualMaster is enabled");
diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h
index 4a347846..d69dc7ba 100644
--- a/impl/matmul/matmul_impl.h
+++ b/impl/matmul/matmul_impl.h
@@ -54,6 +54,7 @@ public:
     __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false,
         bool waitIterateAll = false, bool fakeMsg = false)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
 #if __CCE_AICORE__ == 200
         GlobalTensor global;
         global.SetGlobalBuffer((__gm__ uint64_t*)0);
@@ -81,6 +82,7 @@ public:
     template
     __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
 #if __CCE_AICORE__ == 200
         GlobalTensor global;
         global.SetGlobalBuffer((__gm__ uint64_t*)0);
@@ -100,6 +102,7 @@ public:
     __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false,
         bool waitIterateAll = false, bool fakeMsg = false)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
         if constexpr(BASE_MODULE::POLICY::POLICY_TYPE == PolicyType::MATMUL_NBUFFER_33) {
             static_assert(DoMatmulMDL(MM_CFG), "NBuffer33MatmulPolicy only support MDL config.");
             MATMUL_MODULE(Scheduler)->Schedule(gm, enAtomic, enSequentialWrite);
@@ -119,6 +122,7 @@ public:
     template
     __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
         if constexpr(BASE_MODULE::POLICY::POLICY_TYPE == PolicyType::MATMUL_NBUFFER_33) {
             static_assert(DoMatmulMDL(MM_CFG), "NBuffer33MatmulPolicy only support MDL config.");
             MATMUL_MODULE(Scheduler)->Schedule(ubCmatrix, enAtomic);
diff --git a/impl/matmul/scheduler/batch/batch_scheduler.h b/impl/matmul/scheduler/batch/batch_scheduler.h
index 993b5c26..8bb1decf 100644
--- a/impl/matmul/scheduler/batch/batch_scheduler.h
+++ b/impl/matmul/scheduler/batch/batch_scheduler.h
@@ -70,7 +70,7 @@ public:
         auto batchOffsetInfo = PrepareOffset();
         auto ctx = BASE_MODULE::PrepareContext();
         const auto batchLoop = MATMUL_MODULE(BatchLoop);
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             batchLoop->SetBatchOutCacheNum(0);
             batchLoop->SetBatchOutOffsetNum(0);
         }
@@ -196,7 +196,7 @@ private:
                 (batchIndex % batchOffsetInfo.modB + batchIndex / batchOffsetInfo.divisorB);
             ctx.offsetBias = batchOffsetInfo.alignBias *
                 (batchIndex % batchOffsetInfo.modBias + batchIndex / batchOffsetInfo.divisorBias);
-            if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+            if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
                 bL->SetBatchOutCacheNum(bL->GetBatchOutCacheNum() + 1);
             }
         } else {
@@ -215,7 +215,7 @@ private:
             ctx.offsetBias = batchOffsetInfo.alignBias *
                 (biasIndex % batchOffsetInfo.modBias +
                  biasIndex / batchOffsetInfo.divisorBias);
-            if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+            if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
                 bL->SetBatchOutCacheNum(bL->GetBatchOutCacheNum() + 1);
             }
         }
@@ -399,7 +399,7 @@ private:
         uint8_t enAtomic, bool enSequentialWrite)
     {
         const auto batchLoop = MATMUL_MODULE(BatchLoop);
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             if (batchLoop->IsNeedNBatchOut()) {
                 BASE_MODULE::GetBatchResult(dst, ctx, enAtomic, enSequentialWrite);
             } else {
@@ -413,7 +413,7 @@ private:
 
     __aicore__ inline void EndIterate()
     {
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             const auto batchLoop = MATMUL_MODULE(BatchLoop);
             if (batchLoop->IsNeedNBatchOut()) {
                 batchLoop->SetBatchOutOffsetNum(batchLoop->GetBatchOutOffsetNum() + batchLoop->GetBatchOutCacheNum());
diff --git a/impl/matmul/scheduler/batch/batch_scheduler_base.h b/impl/matmul/scheduler/batch/batch_scheduler_base.h
index 3f39ac89..7653b1c1 100644
--- a/impl/matmul/scheduler/batch/batch_scheduler_base.h
+++ b/impl/matmul/scheduler/batch/batch_scheduler_base.h
@@ -93,7 +93,7 @@ public:
             lenFactor = DOUBLE_SIZE;
         }
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             MATMUL_MODULE(CubeOutBuffer)->Init(tiling.GetBaseM() * tiling.GetBaseN() * nBatchOutNum_, lenFactor);
         } else {
             MATMUL_MODULE(CubeOutBuffer)->Init(tiling.GetBaseM() * tiling.GetBaseN(), lenFactor);
         }
@@ -246,6 +246,9 @@ public:
         if (unlikely(isFirstIter_)) {
             return MoveOnFirstIterate();
         } else {
+            if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode == BatchOutMode::MULTI_BATCH) {
+                return false;
+            }
             if constexpr (ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::UNDEF) {
                 if (likely(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() == static_cast(IterateOrder::ORDER_M))) {
                     return MoveOnIterateOrderM();
@@ -345,7 +348,7 @@ public:
         bool cmatrixInitVal;
         UpdateMmadComputeParams(sL0CInit, cmatrixSource, cmatrixInitVal);
         int32_t offsetC = 0;
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             offsetC = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() *
                 MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() *
                 (MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum() - 1);
@@ -397,7 +400,7 @@ public:
             ASSERT(false && "Can not support other Layout");
         }
 
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             CopyOut(dst[MATMUL_MODULE(BatchLoop)->GetBatchOutOffsetNum() * stride], enAtomic, enSequentialWriteIn);
         } else {
             uint32_t iBatch = ctx.isReduceG ? (MATMUL_MODULE(BatchLoop)->GetBatchIndex() / ctx.reduceGNum) :
diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h b/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h
index 2dfec805..9e23fbf2 100644
--- a/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h
+++ b/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h
@@ -150,7 +150,7 @@ private:
         FixpipeAdaptor fixpipe(baseWidth, baseHeight, baseBlockWidth, baseBlockHeight,
             MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), stride);
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             fixpipe.SetNdParams(MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum(),
                 baseHeight, baseWidth, baseBlockWidth,
                 MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(),
                 MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN());
@@ -324,7 +324,7 @@ private:
                 baseHeight = CeilAlign(baseHeight, BLOCK_CUBE);
             }
         }
-        if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) {
+        if constexpr (ToMatmulConfig(MM_CFG).bmmOutMode != BatchOutMode::SINGLE_BATCH) {
             baseWidth *= MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum();
             baseBlockWidth *= MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum();
         }
diff --git a/impl/matmul/utils/matmul_config_impl.h b/impl/matmul/utils/matmul_config_impl.h
index 474d9f2f..7d2c748e 100644
--- a/impl/matmul/utils/matmul_config_impl.h
+++ b/impl/matmul/utils/matmul_config_impl.h
@@ -33,7 +33,7 @@ __aicore__ inline constexpr void GetMMConfigImpl(MatmulConfig& cfg, ArgType arg)
         cfg.isNBatch = arg.isNBatch;
         cfg.batchMode = arg.batchMode;
         cfg.isBiasBatch = arg.isBiasBatch;
-        cfg.isNBatchOut = arg.isNBatchOut;
+        cfg.bmmOutMode = arg.bmmOutMode;
     } else if constexpr (AscendC::IsSameType::value) {
         cfg.intrinsicsCheck = arg.intrinsicsCheck;
         cfg.enVecND2NZ = arg.enVecND2NZ;
diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h
index ce936845..43061fb2 100644
--- a/lib/matmul/matmul_client.h
+++ b/lib/matmul/matmul_client.h
@@ -800,8 +800,8 @@ public:
     __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false,
         bool waitIterateAll = false, bool fakeMsg = false)
     {
-        static_assert(!(ToMatmulConfig(MM_CFG).enableMixDualMaster && !(A_TYPE::ibShare && B_TYPE::ibShare)),
-            "IBShare in A/BTYPE should be true when enableMixDualMaster is enabled.");
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
+        static_assert(!(ToMatmulConfig(MM_CFG).enableMixDualMaster && !(A_TYPE::ibShare && B_TYPE::ibShare)), "IBShare in A/BTYPE should be true when enableMixDualMaster is enabled.");
         if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) {
 #if ASCENDC_CPU_DEBUG
             if ASCEND_IS_AIC {
@@ -856,6 +856,7 @@ public:
     __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0,
         bool enSequentialWrite = false, bool waitIterateAll = false)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
         static_assert(!(ToMatmulConfig(MM_CFG).enableMixDualMaster && !(A_TYPE::ibShare && B_TYPE::ibShare)),
             "IBShare in A/BTYPE should be true when enableMixDualMaster is enabled.");
         TRACE_START(TraceId::KFC_CLIENT_POST_MSG);
@@ -901,6 +902,7 @@ public:
     __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false,
         bool waitIterateAll = false, bool fakeMsg = false)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
         if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) {
             constexpr uint16_t eventID = 9U;
 #if ASCENDC_CPU_DEBUG
@@ -946,6 +948,7 @@ public:
     template
     __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0)
     {
+        ASCENDC_ASSERT((!ToMatmulConfig(MM_CFG).isPartialOutput), { KERNEL_LOG(KERNEL_ERROR, "IterateAll is not supported for PartialOutput."); });
         if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster){
 #if (__CCE_AICORE__ == 220)
             ASSERT("IterateAll localTensor not support when enableMixDualMaster is enabled");
diff --git a/lib/matmul/matmul_config.h b/lib/matmul/matmul_config.h
index ab7711fd..466113ad 100644
--- a/lib/matmul/matmul_config.h
+++ b/lib/matmul/matmul_config.h
@@ -55,6 +55,12 @@ enum class BatchMode {
     SINGLE_LARGE_THAN_L1
 };
 
+enum class BatchOutMode {
+    SINGLE_BATCH = 0,
+    MULTI_BATCH,
+    DYNAMIC,
+};
+
 enum class IterateOrder {
     ORDER_M = 0,
     ORDER_N,
@@ -140,7 +146,7 @@ struct MatmulConfig {
     bool enableKdimReorderLoad = false;
     bool isCO1Shared = false;
     uint32_t sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE;
-    bool isNBatchOut = false;
+    BatchOutMode bmmOutMode = BatchOutMode::SINGLE_BATCH;
 };
 
 enum class MatmulConfigMode {
@@ -168,7 +174,7 @@ struct MatmulBatchParams {
     bool isNBatch;
     BatchMode batchMode;
     bool isBiasBatch = true;
-    bool isNBatchOut = false;
+    BatchOutMode bmmOutMode = BatchOutMode::SINGLE_BATCH;
 };
 
 struct MatmulFuncParams {
diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h
index 1629f4d7..e9c62e51 100644
--- a/lib/matmul/tiling.h
+++ b/lib/matmul/tiling.h
@@ -29,14 +29,14 @@
  * @param [in] scheduleType: set matmul data transfer mode
  * @param [in] enUnitFlag: whether to enable UnitFlag
  * @param [in] enableMixDualMaster: whether to enable MixDualMaster
- * @param [in] isNBatchOut: set multi-batch output mode
+ * @param [in] bmmOutMode: set batch matmul output mode (SINGLE_BATCH, MULTI_BATCH or DYNAMIC)
  * @return MatmulConfig with normal setting
 */
 __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false,
     const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1,
     const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF,
     const ScheduleType scheduleType = ScheduleType::INNER_PRODUCT, const bool enUnitFlag = true,
-    const bool enableMixDualMaster = false, const bool isNBatchOut = false)
+    const bool enableMixDualMaster = false, const BatchOutMode bmmOutMode = BatchOutMode::SINGLE_BATCH)
 {
     return {
         .doNorm = true,
@@ -88,7 +88,7 @@ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = f
         .enableKdimReorderLoad = false,
         .isCO1Shared = false,
         .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE,
-        .isNBatchOut = isNBatchOut
+        .bmmOutMode = bmmOutMode
     };
 }
 
@@ -164,7 +164,7 @@ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = fals
         .enableKdimReorderLoad = enableKdimReorderLoad,
         .isCO1Shared = false,
         .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE,
-        .isNBatchOut = false
+        .bmmOutMode = BatchOutMode::SINGLE_BATCH
     };
 }
 
@@ -234,7 +234,7 @@ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit
         .enableKdimReorderLoad = false,
         .isCO1Shared = false,
         .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE,
-        .isNBatchOut = false
+        .bmmOutMode = BatchOutMode::SINGLE_BATCH
     };
 }
 
@@ -304,7 +304,7 @@ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const ui
         .enableKdimReorderLoad = false,
         .isCO1Shared = false,
         .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE,
-        .isNBatchOut = false
+        .bmmOutMode = BatchOutMode::SINGLE_BATCH
     };
 }
 
@@ -380,7 +380,7 @@ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, c
         .enableKdimReorderLoad = false,
         .isCO1Shared = false,
         .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE,
-        .isNBatchOut = false
+        .bmmOutMode = BatchOutMode::SINGLE_BATCH
     };
 }
 
@@ -450,7 +450,7 @@ __aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimi
         .enableKdimReorderLoad = false,
         .isCO1Shared = false,
         .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE,
-        .isNBatchOut = false
+        .bmmOutMode = BatchOutMode::SINGLE_BATCH
     };
 }
--
Gitee
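
A minimal usage sketch of the new field (illustrative only, not part of the
patch): the surrounding matmul type setup is assumed and omitted, and only
GetNormalConfig and BatchOutMode below come from this change. Any value other
than BatchOutMode::SINGLE_BATCH selects the behaviour previously enabled by
isNBatchOut = true.

    // Arguments follow GetNormalConfig's parameter order from lib/matmul/tiling.h;
    // the last argument picks the batch matmul output mode. MM_CFG is just a
    // local name chosen for this sketch.
    constexpr MatmulConfig MM_CFG = GetNormalConfig(
        false, false, false, BatchMode::BATCH_LESS_THAN_L1, true,
        IterateOrder::UNDEF, ScheduleType::INNER_PRODUCT, true,
        false, BatchOutMode::MULTI_BATCH);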