From d7934dc435d2f235f927b27270d54df006c23210 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:27:34 +0000 Subject: [PATCH 01/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/batch_matmul_impl.h | 15 +++- impl/matmul/feature_trait/matmul_chip_cap.h | 26 +++--- impl/matmul/matmul_impl.h | 14 +-- impl/matmul/matmul_impl_base.h | 96 +++++++++++++++++++-- 4 files changed, 126 insertions(+), 25 deletions(-) diff --git a/impl/matmul/batch_matmul_impl.h b/impl/matmul/batch_matmul_impl.h index adb85b1b..3e3a114b 100644 --- a/impl/matmul/batch_matmul_impl.h +++ b/impl/matmul/batch_matmul_impl.h @@ -40,6 +40,7 @@ private: public: MATMUL_ALLOW_USING(CopyCubeInA); MATMUL_ALLOW_USING(CopyCubeInB); + MATMUL_ALLOW_USING(CopyCubeOut); MATMUL_ALLOW_USING(Scheduler); MATMUL_ALLOW_USING(BatchScheduler); MATMUL_ALLOW_USING_PRIVATE(BatchCopyCubeInParamsA); @@ -65,10 +66,10 @@ private: MATMUL_USE_MODULE(BatchCopyCubeInB); MATMUL_USE_MODULE(BatchLoop); - using ChosenCopyCubeInA = typename AscendC::Conditional() != Impl::Detail::CopyCubeInType::BMM, + using ChosenCopyCubeInA = typename AscendC::Conditional, MM_CFG>() != Impl::Detail::CopyCubeInType::BMM, CopyCubeInA, BatchCopyCubeInA>::type; - using ChosenCopyCubeInB = typename AscendC::Conditional() != Impl::Detail::CopyCubeInType::BMM, + using ChosenCopyCubeInB = typename AscendC::Conditional, MM_CFG>() != Impl::Detail::CopyCubeInType::BMM, CopyCubeInB, BatchCopyCubeInB>::type; MATMUL_USE_MODULE(ChosenCopyCubeInA); MATMUL_USE_MODULE(ChosenCopyCubeInB); @@ -124,6 +125,16 @@ public: MATMUL_MODULE(BatchLoop)->SetBatchNum(batchA, batchB); } + __aicore__ inline void SetNBatchOutNum(int32_t nBatchOutNumIn) + { + int32_t nBatchOutNum = 1; + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + nBatchOutNum = nBatchOutNumIn; + } + MATMUL_MODULE(BatchScheduler)->SetNBatchOutNum(nBatchOutNum); + MATMUL_MODULE(BatchLoop)->SetNBatchOutNum(nBatchOutNum); + } + __aicore__ inline void IterateBatch(const GlobalTensor& gm, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0) diff --git a/impl/matmul/feature_trait/matmul_chip_cap.h b/impl/matmul/feature_trait/matmul_chip_cap.h index 64b9ab93..c91c77a5 100644 --- a/impl/matmul/feature_trait/matmul_chip_cap.h +++ b/impl/matmul/feature_trait/matmul_chip_cap.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
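[Annotation] The new SetNBatchOutNum shown above in batch_matmul_impl.h follows the usual AscendC pattern of gating a runtime setter behind a compile-time configuration flag, so the branch disappears entirely when the feature is compiled out and a constant 1 is forwarded to BatchScheduler and BatchLoop. The standalone C++17 sketch below illustrates only that gating idea; BatchOutConfig, IsNBatchOut and nBatchOutNum here are hypothetical names, not part of the AscendC API.

    #include <cstdint>

    // Hypothetical sketch: when the compile-time flag is off, the setter keeps the
    // default of 1 and the runtime argument is ignored; when it is on, the caller's
    // value is forwarded, mirroring SetNBatchOutNum in batch_matmul_impl.h.
    template <bool IsNBatchOut>
    struct BatchOutConfig {
        int32_t nBatchOutNum = 1;

        void SetNBatchOutNum(int32_t nBatchOutNumIn)
        {
            if constexpr (IsNBatchOut) {
                nBatchOutNum = nBatchOutNumIn;
            }
        }
    };

With IsNBatchOut == false every call collapses to a no-op at compile time, which is why the real code can unconditionally push the value down to its sub-modules.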
@@ -32,8 +32,8 @@ class MatmulChipCap public: struct Feature { bool supportUnitFlag; - bool ifNeedUB; - bool ifSupportUBToL1; + bool ifNeedUB; // if some func needs vec, such as nd2nz + bool ifSupportUBToL1; // if cube support ub to l1 bool supportMNL0DB; FixpipeParamsType fixpipeParamsType; bool ifSupportLoad3dV2; @@ -42,6 +42,9 @@ public: bool ifSupportCmatrixInitVal; bool ifSupportFmatrixB; bool ifSupportUserDefine; + bool ifSupportUBToL1Singleshape; // if support ub to l1 fullload, from vec + bool ifMmadInstrSupportAntiQuant; + bool ifSupportL0CToUB; bool ifSupportTrianMatmul; }; @@ -70,7 +73,7 @@ private: return CHIP_TYPE_220; #elif __CCE_AICORE__ == 300 return CHIP_TYPE_300; - #elif __CCE_AICORE__ == 310 + #elif defined(__DAV_C310__) return CHIP_TYPE_310; #else return CHIP_TYPE_MAX; @@ -79,14 +82,15 @@ private: private: constexpr static Feature features[CHIP_TYPE_MAX] = { - /* supportUnitFlag, ifNeedUB, ifSupportUBToL1, supportMNL0DB, fixpipeParamsType, ifSupportLoad3dV2, ifSupportLoad2dTranspose, ifSupportLoad2dV2, ifOnlyUseIsBiasForMmad, ifSupportFmatrixB, ifSupportUserDefine, ifSupportTrianMatmul*/ - /*100*/ {false, true, true, false, FixpipeParamsType::NONE, false, false, false, true, false, false, false}, - /*200*/ {false, true, true, false, FixpipeParamsType::NONE, true, false, false, false, false, false, false}, - /*220*/ {true, false, false, true, FixpipeParamsType::V220, true, true, false, false, true, true, true}, - /*300*/ {true, false, true, false, FixpipeParamsType::V220, true, true, false, false, true, false, false}, - /*310*/ {true, false, false, false, FixpipeParamsType::V310, true, true, true, false, true, false, false}}; + /*supportUnitFlag, ifNeedUB, ifSupportUBToL1, supportMNL0DB, fixpipeParamsType, + ifSupportLoad3dV2, ifSupportLoad2dTranspose, ifSupportLoad2dV2, + ifOnlyUseIsBiasForMmad, ifSupportFmatrixB, ifSupportUserDefine, ifSupportUBToL1Singleshape, ifMmadInstrSupportAntiQuant, ifSupportL0CToUB, ifSupportTrianMatmul*/ + /*100*/ {false, true, true, false, FixpipeParamsType::NONE, false, false, false, true, false, false, false, false, false, false}, + /*200*/ {false, true, true, false, FixpipeParamsType::NONE, true, false, false, false, false, false, false, false, false, false}, + /*220*/ {true, false, false, true, FixpipeParamsType::V220, true, true, false, false, true, true, false, false, false, true}, + /*300*/ {true, false, true, false, FixpipeParamsType::V220, true, true, false, false, true, false, false, false, false, false}, + /*310*/ {true, false, false, true, FixpipeParamsType::V310, true, false, true, false, true, true, true, false, true, false}}; }; - } // namespace Detail } // namespace Impl } // namespace AscendC diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index b635bdc3..4a347846 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -22,7 +22,7 @@ namespace AscendC { // Match Policy with CallBack paramter template -class MatmulImpl> +class MatmulImpl>> : public MatmulImplBase { private: @@ -123,12 +123,16 @@ public: static_assert(DoMatmulMDL(MM_CFG), "NBuffer33MatmulPolicy only support MDL config."); MATMUL_MODULE(Scheduler)->Schedule(ubCmatrix, enAtomic); } else { - int64_t dstOffset = 0; - while (BASE_MODULE::Iterate(false, ubCmatrix[dstOffset])) { - if constexpr (PhyPosIsL0C(C_TYPE::pos)) { + if constexpr (PhyPosIsL0C(C_TYPE::pos)) { + int64_t dstOffset = 0; + while (BASE_MODULE::Iterate(false, ubCmatrix[dstOffset])) { dstOffset += MATMUL_MODULE(Scheduler)->GetL0cOffset(); + 
BASE_MODULE::GetTensorC(ubCmatrix, enAtomic); + } + } else { + while (BASE_MODULE::Iterate(false)) { + BASE_MODULE::GetTensorC(ubCmatrix, enAtomic); } - BASE_MODULE::GetTensorC(ubCmatrix, enAtomic); } } } diff --git a/impl/matmul/matmul_impl_base.h b/impl/matmul/matmul_impl_base.h index 0fc84e70..1962a375 100644 --- a/impl/matmul/matmul_impl_base.h +++ b/impl/matmul/matmul_impl_base.h @@ -47,6 +47,7 @@ class MatmulImplBase , MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulSubBlockInfo) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulCrossCoreSync) , MATMUL_IMPORT_MODULE_PRIVATE(LocalWorkspace) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulUserDefineInfo) , MATMUL_IMPORT_MODULE_PRIVATE(LoadToA2) @@ -71,7 +72,7 @@ public: using CType = C_TYPE; using BiasType = BIAS_TYPE; private: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -88,7 +89,8 @@ public: __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false); __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false); __aicore__ inline void SetBias(const GlobalTensor& biasGlobal); - __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr); + template + __aicore__ inline void SetSelfDefineData(const T dataPtr); __aicore__ inline void SetSparseIndex(const GlobalTensor& indexGlobal); __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr); __aicore__ inline void SetQuantScalar(const uint64_t quantScalar); @@ -163,6 +165,7 @@ public: MATMUL_ALLOW_USING_PRIVATE(MatmulSubBlockInfo); MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfo); + MATMUL_ALLOW_USING_PRIVATE(MatmulCrossCoreSync); MATMUL_ALLOW_USING_PRIVATE(LocalWorkspace); MATMUL_ALLOW_USING_PRIVATE(MatmulUserDefineInfo); MATMUL_ALLOW_USING_PRIVATE(CopyBiasIn); @@ -192,15 +195,18 @@ public: using DataCopyUtils = typename AscendC::Conditional::type; template - using DataCopyWrapper = typename AscendC::Conditional::type; + using DataCopyWrapper = + typename AscendC::Conditional::type; using CallBack = MM_CB; private: - template friend struct DfxProxy; + template friend struct DfxProxy; using IMPL = MatmulImplBase; MATMUL_USE_MODULE(CopyCubeInA); MATMUL_USE_MODULE(CopyCubeInB); + MATMUL_USE_MODULE(CubeInBufferA); + MATMUL_USE_MODULE(CubeInBufferB); MATMUL_USE_MODULE(LocalWorkspace); MATMUL_USE_MODULE(PartialOutUtils); @@ -208,6 +214,7 @@ private: MATMUL_USE_MODULE(CopyCubeOut); MATMUL_USE_MODULE(Scheduler); MATMUL_USE_MODULE(BiasScheduler); + MATMUL_USE_MODULE(C1Buffer); MATMUL_USE_MODULE(MLoop); MATMUL_USE_MODULE(NLoop); MATMUL_USE_MODULE(KLoop); @@ -217,6 +224,7 @@ private: MATMUL_USE_MODULE(MatmulShapeInfo); MATMUL_USE_MODULE(MatmulAntiQuantProcessor); MATMUL_USE_MODULE(MatmulSubBlockInfo); + MATMUL_USE_MODULE(MatmulCrossCoreSync); MATMUL_USE_MODULE(MatmulUserDefineInfo); private: @@ -225,7 +233,26 @@ private: friend __aicore__ inline void SetTPipe( MatmulImpl &mm, TPipe* tpipe); + template + friend __aicore__ inline void KfcSetIntraAId( + MatmulImpl &mm, uint8_t intraId); + + template + friend __aicore__ inline void KfcSetIntraBId( + MatmulImpl &mm, uint8_t intraId); + + template + friend __aicore__ inline MatrixL1Addr KfcGetMatrixL1Addr( + MatmulImpl &mm); + + __aicore__ inline void SetIntraAId(uint8_t intraId); + __aicore__ inline void SetIntraBId(uint8_t intraId); + protected: + 
__aicore__ inline MatrixL1Addr GetMatrixL1Addr(); typename Impl::Detail::MatmulParams::PARAMS var; using POLICY = MATMUL_POLICY; }; @@ -238,10 +265,34 @@ __aicore__ inline void SetTPipe(MatmulImpl +__aicore__ inline void KfcSetIntraAId(MatmulImpl &mm, uint8_t intraId) +{ + mm.SetIntraAId(intraId); +} + +template +__aicore__ inline void KfcSetIntraBId(MatmulImpl &mm, uint8_t intraId) +{ + mm.SetIntraBId(intraId); +} + +template +__aicore__ inline MatrixL1Addr KfcGetMatrixL1Addr(MatmulImpl &mm) +{ + struct MatrixL1Addr matrixL1Addr; + matrixL1Addr = mm.GetMatrixL1Addr(); + return matrixL1Addr; +} + template +template __aicore__ inline void MatmulImplBase::SetSelfDefineData( - const uint64_t dataPtr) + const T dataPtr) { MATMUL_MODULE(MatmulUserDefineInfo)->SetSelfDefineData(dataPtr); } @@ -281,6 +332,38 @@ __aicore__ inline void MatmulImplBase +__aicore__ inline void MatmulImplBase::SetIntraAId(uint8_t intraId) +{ + MATMUL_MODULE(MatmulCrossCoreSync)->SetIntraAId(intraId); +} + +template +__aicore__ inline void MatmulImplBase::SetIntraBId(uint8_t intraId) +{ + MATMUL_MODULE(MatmulCrossCoreSync)->SetIntraBId(intraId); +} + +template +__aicore__ inline MatrixL1Addr MatmulImplBase::GetMatrixL1Addr() +{ + struct MatrixL1Addr matrixL1Addr; + if constexpr (PhyPosIsUB(A_TYPE::pos)) { + matrixL1Addr.l1aAddr = MATMUL_MODULE(CubeInBufferA)->GetBufferHeadAddr(); + } + if constexpr (PhyPosIsUB(B_TYPE::pos)) { + matrixL1Addr.l1bAddr = MATMUL_MODULE(CubeInBufferB)->GetBufferHeadAddr(); + } + + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias()) { + matrixL1Addr.l1biasAddr = MATMUL_MODULE(C1Buffer)->GetBufferHeadAddr(); + } + return matrixL1Addr; +} + template __aicore__ inline uint8_t MatmulImplBase::GetSubBlockIdx() @@ -568,7 +651,6 @@ __aicore__ inline bool MatmulImplBaseScheduleOnce(enPartialSum); } - template template @@ -583,4 +665,4 @@ __aicore__ inline bool MatmulImplBaseScheduleOnce(enPartialSum); } } // namespace AscendC -#endif // _MATMUL_IMPL_BASE_H_ +#endif // _MATMUL_IMPL_BASE_H_ \ No newline at end of file -- Gitee From 1469cbc18b6d87697e2ac67f0c7b115c65a11a55 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:29:48 +0000 Subject: [PATCH 02/56] add Signed-off-by: jiangchengcheng-on --- .../feature_trait/matmul_feature_trait.h | 20 +- impl/matmul/kfc/matmul_server_impl_c310.h | 499 ++++++++++++++++++ 2 files changed, 517 insertions(+), 2 deletions(-) create mode 100644 impl/matmul/kfc/matmul_server_impl_c310.h diff --git a/impl/matmul/feature_trait/matmul_feature_trait.h b/impl/matmul/feature_trait/matmul_feature_trait.h index 39e0a117..762d912a 100644 --- a/impl/matmul/feature_trait/matmul_feature_trait.h +++ b/impl/matmul/feature_trait/matmul_feature_trait.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
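[Annotation] The feature-trait hunk that follows adds constexpr capability queries (IsSupportUBToL1Singleshape, IsMmadInstrSupportAntiQuant, IsSupportL0CToUB) that simply forward to the per-chip feature table extended in matmul_chip_cap.h above. The self-contained sketch below reproduces that trait-table idea with invented names (ChipSketch, FeatureSketch, kFeatures); it is illustrative only and not the real AscendC trait classes.

    #include <cstdint>

    // Hypothetical standalone sketch of the chip-capability trait pattern: a constexpr
    // per-chip feature table plus thin constexpr accessors, mirroring how
    // MatmulFeatureTrait queries MatmulChipCap's feature entries.
    enum class ChipSketch : uint8_t { V100 = 0, V220, V310, MAX };

    struct FeatureSketch {
        bool supportUnitFlag;
        bool ifSupportUBToL1Singleshape;
        bool ifSupportL0CToUB;
    };

    constexpr FeatureSketch kFeatures[static_cast<uint8_t>(ChipSketch::MAX)] = {
        /*V100*/ {false, false, false},
        /*V220*/ {true,  false, false},
        /*V310*/ {true,  true,  true},
    };

    template <ChipSketch chip>
    constexpr bool IsSupportUBToL1Singleshape()
    {
        // Compile-time lookup into the feature table, so callers can use if constexpr.
        return kFeatures[static_cast<uint8_t>(chip)].ifSupportUBToL1Singleshape;
    }

    static_assert(IsSupportUBToL1Singleshape<ChipSketch::V310>(),
                  "the 310-style entry in this sketch enables single-shape UB->L1");

Because the accessor is constexpr, the surrounding matmul code can dispatch with if constexpr and dead-strip unsupported paths per chip, which is the purpose of the accessors added in the next hunk.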
@@ -15,7 +15,8 @@ #ifndef IMPL_MATMUL_FEATURE_TRAIT_MATMUL_FEATURE_TRAIT_H #define IMPL_MATMUL_FEATURE_TRAIT_MATMUL_FEATURE_TRAIT_H -#include "../utils/matmul_utils.h" +#include "../utils/matmul_config_utils.h" +#include "../utils/matmul_type_def.h" #include "matmul_chip_cap.h" namespace AscendC { @@ -82,6 +83,21 @@ public: return MatmulChipCap::GetFeatures().ifSupportUserDefine; } + __aicore__ inline constexpr static bool IsSupportUBToL1Singleshape() + { + return MatmulChipCap::GetFeatures().ifSupportUBToL1Singleshape; + } + + __aicore__ inline constexpr static bool IsMmadInstrSupportAntiQuant() + { + return MatmulChipCap::GetFeatures().ifMmadInstrSupportAntiQuant; + } + + __aicore__ inline constexpr static bool IsSupportL0CToUB() + { + return MatmulChipCap::GetFeatures().ifSupportL0CToUB; + } + __aicore__ inline constexpr static bool IsSupportTrianMatmul() { return MatmulChipCap::GetFeatures().ifSupportTrianMatmul; diff --git a/impl/matmul/kfc/matmul_server_impl_c310.h b/impl/matmul/kfc/matmul_server_impl_c310.h new file mode 100644 index 00000000..e32d8531 --- /dev/null +++ b/impl/matmul/kfc/matmul_server_impl_c310.h @@ -0,0 +1,499 @@ +/* * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file matmul_server_impl_c310.h + * \brief + */ +#ifndef IMPL_MATMUL_KFC_MATMUL_SERVER_IMPL_C310_H +#define IMPL_MATMUL_KFC_MATMUL_SERVER_IMPL_C310_H + +#include "matmul_server.h" + +namespace AscendC { +#if defined(__DAV_C310__) +template +__aicore__ inline void MatmulService::Init(MSG_POS KfcMsg *msg) +{ + // C310 get tiling from special addr in ssbuf, no need to flush cacheline + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + return; + } else { + MSG_POS TilingInfo *tilingSSbuf = reinterpret_cast(GetTilingAddr(mul.GetSubBlockIdx())); + auto temp1 = ((MSG_POS uint64_t *)(&(tilingSSbuf->tCubeTiling))); + tiling_.SetTiling(&tmpTiling_); + auto temp2 = (uint64_t *)(&tmpTiling_); // need to be same with c220 + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint64_t); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + tilingSSbuf->valid = 0; // set ssbuf ok flag for next mm init + mul.Init(&tmpTiling_, nullptr); + InitL1Addr(); + } +} + +template +__aicore__ inline void MatmulService::GetOffsetSize( + MsgTmpPos MatmulConfigParams *body, KFC_Enum funID, uint32_t sync, uint64_t &offsetSize, uint32_t &enSequentialWrite, bool hasSetWorkspace) +{ + bool isIterate = (funID == KFC_Enum::MMFUN_ITERATE); + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) { + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + // On the scene of (defaultMode + iterate + async + setWorkspace + outUb) + if constexpr (ToMatmulConfig(MM_CFG).iterateMode == IterateMode::ITERATE_MODE_DEFAULT) { // for performance + if (unlikely(body->cAddr != 0 && isIterate && hasSetWorkspace && sync == 0)) { + enSequentialWrite = 1; + offsetSize = GetBaseOffsetC(enSequentialWrite, tiling_.GetBaseM(), tiling_.GetBaseN()); + } + } + } else { 
+ offsetSize = GetBaseOffsetC(enSequentialWrite, tiling_.GetBaseM(), tiling_.GetBaseN()); + } + } else { + if constexpr (GetPhyType(C_TYPE::pos) != Hardware::UB) { + if (sync == 0) { + // For asynchronous Iterate, the offset must be used for address calculation and the size is + // baseM x baseN. + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + offsetSize = ToMatmulConfig(MM_CFG).baseMN; + } else { + offsetSize = tiling_.GetBaseM() * tiling_.GetBaseN(); + } + enSequentialWrite = 1; + } + } + } +} + +template +__aicore__ inline bool MatmulService::Iterate( + MSG_POS KfcMsg* msg, KFC_Enum funID) +{ + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + + MatmulConfigParams body; + GetMsgFromSSbuf(msg, body); + WaitAB(body); + IterateSetMessage(msg, &body); + + auto enSequentialWrite = body.enSequentialWrite; + auto enAtomic = body.enAtomic; + auto sync = body.sync; + auto enPartialSum = body.enPartialSum; + auto hasSetWorkspace = body.hasSetWorkspace; + + GlobalTensor cGlobal; + LocalTensor cLocal; + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) { + size = ToMatmulConfig(MM_CFG).singleCoreMN; + } else { + size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + } + + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(body.cAddr), size); + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::L1 || GetPhyType(C_TYPE::pos) == Hardware::UB) { + cLocal = GetLocalTensor(body.cAddr, size); + } + + // calulate offset + uint64_t offset = 0; + uint64_t offsetSize = 0; + GetOffsetSize(&body, funID, sync, offsetSize, enSequentialWrite, hasSetWorkspace); + + // excute iterate + bool isFirstIterate = true; + TRACE_START(TraceId::MatMul_CALC); + // Asynchronous and configure the workspace + bool isIterate = (funID == KFC_Enum::MMFUN_ITERATE); + while (mul.Iterate(enPartialSum)) { + // Process iterateAll and iterate together, save code to decrease icachemiss + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (isIterate && (msg->body.cAddr == 0 || !hasSetWorkspace)) { + enPartialSum_ = enPartialSum; + isSyncIterate_ = sync; + TRACE_STOP(TraceId::MatMul_CALC); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare && IsBasic(ToMatmulConfig(MM_CFG)) && + (GetPhyType(C_TYPE::pos) == Hardware::GM)) { + if (sync == 0) { + mul.End(); + } + } + return false; // The queue is not switched, and no message needs to be returned. 
+ } + } + + // On the scene of (defaultMode + iterate + async + setWorkspace + outUb) + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB && + ToMatmulConfig(MM_CFG).iterateMode == IterateMode::ITERATE_MODE_DEFAULT) { + if (unlikely(isIterate && body.cAddr != 0 && hasSetWorkspace && sync == 0)) { + mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); + offset += offsetSize; + CrossCoreSetFlag( + GetIntraFlagId(instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + continue; + } + } + + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) { + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::L1) { + mul.GetTensorC(cLocal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } else if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + if (isFirstIterate) { + CrossCoreWaitFlag(GetIntraFlagId( + instID, static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreWaitFlag( + instID + static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + INTRA_NUM); + } + isFirstIterate = false; + } + mul.GetTensorC(cLocal, (uint8_t)(enAtomic), enSequentialWrite); + } else { + mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } + if constexpr (GetPhyType(C_TYPE::pos) != Hardware::UB) { + offset += offsetSize; + } + } + } + + // Now release UB, actual is gm, gm->l1 + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) { + if (sync || body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + IterNotify(); + } + } + mul.End(); + TRACE_STOP(TraceId::MatMul_CALC); + return true; +} + +template +__aicore__ inline void MatmulService::StartIterateNBatch( + MsgTmpPos MatmulConfigParams* body, uint32_t &cntIterator) +{ + return; +} + +template +__aicore__ inline bool MatmulService::GetTensorC(MSG_POS KfcMsg* msg) +{ + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + size = ToMatmulConfig(MM_CFG).baseMN; + } else { + size = tiling_.GetBaseM() * tiling_.GetBaseN(); + } + + uint32_t tmpBody = *(reinterpret_cast(&(msg->body))); + uint8_t enAtomic = (uint8_t)(tmpBody & 0xff); + bool enSequentialWrite = (tmpBody >> (sizeof(uint8_t) * ONE_BYTE_BIT_SIZE)) & 0x1; + + if constexpr (NeedTransitByGm(C_TYPE::pos)) { + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(msg->body.cAddr), size); + mul.GetTensorC(cGlobal, enAtomic, enSequentialWrite); + } else { + const auto& cLocal = GetLocalTensor(msg->body.cAddr, size); + // C310 direct l0c->ub need to sync pipe with AIV + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + CrossCoreWaitFlag( + GetIntraFlagId(instID, static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreWaitFlag(instID + static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + + INTRA_NUM); + } + } + mul.GetTensorC(cLocal, enAtomic, enSequentialWrite); + } + + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare && IsBasic(ToMatmulConfig(MM_CFG)) && + (GetPhyType(C_TYPE::pos) == Hardware::GM)) { + // L0c accumulation and async no need to wait copy out, so no need to set fixpipe + if (isSyncIterate_) { + CrossCoreSetFlag( + GetIntraFlagId(instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + CrossCoreSetFlag(instID + static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP) + + 
INTRA_NUM); + } + } else { + CrossCoreSetFlag( + GetIntraFlagId(instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreSetFlag(instID + static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP) + + INTRA_NUM); + } + } + + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::GM || GetPhyType(C_TYPE::pos) == Hardware::UB) { + if constexpr (!(IsBasic(ToMatmulConfig(MM_CFG)))) { + if (isSyncIterate_) { + return false; + } + if (mul.Iterate(enPartialSum_)) { + return false; + } + } + if constexpr (!(A_TYPE::ibShare && B_TYPE::ibShare && IsBasic(ToMatmulConfig(MM_CFG)) && + (GetPhyType(C_TYPE::pos) == Hardware::GM))) { + mul.End(); + } + // No data is available, switch the message queue. + return true; + } + return false; +} + +template +__aicore__ inline void MatmulService::GetMsgFromSSbuf( + MSG_POS KfcMsg* msg, MatmulConfigParams &body) +{ + // copy msg from ssbuf to stack for higher performance, because there is no cache in ssbuf + uint64_t *ptr = reinterpret_cast(&body); + MSG_POS uint64_t *ptrMsg = reinterpret_cast(&(msg->body)); + + if constexpr (ToMatmulConfig(MM_CFG).enableQuantVector || + (ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0 || + (ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0 || + HasScalePosition::value || HasScalePosition::value) { + RecvSSbufData<15>(ptr, ptrMsg); + } else if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { + RecvSSbufData<9>(ptr, ptrMsg); + } else if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) { + RecvSSbufData<8>(ptr, ptrMsg); + } else if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) { + RecvSSbufData<7>(ptr, ptrMsg); + } else { + RecvSSbufData<4>(ptr, ptrMsg); + } +} + +template +__aicore__ inline void MatmulService::IterNotify() +{ + if constexpr (GetPhyType(C_TYPE::pos) != Hardware::L1) { + CrossCoreSetFlag( + GetIntraFlagId(instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreSetFlag(GetIntraFlagId(instID, + static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), 1U)); // 1 means sub_block 1 + } + } +} + +template +__aicore__ inline void MatmulService::InitL1Addr() +{ + // need to init first in mm impl to get l1 addr + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::UB || GetPhyType(B_TYPE::pos) == Hardware::UB || + PhyMxScalePosIsUB() || PhyMxScalePosIsUB() || GetPhyType(BIAS_TYPE::pos) == Hardware::UB) { + MatrixL1Addr matrixL1Addr = KfcGetMatrixL1Addr(mul); + MSG_POS MsgMatmulL1Addr *matmulL1AddrMsg = + (MSG_POS MsgMatmulL1Addr *)GetMatmulL1AddrMsg(mul.GetSubBlockIdx(), instID); + CopyL1Addr2SSBUF(matmulL1AddrMsg, &matrixL1Addr); + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + matrixL1Addr = KfcGetMatrixL1Addr(mul); + matmulL1AddrMsg = (MSG_POS MsgMatmulL1Addr *)GetMatmulL1AddrMsg(1, instID); + CopyL1Addr2SSBUF(matmulL1AddrMsg, &matrixL1Addr); + } + } +} + +template +__aicore__ inline void MatmulService::CopyL1Addr2SSBUF( + MSG_POS MsgMatmulL1Addr *matmulL1AddrMsg_, MatrixL1Addr *matrixL1Addr_) +{ + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::UB) { + matmulL1AddrMsg_->l1aAddr = matrixL1Addr_->l1aAddr; + } + if constexpr (GetPhyType(B_TYPE::pos) == Hardware::UB) { + matmulL1AddrMsg_->l1bAddr = matrixL1Addr_->l1bAddr; + } + if constexpr (GetPhyType(BIAS_TYPE::pos) == Hardware::UB) { + matmulL1AddrMsg_->l1biasAddr = matrixL1Addr_->l1biasAddr; + } + if constexpr (PhyMxScalePosIsUB()) { + 
matmulL1AddrMsg_->l1aScaleAddr = matrixL1Addr_->l1aScaleAddr; + } + if constexpr (PhyMxScalePosIsUB()) { + matmulL1AddrMsg_->l1bScaleAddr = matrixL1Addr_->l1bScaleAddr; + } + + matmulL1AddrMsg_->valid = 1; +} + +template +__aicore__ inline void MatmulService::WaitAB( + MatmulConfigParams &body) +{ + // Make sure data copy done from UB->GM + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::GM || GetPhyType(B_TYPE::pos) == Hardware::GM || + GetPhyType(BIAS_TYPE::pos) == Hardware::GM) { + if constexpr (!A_TYPE::ibShare && !B_TYPE::ibShare) { + // GM->L1 wait UB->GM + CrossCoreWaitFlag( + GetIntraFlagId(0, static_cast(CUBE_WAIT_INTRA_Enum::GM_L1_UB_GM), mul.GetSubBlockIdx())); + } + } + + // iterateNorm & iterateDefault only set flag at firstIter, UB in only + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) && + (GetPhyType(A_TYPE::pos) == Hardware::UB || GetPhyType(B_TYPE::pos) == Hardware::UB || + PhyMxScalePosIsUB() || PhyMxScalePosIsUB() || GetPhyType(BIAS_TYPE::pos) == Hardware::UB)) { + if (!body.isFirstIter) { + return; + } + } + + // First iterate need let aiv know, its ok to copy UB->L1 + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::UB || GetPhyType(B_TYPE::pos) == Hardware::UB || + PhyMxScalePosIsUB() || PhyMxScalePosIsUB() || GetPhyType(BIAS_TYPE::pos) == Hardware::UB) { + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + // Only one mm obj in server + constexpr uint16_t l1SetIdV0 = static_cast(VEC_WAIT_INTRA_Enum::UB_L1_L1_L0AB); + constexpr uint16_t l1SetIdV1 = static_cast(VEC_WAIT_INTRA_Enum::UB_L1_L1_L0AB) + INTRA_NUM; + CrossCoreSetFlag(l1SetIdV0); + CrossCoreSetFlag(l1SetIdV1); + } else { + CrossCoreSetFlag( + GetIntraFlagId(0, static_cast(VEC_WAIT_INTRA_Enum::UB_L1_L1_L0AB), mul.GetSubBlockIdx())); + } + } +} + +template +__aicore__ inline void MatmulService::SetTensorScaleA( + MatmulConfigParams &body) +{ + const bool isTransScaleA = body.quantMode & 0b01; + if constexpr (PhyMxScalePosIsGM()) { + GlobalTensor scaleAGlobal; + scaleAGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float8_e8m0_t *>(body.quantAddr), 0); + mul.SetTensorScaleA(scaleAGlobal, isTransScaleA); + } else if constexpr (PhyMxScalePosIsUB() || PhyMxScalePosIsL1()) { + const auto &scaleALocal = GetLocalTensor(body.quantAddr, 0); + mul.SetTensorScaleA(scaleALocal, isTransScaleA); + return; + } +} + +template +__aicore__ inline void MatmulService::SetTensorScaleB( + MatmulConfigParams &body) +{ + const bool isTransScaleB = (body.quantMode >> 1) & 0b01; + if constexpr (PhyMxScalePosIsGM()) { + GlobalTensor scaleBGlobal; + scaleBGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ float8_e8m0_t *>(body.quantScalar), 0); + mul.SetTensorScaleB(scaleBGlobal, isTransScaleB); + } else if constexpr (PhyMxScalePosIsUB() || PhyMxScalePosIsL1()) { + const auto &scaleBLocal = GetLocalTensor(body.quantScalar, 0); + mul.SetTensorScaleB(scaleBLocal, isTransScaleB); + return; + } +} + +template +__aicore__ inline bool MatmulService::IterateBatch(MSG_POS KfcMsg* msg) +{ + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return true; + } + + MatmulConfigParams tmpBody; + GetMsgFromSSbuf(msg, tmpBody); + auto *body = &tmpBody; + WaitAB(*body); + +#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 + if (body->setQuant == 1) { + ASSERT(body->quantMode != 1); // scalar mode is not supported for quantization parameters in + // Batch MM + } +#endif + + IterateSetMessage(msg, body); + uint64_t size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + + if constexpr 
(NeedTransitByGm(C_TYPE::pos)) { + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(body->cAddr), size); + mul.IterateBatch(cGlobal,body->enPartialSum, (uint8_t)(body->enAtomic),body->enSequentialWrite, + body->matrixStrideA, body->matrixStrideB, body->matrixStrideC); + } + + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + LocalTensor cLocal = GetLocalTensor(body->cAddr, size); + CrossCoreWaitFlag( + GetIntraFlagId(instID, static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreWaitFlag(instID + static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + + INTRA_NUM); + } + mul.IterateBatch(cLocal,body->enPartialSum, (uint8_t)(body->enAtomic), body->enSequentialWrite, + body->matrixStrideA, body->matrixStrideB, body->matrixStrideC); + } + if (body->sync || body->waitIterateBatch) { + IterNotify(); + } + return true; +} + +template +__aicore__ inline bool MatmulService::IterateNBatch(MSG_POS KfcMsg* msg) +{ + if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { + return true; + } + MatmulConfigParams tmpBody; + GetMsgFromSSbuf(msg, tmpBody); + auto *body = &tmpBody; + WaitAB(*body); +#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 + if (msg->body.setQuant == 1) { + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in + // Batch MM + } +#endif + uint32_t cntIterator = 0; + StartIterateNBatch(body, cntIterator); + if (msg->body.sync || msg->body.waitIterateBatch) { + CrossCoreSetFlag( + GetIntraFlagId(instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + } else if (cntIterator >= INC_PROCESS_CHECK) { + CrossCoreSetFlag( + GetIntraFlagId(instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), mul.GetSubBlockIdx())); + } + return true; +} +#endif +} // namespace AscendC +#endif // __MATMUL_SERVER_IMPL_C310_H__ \ No newline at end of file -- Gitee From 4e303a77065dc91d5de0c63d73c8e43b942178a3 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:31:02 +0000 Subject: [PATCH 03/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/kfc/matmul_server_aux.h | 85 ++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/impl/matmul/kfc/matmul_server_aux.h b/impl/matmul/kfc/matmul_server_aux.h index fa780c8e..bff49e71 100644 --- a/impl/matmul/kfc/matmul_server_aux.h +++ b/impl/matmul/kfc/matmul_server_aux.h @@ -16,6 +16,9 @@ #define IMPL_MATMUL_KFC_MATMUL_SERVER_AUX_H #include "matmul_server_impl.h" +#if defined(__DAV_C310__) +#include "matmul_server_impl_c310.h" +#endif namespace AscendC { @@ -188,12 +191,21 @@ public: cubeObj.cubeObj[0].mul.ClearBias(); } } +#if defined(__DAV_C310__) + template + __aicore__ inline void SetSelfDefineData(T dataPtr){ + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { + cubeObj.cubeObj[0].mul.SetSelfDefineData(dataPtr); + } + } +#else __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) { if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { cubeObj.cubeObj[0].mul.SetSelfDefineData(dataPtr); } } +#endif __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) { if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { @@ -241,21 +253,45 @@ public: bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false) { if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if defined(__DAV_C310__) + WaitAB(); + cubeObj.cubeObj[0].mul.IterateAll(gm, enAtomic, enSequentialWrite, 
waitIterateAll, fakeMsg); + if (sync || waitIterateAll) { + IterNotify(); + } +#else constexpr uint16_t eventID = 9U; WaitEvent(eventID); cubeObj.cubeObj[0].mul.IterateAll(gm, enAtomic, enSequentialWrite, waitIterateAll, fakeMsg); if (sync || waitIterateAll){ NotifyEvent(cubeObj.cubeObj[0].instID); } +#endif cubeObj.cubeObj[0].mul.End(); } } template - __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0) + __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false) { if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { #if (__CCE_AICORE__ == 220) ASSERT("IterateAll localTensor not support when enableMixDualMaster is enabled"); +#endif +#if defined(__DAV_C310__) + WaitAB(); + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + CrossCoreWaitFlag(GetIntraFlagId( + cubeObj.cubeObj[0].instID, static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP), 0)); + CrossCoreWaitFlag(GetIntraFlagId( + cubeObj.cubeObj[0].instID, static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP), 1)); + } + cubeObj.cubeObj[0].mul.IterateAll(ubCmatrix, enAtomic); + if (sync || waitIterateAll) { + IterNotify(); + } + cubeObj.cubeObj[0].mul.End(); + #endif } } @@ -369,6 +405,53 @@ public: static_assert(!isTurnOnDebug, "Debug is not supported!"); } } +#if defined(__DAV_C310__) + __aicore__ inline void SetTensorScaleA(const GlobalTensor &a, bool isTransposeScaleA = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleA not support when enableMixDualMaster is enabled."); + } + __aicore__ inline void SetTensorScaleA(const LocalTensor &a, bool isTransposeScaleA = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleA not support when enableMixDualMaster is enabled."); + } + __aicore__ inline void SetTensorScaleB(const GlobalTensor &b, bool isTransposeScaleB = true) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleB not support when enableMixDualMaster is enabled."); + } + __aicore__ inline void SetTensorScaleB(const LocalTensor &b, bool isTransposeScaleB = true) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleB not support when enableMixDualMaster is enabled."); + } + constexpr static auto CONFIG = ToMatmulConfig(MM_CFG); +private: + __aicore__ inline void WaitAB() + { + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::UB || GetPhyType(B_TYPE::pos) == Hardware::UB || + PhyMxScalePosIsUB() || PhyMxScalePosIsUB() || GetPhyType(BIAS_TYPE::pos) == Hardware::UB) { + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + constexpr uint16_t l1SetIdV0 = static_cast(VEC_WAIT_INTRA_Enum::UB_L1_L1_L0AB); + constexpr uint16_t l1SetIdV1 = static_cast(VEC_WAIT_INTRA_Enum::UB_L1_L1_L0AB) + INTRA_NUM; + CrossCoreSetFlag(l1SetIdV0); + CrossCoreSetFlag(l1SetIdV1); + } + } + } + __aicore__ inline void IterNotify() + { + if constexpr (GetPhyType(C_TYPE::pos) != Hardware::L1) { + CrossCoreSetFlag( + GetIntraFlagId(cubeObj.cubeObj[0].instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), 0U)); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreSetFlag(GetIntraFlagId(cubeObj.cubeObj[0].instID, + static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), 1U)); // 1 means sub_block 1 + } + } + } +#endif }; // Match Policy with CallBack paramter -- Gitee From 97fd608b697f418a074809239855f6e75a9ba1d3 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:31:44 +0000 Subject: [PATCH 
04/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/kfc/matmul_server.h | 263 ++++++++++++++++++-------- impl/matmul/kfc/matmul_server_impl.h | 212 ++++++++++++--------- impl/matmul/kfc/matmul_server_utils.h | 39 ++++ 3 files changed, 343 insertions(+), 171 deletions(-) diff --git a/impl/matmul/kfc/matmul_server.h b/impl/matmul/kfc/matmul_server.h index c384cd49..7161ba29 100644 --- a/impl/matmul/kfc/matmul_server.h +++ b/impl/matmul/kfc/matmul_server.h @@ -16,6 +16,7 @@ #define IMPL_MATMUL_KFC_MATMUL_SERVER_H #include "matmul_server_utils.h" +#include "../utils/matmul_config_utils.h" namespace AscendC { @@ -28,23 +29,30 @@ class MatmulService { using DstT = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; +#if defined(__DAV_C310__) + using IMPL = MatmulImpl; + using UserDefDataType = typename MATMUL_POLICY::UserDefDataType; +#endif public: __aicore__ inline MatmulService() {} - __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace) + __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KFC_COMM_SERVER_PTR kfc, int32_t instID, GM_ADDR workspace) { ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server"); this->instID = instID; if constexpr (!ToMatmulConfig(MM_CFG).enableMixDualMaster) { ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server"); - ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); this->kfcCommSrv = kfc; - this->workspace = workspace; mul.SetSubBlockIdx(kfcCommSrv->subBlockID); +#if !defined(__DAV_C310__) + ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); + this->workspace = workspace; if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { msgAux.msg0.setOrgShape = false; msgAux.msg1.setOrgShape = false; } +#endif this->devEvtID = instID; + // A and B both enable ibShare no need to use cache if constexpr ((A_TYPE::ibShare || B_TYPE::ibShare) && !(A_TYPE::ibShare && B_TYPE::ibShare)) { if (kfcCommSrv->subBlockID == 0) { gCache.Init(); @@ -57,19 +65,24 @@ public: if constexpr (IsSameTypeV) { tiling_.SetTiling((TCubeTiling *)tiling); mul.Init(tiling_.GetTiling(), nullptr); +#if defined(__DAV_C310__) + InitL1Addr(); +#endif } else if (tiling) { tiling_.SetTiling((TCubeTiling *)tiling); mul.Init(tiling_.GetTiling(), nullptr); +#if defined(__DAV_C310__) + InitL1Addr(); +#endif } } - - __aicore__ inline void Init(__gm__ KfcMsg* msg); + __aicore__ inline void Init(MSG_POS KfcMsg* msg); __aicore__ inline void SetSubBlockIdx(uint8_t idx) { mul.SetSubBlockIdx(idx); } - __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg); + __aicore__ inline void SetOrgShape(MSG_POS KfcMsg* msg); __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg) { if (msg->body.setTail) { @@ -77,48 +90,80 @@ public: } } - __aicore__ inline void SetTail(__gm__ KfcMsg* msg) + __aicore__ inline void SetTail(MsgTmpPos MatmulConfigParams* body) { - if (msg->body.setTail) { - mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK); + if (body->setTail) { + mul.SetTail(body->singleM, body->singleN, body->singleK); } } - __aicore__ inline void SetHF32(__gm__ KfcMsg* msg) + __aicore__ inline void SetHF32(MSG_POS KfcMsg* msg) { mul.SetHF32(static_cast(msg->body.enHF32), static_cast(msg->body.hf32TransMode)); } - __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg); - __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset); - __aicore__ inline void 
SetQuantVector(__gm__ KfcMsg* msg) + __aicore__ inline void SetTensorA(MsgTmpPos MatmulConfigParams* body); + __aicore__ inline void SetTensorA(MsgTmpPos MatmulConfigParams* body, const uint64_t size, const uint64_t offset); + __aicore__ inline void SetQuantVector(MsgTmpPos MatmulConfigParams* body) { - if (!msg->body.setQuant) { + if (!body->setQuant) { return; } - int quantMode = msg->body.quantMode; + int quantMode = body->quantMode; if (quantMode == 1) { - uint64_t quantScalar = msg->body.quantScalar; + uint64_t quantScalar = body->quantScalar; mul.SetQuantScalar(quantScalar); } else if (quantMode == 2) { - const uint64_t size = static_cast(msg->body.quantSize); + const uint64_t size = static_cast(body->quantSize); GlobalTensor quantGlobal; - quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size); + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(body->quantAddr), size); mul.SetQuantVector(quantGlobal); } } - - __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg) + __aicore__ inline void SetBatchNum(MsgTmpPos MatmulConfigParams* body) { if constexpr (A_TYPE::layout == LayoutMode::NONE) { return; } - if (!msg->body.setBatch) { + if (!body->setBatch) { + return; + } + mul.SetBatchNum(body->batchA, body->batchB); + } + +#if defined(__DAV_C310__) + __aicore__ inline void SetUserDefInfo(MSG_POS KfcMsg* msg) + { + if (msg->userCustomData == 1) { + mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); return; } - mul.SetBatchNum(msg->body.batchA, msg->body.batchB); + UserDefDataType userData; + uint32_t *ptr = reinterpret_cast(&userData); + MSG_POS uint32_t *ptrMsg = reinterpret_cast(&(msg->body)); + for (int i = 0; i < sizeof(UserDefDataType) / sizeof(uint32_t); i++) { + *(ptr + i) = *(ptrMsg + i); + } + + mul.SetSelfDefineData(userData); } + __aicore__ inline void SetSelfDefineData(MSG_POS KfcMsg* msg, MsgTmpPos MatmulConfigParams* body) + { + if (body->userInfoType == 0) { + return; + } + UserDefDataType userData; + uint32_t *ptr = reinterpret_cast(&userData); + if constexpr (sizeof(UserDefDataType) == 4) { + *ptr = msg->userCustomData; + } else if constexpr (sizeof(UserDefDataType) == 8) { + *ptr = msg->userCustomData; + *(ptr + 1) = body->userCustomData; + } + mul.SetSelfDefineData(userData); + } +#else __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg) { GlobalTensor msgGlobal; @@ -136,18 +181,20 @@ public: { mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); } +#endif - __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg); - __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset); - __aicore__ inline void SetBias(__gm__ KfcMsg* msg); - __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset); - __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg); + __aicore__ inline void SetTensorB(MsgTmpPos MatmulConfigParams* body); + __aicore__ inline void SetTensorB(MsgTmpPos MatmulConfigParams* body, const uint64_t size, const uint64_t offset); + __aicore__ inline void SetBias(MsgTmpPos MatmulConfigParams* body); + __aicore__ inline void SetBias(MsgTmpPos MatmulConfigParams* body, const uint64_t offset); + __aicore__ inline bool GetTensorC(MSG_POS KfcMsg* msg); __aicore__ inline uint16_t GetInstID() { return instID; } - __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg) + __aicore__ inline void IterateSetMessage(MSG_POS KfcMsg* msg, MsgTmpPos MatmulConfigParams* body) { +#if !defined(__DAV_C310__) if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { if (mul.GetSubBlockIdx() 
== 0 && msgAux.msg0.setOrgShape) { mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, @@ -157,52 +204,69 @@ public: msgAux.msg1.orgKb, msgAux.msg1.orgKc); } } - if (msg->body.isFirstIter) { - SetTensorA(msg); - SetTensorB(msg); +#endif + if (body->isFirstIter) { + SetTensorA(body); + SetTensorB(body); if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { - SetBias(msg); + SetBias(body); } if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) { - SetTail(msg); + SetTail(body); } if constexpr (ToMatmulConfig(MM_CFG).enableQuantVector) { - SetQuantVector(msg); + SetQuantVector(body); } if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) || ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { if constexpr (A_TYPE::layout != LayoutMode::NONE) { - SetBatchNum(msg); + SetBatchNum(body); + } + } +#if defined(__DAV_C310__) + if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) { + if (body->setOrgShape) { + mul.SetOrgShape(body->orgM, body->orgN, body->orgKa, body->orgKb, body->orgKc); } } + if constexpr (HasScalePosition::value) { + SetTensorScaleA(*body); + } + if constexpr (HasScalePosition::value) { + SetTensorScaleB(*body); + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) { + SetSelfDefineData(msg, body); + } +#else if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) { SetSelfDefineData(msg); } +#endif } } - __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize, + __aicore__ inline void IterateSetMessage(MsgTmpPos MatmulConfigParams* body, const uint64_t batchASize, const uint64_t batchBSize, const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0) { - if (msg->body.isFirstIter) { - SetTensorA(msg, batchASize, offsetA); - SetTensorB(msg, batchBSize, offsetB); - SetBias(msg, offsetBias); - SetTail(msg); - SetQuantVector(msg); + if (body->isFirstIter) { + SetTensorA(body, batchASize, offsetA); + SetTensorB(body, batchBSize, offsetB); + SetBias(body, offsetBias); + SetTail(body); + SetQuantVector(body); if constexpr (A_TYPE::layout != LayoutMode::NONE) { - SetBatchNum(msg); + SetBatchNum(body); } } } - - __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg); - __aicore__ inline void StartIterateNBatch(__gm__ KfcMsg* msg, uint32_t &cntIterator); - __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg); - __aicore__ inline void GetOffsetSize(__gm__ KfcMsg* msg, KFC_Enum funID, uint32_t sync, - uint64_t &offsetSize, uint32_t &enSequentialWrite); - __aicore__ inline bool StartIterate(__gm__ KfcMsg* msg, KFC_Enum funID, uint32_t sync, uint32_t &cntIterator); - __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID); + __aicore__ inline bool IterateBatch(MSG_POS KfcMsg* msg); + __aicore__ inline void StartIterateNBatch(MsgTmpPos MatmulConfigParams* body, uint32_t &cntIterator); + __aicore__ inline bool IterateNBatch(MSG_POS KfcMsg* msg); + __aicore__ inline void GetOffsetSize(MsgTmpPos MatmulConfigParams* body, KFC_Enum funID, uint32_t sync, + uint64_t &offsetSize, uint32_t &enSequentialWrite, bool hasSetWorkspace = false); + __aicore__ inline bool StartIterate(MsgTmpPos MatmulConfigParams* body, KFC_Enum funID, uint32_t sync, uint32_t &cntIterator); + __aicore__ inline bool Iterate(MSG_POS KfcMsg* msg, KFC_Enum funID); __aicore__ inline void QuantCacheRefresh(__gm__ KfcMsg* msg) { if constexpr (((IsSameType::value || IsSameType::value) && @@ -217,22 +281,13 @@ public: } } - __aicore__ 
inline bool IterateIntraBlockPartSum(__gm__ KfcMsg* msg, KFC_Enum funID) + __aicore__ inline bool IterateIntraBlockPartSum(MSG_POS KfcMsg* msg, KFC_Enum funID) { if constexpr (A_TYPE::layout != LayoutMode::NONE) { return true; } - if constexpr (((IsSameType::value || IsSameType::value) && - IsSameType::value) || - ((IsSameType::value || IsSameType::value) && - IsSameType::value) || - (IsSameType::value && (IsSameType::value || - IsSameType::value))) { - GlobalTensor msgGlobal; - msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); - DataCacheCleanAndInvalid(msgGlobal); - } - IterateSetMessage(msg); + QuantCacheRefresh(msg); + IterateSetMessage(msg, &(msg->body)); if (mul.GetSubBlockIdx() == 0) { return true; } @@ -281,9 +336,26 @@ public: template __aicore__ inline bool SkipMsg(KFC_Enum funID, bool &freeMsg, int &lastMsgId, const int subBlockID) { +#if defined(__DAV_C310__) + if constexpr (enableHardPoll == 1) { + return false; + } + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (lastMsgId == subBlockID) { + freeMsg = false; + return true; + } else if (subBlockID == 1) { + lastMsgId = 1; + return true; + } + lastMsgId = 0; + return false; + } +#else if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { return false; } +#endif if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || ToMatmulConfig(MM_CFG).intraBlockPartSum) { if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { if (lastMsgId == subBlockID) { @@ -301,22 +373,37 @@ public: template __aicore__ inline bool LockMsgQueue(KFC_Enum funID, bool &freeMsg, int &lastMsgId, const int subBlockID, - __gm__ KfcMsg *msg = nullptr) + MSG_POS KfcMsg *msg = nullptr) { +#if defined(__DAV_C310__) + if constexpr (!(A_TYPE::ibShare && B_TYPE::ibShare)) { + if (funID == KFC_Enum::MMFUN_ITERATE) { + if (msg->body.cAddr == 0) { + return true; + } + } + } + if constexpr (enableHardPoll == 1) { + return true; + } +#else if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { return true; } +#endif return false; } - __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID) + __aicore__ inline bool Process(MSG_POS KfcMsg* msg, KFC_Enum funID) { if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) || ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) { if ((static_cast(funID) & static_cast(KFC_Enum::MMFUN_MASK)) == static_cast(KFC_Enum::MMFUN_MASK)) { if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { +#if !defined(__DAV_C310__) return IterateIntraBlockPartSum(msg, funID); +#endif } else { return Iterate(msg, funID); } @@ -338,12 +425,14 @@ public: return GetTensorC(msg); } } +#if !defined(__DAV_C310__) if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) { if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) { SetOrgShape(msg); return true; } } +#endif if constexpr (ToMatmulConfig(MM_CFG).enableInit) { if (funID == KFC_Enum::MMFUN_INIT) { Init(msg); @@ -356,9 +445,11 @@ public: return IterateNBatch(msg); } } - if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { - SetUserDefInfo(msg); - return true; + if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) { + if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { + SetUserDefInfo(msg); + return true; + } } if (funID == KFC_Enum::MMFUN_SET_HF32) { SetHF32(msg); @@ -368,25 +459,35 @@ public: return true; } - template __aicore__ LocalTensor GetTscmTensor(uint64_t addr, const uint64_t size) + template + __aicore__ LocalTensor GetLocalTensor(uint64_t addr, const uint64_t size) { - LocalTensor scmLocal; - TBuffAddr 
scmTbuf; - scmTbuf.logicPos = (uint8_t)(TPosition::TSCM); - scmTbuf.dataLen = size * sizeof(DstT); - scmTbuf.bufferAddr = addr; + LocalTensor localTensor; + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(Tpos); + tbufOutTmp.bufferAddr = addr; #if ASCENDC_CPU_DEBUG - scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr; + tbufOutTmp.dataLen = size * sizeof(Dtype); + tbufOutTmp.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(Tpos)) + addr; #endif - scmLocal.SetAddr(scmTbuf); - return scmLocal; + localTensor.SetAddr(tbufOutTmp); + return localTensor; } +#if defined(__DAV_C310__) + __aicore__ inline void GetMsgFromSSbuf(MSG_POS KfcMsg* msg, MatmulConfigParams &body); + __aicore__ inline void InitL1Addr(); + __aicore__ inline void CopyL1Addr2SSBUF(MSG_POS MsgMatmulL1Addr *matmulL1AddrMsg_, MatrixL1Addr *matrixL1Addr_); + __aicore__ inline void WaitAB(MatmulConfigParams &body); + __aicore__ inline void IterNotify(); + __aicore__ inline void SetTensorScaleA(MatmulConfigParams &body); + __aicore__ inline void SetTensorScaleB(MatmulConfigParams &body); +#endif public: MatmulImpl mul; private: GM_ADDR workspace; - KfcCommServer* kfcCommSrv; + KFC_COMM_SERVER_PTR kfcCommSrv; MatmulTiling tiling_; TCubeTiling tmpTiling_; // for compatible with init interface typename IBShareCache()>::ShareCache gCache; @@ -395,6 +496,10 @@ public: uint16_t instID; private: uint16_t devEvtID; +#if defined(__DAV_C310__) + uint8_t enPartialSum_; + uint8_t isSyncIterate_; +#endif }; } // namespace AscendC -#endif // __MATMUL_SERVER_H__ +#endif // __MATMUL_SERVER_H__ \ No newline at end of file diff --git a/impl/matmul/kfc/matmul_server_impl.h b/impl/matmul/kfc/matmul_server_impl.h index 79f833dd..8a4099a7 100644 --- a/impl/matmul/kfc/matmul_server_impl.h +++ b/impl/matmul/kfc/matmul_server_impl.h @@ -18,10 +18,10 @@ #include "matmul_server.h" namespace AscendC { - +#if !defined(__DAV_C310__) template -__aicore__ inline void MatmulService::Init(__gm__ KfcMsg* msg) +__aicore__ inline void MatmulService::Init(MSG_POS KfcMsg* msg) { if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { return; @@ -49,7 +49,7 @@ __aicore__ inline void MatmulService -__aicore__ inline void MatmulService::SetOrgShape(__gm__ KfcMsg* msg) +__aicore__ inline void MatmulService::SetOrgShape(MSG_POS KfcMsg* msg) { if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { if (mul.GetSubBlockIdx() == 0) { @@ -72,16 +72,19 @@ __aicore__ inline void MatmulServiceorgShape.orgKc); } } +#endif template -__aicore__ inline void MatmulService::SetTensorA(__gm__ KfcMsg* msg) +__aicore__ inline void MatmulService::SetTensorA(MsgTmpPos MatmulConfigParams* body) { - if (!msg->body.setTensorA) +#if !defined(__DAV_C310__) + if (!body->setTensorA) return; +#endif if constexpr (A_TYPE::format == CubeFormat::SCALAR) { SrcAT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr)); + auto temp1 = reinterpret_cast(&(body->aAddr)); auto temp2 = (uint8_t*)&scalar; for (int i = 0; i < sizeof(SrcAT); i++, temp1++, temp2++) { @@ -90,28 +93,37 @@ __aicore__ inline void MatmulServicebody.sizeAmatrix); - if constexpr (PhyPosIsL1(A_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.aAddr, size); - mul.SetTensorA(scmLocal, msg->body.isTransA); - } else { + uint64_t size = 0; +#if !defined(__DAV_C310__) + size = (uint64_t)(body->sizeAmatrix); // not defined in C310 +#endif + if constexpr (NeedTransitByGm(A_TYPE::pos)) { GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(msg->body.aAddr), size); 
- mul.SetTensorA(aGlobal, msg->body.isTransA); + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(body->aAddr), size); + mul.SetTensorA(aGlobal, body->isTransA); + } else { +#if defined(__DAV_C310__) + // use addr to send intraId in C310 + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + KfcSetIntraAId(mul, body->aAddr >> VALID_ADDR_BITS_NUM); + } +#endif + const auto& aLocal = GetLocalTensor(body->aAddr, size); + mul.SetTensorA(aLocal, body->isTransA); } } template __aicore__ inline void MatmulService::SetTensorA( - __gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + MsgTmpPos MatmulConfigParams* body, const uint64_t size, const uint64_t offset) { - if (!msg->body.setTensorA) { + if (!body->setTensorA) { return; } if constexpr (A_TYPE::format == CubeFormat::SCALAR) { SrcAT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset); + auto temp1 = reinterpret_cast(&(body->aAddr) + offset); auto temp2 = (uint8_t*)&scalar; for (int i = 0; i < sizeof(SrcAT); i++, temp1++, temp2++) { @@ -121,24 +133,26 @@ __aicore__ inline void MatmulService(msg->body.aAddr + offset, size); - mul.SetTensorA(scmLocal, msg->body.isTransA); + const auto& aLocal = GetLocalTensor(body->aAddr + offset, size); + mul.SetTensorA(aLocal, body->isTransA); } else { GlobalTensor aGlobal; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(msg->body.aAddr + offset), size); - mul.SetTensorA(aGlobal, msg->body.isTransA); + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(body->aAddr + offset), size); + mul.SetTensorA(aGlobal, body->isTransA); } } template -__aicore__ inline void MatmulService::SetTensorB(__gm__ KfcMsg* msg) +__aicore__ inline void MatmulService::SetTensorB(MsgTmpPos MatmulConfigParams* body) { - if (!msg->body.setTensorB) +#if !defined(__DAV_C310__) + if (!body->setTensorB) return; +#endif if constexpr (B_TYPE::format == CubeFormat::SCALAR) { SrcBT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr)); + auto temp1 = reinterpret_cast(&(body->bAddr)); auto temp2 = (uint8_t*)&scalar; for (int i = 0; i < sizeof(SrcBT); i++, temp1++, temp2++) { @@ -147,28 +161,37 @@ __aicore__ inline void MatmulServicebody.sizeBmatrix); - if constexpr (PhyPosIsL1(B_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.bAddr, size); - mul.SetTensorB(scmLocal, msg->body.isTransB); - } else { + uint64_t size = 0; +#if !defined(__DAV_C310__) + size = (uint64_t)(body->sizeBmatrix); // not defined in C310 +#endif + if constexpr (NeedTransitByGm(B_TYPE::pos)) { GlobalTensor bGlobal; - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(msg->body.bAddr), size); - mul.SetTensorB(bGlobal, msg->body.isTransB); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(body->bAddr), size); + mul.SetTensorB(bGlobal, body->isTransB); + } else { +#if defined(__DAV_C310__) + // use addr to send intraId in C310 + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + KfcSetIntraBId(mul, body->bAddr >> VALID_ADDR_BITS_NUM); + } +#endif + const auto& bLocal = GetLocalTensor(body->bAddr, size); + mul.SetTensorB(bLocal, body->isTransB); } } template __aicore__ inline void MatmulService::SetTensorB( - __gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + MsgTmpPos MatmulConfigParams* body, const uint64_t size, const uint64_t offset) { - if (!msg->body.setTensorB) { + if (!body->setTensorB) { return; } if constexpr (B_TYPE::format == CubeFormat::SCALAR) { SrcBT scalar; - auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset); + auto temp1 = 
reinterpret_cast(&(body->bAddr) + offset); auto temp2 = (uint8_t*)&scalar; for (int i = 0; i < sizeof(SrcBT); i++, temp1++, temp2++) { @@ -178,28 +201,28 @@ __aicore__ inline void MatmulService(msg->body.bAddr + offset, size); - mul.SetTensorB(scmLocal, msg->body.isTransB); + const auto& bLocal = GetLocalTensor(body->bAddr + offset, size); + mul.SetTensorB(bLocal, body->isTransB); } else { GlobalTensor bGlobal; - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(msg->body.bAddr + offset), size); - mul.SetTensorB(bGlobal, msg->body.isTransB); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(body->bAddr + offset), size); + mul.SetTensorB(bGlobal, body->isTransB); } } template -__aicore__ inline void MatmulService::SetBias(__gm__ KfcMsg* msg) +__aicore__ inline void MatmulService::SetBias(MsgTmpPos MatmulConfigParams* body) { - if (msg->body.setTensorBias) { + if (body->setTensorBias) { const uint64_t size = (uint64_t)tiling_.GetSingleCoreN(); - if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.biasAddr, size); - mul.SetBias(scmLocal); - } else { + if constexpr (NeedTransitByGm(BIAS_TYPE::pos)) { GlobalTensor biasGlobal; - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr), size); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(body->biasAddr), size); mul.SetBias(biasGlobal); + } else { + const auto& biasLocal = GetLocalTensor(body->biasAddr, size); + mul.SetBias(biasLocal); } } else { mul.DisableBias(); @@ -209,17 +232,17 @@ __aicore__ inline void MatmulService __aicore__ inline void MatmulService::SetBias( - __gm__ KfcMsg* msg, const uint64_t offset) + MsgTmpPos MatmulConfigParams* body, const uint64_t offset) { - if (msg->body.setTensorBias) { + if (body->setTensorBias) { const uint64_t size = (uint64_t)tiling_.GetSingleCoreN(); if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { - const auto& scmLocal = GetTscmTensor(msg->body.biasAddr + offset, size); - mul.SetBias(scmLocal); + const auto& biasLocal = GetLocalTensor(body->biasAddr + offset, size); + mul.SetBias(biasLocal); } else { GlobalTensor biasGlobal; biasGlobal.SetGlobalBuffer( - reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size); + reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(body->biasAddr + offset), size); mul.SetBias(biasGlobal); } } else { @@ -227,9 +250,10 @@ __aicore__ inline void MatmulService -__aicore__ inline bool MatmulService::GetTensorC(__gm__ KfcMsg* msg) +__aicore__ inline bool MatmulService::GetTensorC(MSG_POS KfcMsg* msg) { if constexpr (A_TYPE::layout != LayoutMode::NONE) { return true; @@ -241,11 +265,10 @@ __aicore__ inline bool MatmulService(msg->body.cAddr, size); - mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + const auto& cLocal = GetLocalTensor(msg->body.cAddr, size); + mul.GetTensorC(cLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); } else { GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); } @@ -264,7 +287,7 @@ __aicore__ inline bool MatmulService -__aicore__ inline bool MatmulService::IterateBatch(__gm__ KfcMsg* msg) +__aicore__ inline bool MatmulService::IterateBatch(MSG_POS KfcMsg* msg) { if constexpr (A_TYPE::layout == LayoutMode::NONE) { return true; @@ -273,20 +296,20 @@ __aicore__ inline bool MatmulService msgGlobalTensor; 
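// Sketch of the message-read pattern that the following statements implement: before the cube core
// dereferences the GM-resident KfcMsg body filled by the vector core, the covering cache line is
// cleaned and invalidated through an int64_t GlobalTensor view of the message. The helper name
// LoadMsgBody, the int64_t element type and the MatmulConfigParams return type are inferred
// assumptions; template arguments not visible in the surrounding hunk are likewise left out here.
__aicore__ inline __gm__ MatmulConfigParams* LoadMsgBody(__gm__ KfcMsg* msg)
{
    GlobalTensor<int64_t> view;
    view.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t));
    DataCacheCleanAndInvalid(view);   // drop any stale cached copy of the message
    return &(msg->body);              // fields such as aAddr/bAddr/cAddr are now safe to read
}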
msgGlobalTensor.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); DataCacheCleanAndInvalid(msgGlobalTensor); + __gm__ auto *body = &(msg->body); #if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 - if (msg->body.setQuant == 1) { - ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in + if (body->setQuant == 1) { + ASSERT(body->quantMode != 1); // scalar mode is not supported for quantization parameters in // Batch MM } #endif - IterateSetMessage(msg); + IterateSetMessage(msg, body); uint64_t size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), - msg->body.enSequentialWrite, msg->body.matrixStrideA, - msg->body.matrixStrideB, msg->body.matrixStrideC); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(body->cAddr), size); + mul.IterateBatch(cGlobal, body->enPartialSum, (uint8_t)(body->enAtomic),body->enSequentialWrite, + body->matrixStrideA, body->matrixStrideB, body->matrixStrideC); // Now release UB if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || @@ -295,7 +318,7 @@ __aicore__ inline bool MatmulServiceFreeUB(msg->ubAddr); } } - if (msg->body.sync || msg->body.waitIterateBatch) { + if (body->sync || body->waitIterateBatch) { uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); NotifyEvent(eventID); } @@ -305,21 +328,21 @@ __aicore__ inline bool MatmulService __aicore__ inline void MatmulService::StartIterateNBatch( - __gm__ KfcMsg* msg, uint32_t &cntIterator) + MsgTmpPos MatmulConfigParams* body, uint32_t &cntIterator) { const uint64_t size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); - const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop; - uint64_t batchAOffset = tiling_.GetALayoutInfoD() * msg->body.batchA; + uint64_t singleBatchASize = (uint64_t)(body->sizeAmatrix) / body->batchLoop; + uint64_t batchAOffset = tiling_.GetALayoutInfoD() * body->batchA; if constexpr (A_TYPE::layout != LayoutMode::SBNGD) { batchAOffset = batchAOffset * tiling_.GetALayoutInfoS(); } - const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop; - uint64_t batchBOffset = tiling_.GetBLayoutInfoD() * msg->body.batchB; + uint64_t singleBatchBSize = (uint64_t)(body->sizeBmatrix) / body->batchLoop; + uint64_t batchBOffset = tiling_.GetBLayoutInfoD() * body->batchB; if constexpr (B_TYPE::layout != LayoutMode::SBNGD) { batchBOffset = batchBOffset * tiling_.GetBLayoutInfoS(); } const uint64_t batchCOffset = tiling_.GetCLayoutInfoS2(); - const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB; + const uint32_t batchC = body->batchA > body->batchB ? body->batchA : body->batchB; bool layoutGCondition = tiling_.GetCLayoutInfoG() == 1 && (tiling_.GetBLayoutInfoG() != 1 || tiling_.GetALayoutInfoG() != 1); int32_t layoutG = tiling_.GetBLayoutInfoG() > tiling_.GetALayoutInfoG() ? 
tiling_.GetBLayoutInfoG() : tiling_.GetALayoutInfoG(); @@ -332,18 +355,18 @@ __aicore__ inline void MatmulServicebody.batchLoop; loopIdx++) { + for (uint32_t loopIdx = 0U; loopIdx < body->batchLoop; loopIdx++) { const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T); const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T); const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T); - IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); + IterateSetMessage(body, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); GlobalTensor cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size); - mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), - msg->body.enSequentialWrite, msg->body.matrixStrideA, - msg->body.matrixStrideB, msg->body.matrixStrideC); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(body->cAddr + offset), size); + mul.IterateBatch(cGlobal, body->enPartialSum, (uint8_t)(body->enAtomic), + body->enSequentialWrite, body->matrixStrideA, + body->matrixStrideB, body->matrixStrideC); cntIterator++; - if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) { + if (cntIterator < INC_PROCESS_CHECK && (!body->sync && !body->waitIterateBatch)) { uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); NotifyEvent(eventID); } @@ -353,7 +376,7 @@ __aicore__ inline void MatmulService -__aicore__ inline bool MatmulService::IterateNBatch(__gm__ KfcMsg* msg) +__aicore__ inline bool MatmulService::IterateNBatch(MSG_POS KfcMsg* msg) { if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { return true; @@ -361,6 +384,7 @@ __aicore__ inline bool MatmulService msgGlobal; msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); DataCacheCleanAndInvalid(msgGlobal); + __gm__ auto *body = &(msg->body); #if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 if (msg->body.setQuant == 1) { ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in @@ -368,7 +392,7 @@ __aicore__ inline bool MatmulService __aicore__ inline void MatmulService::GetOffsetSize( - __gm__ KfcMsg* msg, KFC_Enum funID, uint32_t sync, uint64_t &offsetSize, uint32_t &enSequentialWrite) + MsgTmpPos MatmulConfigParams* body, KFC_Enum funID, uint32_t sync, uint64_t &offsetSize, uint32_t &enSequentialWrite, bool hasSetWorkspace) { if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) { - ASSERT(msg->body.cAddr != 0); // The output address must be configured. + ASSERT(body->cAddr != 0); // The output address must be configured. if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { offsetSize = enSequentialWrite ? ToMatmulConfig(MM_CFG).baseMN : 0; } else { @@ -399,7 +423,7 @@ __aicore__ inline void MatmulServicebody.cAddr != 0); // The output address must be configured. + ASSERT(body->cAddr != 0); // The output address must be configured. if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { offsetSize = enSequentialWrite ? 
ToMatmulConfig(MM_CFG).baseMN : 0; } else { @@ -421,7 +445,7 @@ __aicore__ inline void MatmulService __aicore__ inline bool MatmulService::StartIterate( - __gm__ KfcMsg* msg, KFC_Enum funID, uint32_t sync, uint32_t &cntIterator) + MsgTmpPos MatmulConfigParams* body, KFC_Enum funID, uint32_t sync, uint32_t &cntIterator) { uint64_t size; if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) { @@ -431,14 +455,17 @@ __aicore__ inline bool MatmulService cGlobal; - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); - const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(body->cAddr), size); + LocalTensor cLocal; + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + cLocal = GetLocalTensor(body->cAddr, size); + } uint64_t offset = 0; uint64_t offsetSize = 0; - auto enSequentialWrite = msg->body.enSequentialWrite; - auto enAtomic = msg->body.enAtomic; - auto enPartialSum = msg->body.enPartialSum; - GetOffsetSize(msg, funID, sync, offsetSize, enSequentialWrite); + auto enSequentialWrite = body->enSequentialWrite; + auto enAtomic = body->enAtomic; + auto enPartialSum = body->enPartialSum; + GetOffsetSize(body, funID, sync, offsetSize, enSequentialWrite); TRACE_START(TraceId::MatMul_CALC); // Asynchronous and configure the workspace while (mul.Iterate(enPartialSum)) { @@ -451,7 +478,7 @@ __aicore__ inline bool MatmulService __aicore__ inline bool MatmulService::Iterate( - __gm__ KfcMsg* msg, KFC_Enum funID) + MSG_POS KfcMsg* msg, KFC_Enum funID) { if constexpr (A_TYPE::layout != LayoutMode::NONE) { return true; @@ -490,10 +517,10 @@ __aicore__ inline bool MatmulService(); QuantCacheRefresh(msg); - IterateSetMessage(msg); + IterateSetMessage(msg, &(msg->body)); uint32_t cntIterator = 0; auto sync = msg->body.sync; - if(!StartIterate(msg, funID, sync, cntIterator)) { + if(!StartIterate(&(msg->body), funID, sync, cntIterator)) { return false; } // Now release UB @@ -515,5 +542,6 @@ __aicore__ inline bool MatmulServiceL1, no need through gm + if (PhyPosIsUB(tPos)) { + return false; + } +#else + if (PhyPosIsUB(tPos)) { + return true; + } +#endif + if (PhyPosIsL1(tPos)) { + return false; + } + if (PhyPosIsGM(tPos)) { + return true; + } + return true; +} + +#if defined(__DAV_C310__) +template +__aicore__ inline uint64_t GetBaseOffsetC(bool enSequentialWrite, int32_t baseM, int32_t baseN) +{ + if constexpr (AscendC::ToMatmulConfig(MM_CFG).baseMN != 0) { + return (enSequentialWrite ? AscendC::ToMatmulConfig(MM_CFG).baseMN : 0); + } else { + return (enSequentialWrite ? (baseM * baseN) : 0); + } +} +// c310 msg stored in mmserver obj +#define MsgTmpPos +#else +// c220 msg stored in gm +#define MsgTmpPos __gm__ +#endif + } // namespace AscendC #endif // _MATMUL_SERVER_H_ \ No newline at end of file -- Gitee From 8c9a4f65d63b0661179dc68b45ba2dec33f90afb Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:34:28 +0000 Subject: [PATCH 05/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/param/matmul_cross_core_sync.h | 134 +++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 impl/matmul/param/matmul_cross_core_sync.h diff --git a/impl/matmul/param/matmul_cross_core_sync.h b/impl/matmul/param/matmul_cross_core_sync.h new file mode 100644 index 00000000..34660595 --- /dev/null +++ b/impl/matmul/param/matmul_cross_core_sync.h @@ -0,0 +1,134 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_cross_core_sync.h + * \brief matmul cross core sync manager + */ + +#ifndef IMPL_MATMUL_MODULES_PARAM_MATMUL_CROSS_CORE_SYNC_H +#define IMPL_MATMUL_MODULES_PARAM_MATMUL_CROSS_CORE_SYNC_H + +#include "../utils/matmul_module.h" + +namespace AscendC { +namespace Impl { +namespace Detail { + +constexpr uint16_t CROSS_CORE_SYNC_FACTOR = 16; +constexpr uint16_t CROSS_CORE_INTRA_MODE = 4; + +template +class MatmulCrossCoreSync +{ +public: + __aicore__ inline void SetIntraAId(uint8_t intraId) + {} + + __aicore__ inline void SetIntraBId(uint8_t intraId) + {} + + __aicore__ inline void WaitL1Ready() + {} + + __aicore__ inline void SetL1FinishedUse() + {} + + __aicore__ inline void End() + {} +}; + +template +class MatmulCrossCoreSync::IsSupportUBToL1Singleshape()>> +{ + MATMUL_USE_MODULE(MatmulSubBlockInfo); +public: + __aicore__ inline void SetIntraAId(uint8_t intraId) + { + intraAId_ = intraId; + } + + __aicore__ inline void SetIntraBId(uint8_t intraId) + { + intraBId_ = intraId; + } + + __aicore__ inline void WaitL1Ready() + { + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::UB || GetPhyType(B_TYPE::pos) == Hardware::UB || GetPhyType(BIAS_TYPE::pos) == Hardware::UB) { + if (needWaitIntra_) { + CrossCoreWaitFlag(MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() * CROSS_CORE_SYNC_FACTOR); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreWaitFlag(CROSS_CORE_SYNC_FACTOR); + } + } + } + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::L1 && GetPhyType(A_TYPE::srcPos) == Hardware::UB) { + if (needWaitIntra_) { + CrossCoreWaitFlag(intraAId_ + MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() * CROSS_CORE_SYNC_FACTOR); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreWaitFlag(intraAId_ + CROSS_CORE_SYNC_FACTOR); + } + intraAIdBefore_ = intraAId_; + } + } + if constexpr (GetPhyType(B_TYPE::pos) == Hardware::L1 && GetPhyType(B_TYPE::srcPos) == Hardware::UB) { + if (needWaitIntra_) { + CrossCoreWaitFlag(intraBId_ + MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() * CROSS_CORE_SYNC_FACTOR); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreWaitFlag(intraBId_ + CROSS_CORE_SYNC_FACTOR); + } + intraBIdBefore_ = intraBId_; + } + } + needWaitIntra_ = false; + } + + __aicore__ inline void SetL1FinishedUse() + { + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::L1 && GetPhyType(A_TYPE::srcPos) == Hardware::UB) { + if (needSetIntra_) { + CrossCoreSetFlag(intraAId_ + MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() * CROSS_CORE_SYNC_FACTOR); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreSetFlag(intraAId_ + CROSS_CORE_SYNC_FACTOR); + } + } + } + if constexpr (GetPhyType(B_TYPE::pos) == Hardware::L1 && GetPhyType(B_TYPE::srcPos) == Hardware::UB) { + if (needSetIntra_) { + CrossCoreSetFlag(intraBId_ + MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() * CROSS_CORE_SYNC_FACTOR); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreSetFlag(intraBId_ + CROSS_CORE_SYNC_FACTOR); + } + } + } + needSetIntra_ = 
false; + } + + __aicore__ inline void End() + { + SetL1FinishedUse(); + needWaitIntra_ = true; + needSetIntra_ = true; + } + +private: + uint8_t intraAIdBefore_ = 0; + uint8_t intraBIdBefore_ = 0; + bool needWaitIntra_ = true; + bool needSetIntra_ = true; + uint8_t intraAId_ = 0; + uint8_t intraBId_ = 0; +}; + +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_CROSS_CORE_SYNC_H -- Gitee From 8db5ccccd755f75b7b49b15de3ffe03afea34eb5 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:34:57 +0000 Subject: [PATCH 06/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/param/matmul_shape_info.h | 59 +++++++++++++++++++++++-- impl/matmul/param/matmul_shape_tiling.h | 43 +++++++++++++++--- 2 files changed, 93 insertions(+), 9 deletions(-) diff --git a/impl/matmul/param/matmul_shape_info.h b/impl/matmul/param/matmul_shape_info.h index ad7f652e..683e9066 100644 --- a/impl/matmul/param/matmul_shape_info.h +++ b/impl/matmul/param/matmul_shape_info.h @@ -23,7 +23,7 @@ namespace Impl { namespace Detail { template class MatmulShapeInfoBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using SrcT = typename A_TYPE::T; MATMUL_USE_MODULE(MatmulShapeTiling); public: @@ -241,7 +241,7 @@ protected: template class MatmulShapeInfo : public MatmulShapeInfoBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using SrcT = typename A_TYPE::T; MATMUL_USE_MODULE(MatmulShapeTiling); public: @@ -251,7 +251,7 @@ public: template class MatmulShapeInfo>> : public MatmulShapeInfoBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using SrcT = typename A_TYPE::T; MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(MatmulSubBlockInfo); @@ -437,6 +437,59 @@ private: IntraBlock intraBlock; }; + +template +class MatmulShapeInfo::value>> + : public MatmulShapeInfoBase { + using L0cT = typename GetMmDstType::Type; + using SrcT = typename A_TYPE::T; + MATMUL_USE_MODULE(MatmulShapeTiling); +public: + using BASE_MODULE = MatmulShapeInfoBase; + + __aicore__ inline MatmulShapeInfo() = default; + __aicore__ inline ~MatmulShapeInfo() = default; + + __aicore__ inline void SetTransposeScaleA(bool isTransposeScaleA = false) + { + isTransposeScaleA_ = isTransposeScaleA; + } + + __aicore__ inline void SetTransposeScaleB(bool isTransposeScaleB = false) + { + isTransposeScaleB_ = isTransposeScaleB; + } + + __aicore__ inline void InitParams() + { + BASE_MODULE::isTransposeA_ = false; + BASE_MODULE::isTransposeB_ = false; + SetTransposeScaleA(false); + SetTransposeScaleB(true); + + const auto& tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling(); + BASE_MODULE::SetOrgShape(tiling.GetM(), tiling.GetN(), tiling.GetKa(), tiling.GetKb(), tiling.GetN()); + BASE_MODULE::SetSingleShape(tiling.GetSingleCoreM(), tiling.GetSingleCoreN(), tiling.GetSingleCoreK()); + + MATMUL_MODULE(MatmulShapeTiling)->template CheckTiling(); + } + + template + __aicore__ inline bool IsTransposeScaleA() const + { + return isTransposeScaleA_; + } + + template + __aicore__ inline bool IsTransposeScaleB() const + { + return isTransposeScaleB_; + } + +private: + bool isTransposeScaleA_ { false }; + bool isTransposeScaleB_ { true }; +}; } // namespace Detail } // namespace Impl } // namespace AscendC diff --git a/impl/matmul/param/matmul_shape_tiling.h b/impl/matmul/param/matmul_shape_tiling.h index 71fabae1..2f079b41 100644 --- 
a/impl/matmul/param/matmul_shape_tiling.h +++ b/impl/matmul/param/matmul_shape_tiling.h @@ -37,16 +37,21 @@ public: return tiling_; } - template + template __aicore__ inline void CheckTiling() { #ifdef ASCENDC_CPU_DEBUG NumericalValidCheck(); ShareInfoCheck(); - ShapeVaildCheck(); - DepthCheck(); - ConfigCommonCheck(); - ConfigSpecificCheck(); + if constexpr (!HasScalePosition::value && !HasScalePosition::value) { + ShapeVaildCheck(); + DepthCheck(); + ConfigCommonCheck(); + ConfigSpecificCheck(); + } else { + MxShapeVaildCheck(); + DepthCheck(); + } #endif } @@ -106,7 +111,7 @@ private: }); } - template + template __aicore__ inline void ShapeVaildCheck() { ASCENDC_ASSERT((tiling_.GetBaseM() * tiling_.GetBaseK() * sizeof(SrcT) <= L0ASize_), { @@ -152,6 +157,32 @@ private: } } + template + __aicore__ inline void MxShapeVaildCheck() + { +#if defined(__DAV_C310__) + // A + scaleA = 220 -- Gitee From 899343c39208be076fd78fb8dd029388638ecc60 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:35:41 +0000 Subject: [PATCH 07/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/param/matmul_tensor_info.h | 104 ++++++++++++++++++++- impl/matmul/param/matmul_usr_define_info.h | 18 ++-- 2 files changed, 110 insertions(+), 12 deletions(-) diff --git a/impl/matmul/param/matmul_tensor_info.h b/impl/matmul/param/matmul_tensor_info.h index 980f9ace..33cbd624 100644 --- a/impl/matmul/param/matmul_tensor_info.h +++ b/impl/matmul/param/matmul_tensor_info.h @@ -132,7 +132,7 @@ private: }; template -class MatmulTensorInfo>> { +class MatmulTensorInfo && !HasScalePosition::value>> { using SrcT = typename INPUT_TYPE::T; MATMUL_USE_MODULE(MatmulShapeInfo); @@ -186,7 +186,7 @@ constexpr bool IsSparseMatmul = (INPUT_TYPE::TAG == InputTypeTag::B) && HasSparseIndex() && DoMatmulMDL(MM_CFG); template -class MatmulTensorInfo>> { +class MatmulTensorInfo && !HasScalePosition::value>> { using SrcT = typename INPUT_TYPE::T; MATMUL_USE_MODULE(MatmulShapeInfo); @@ -257,7 +257,105 @@ private: GlobalTensor indexGlobal_; LocalTensor indexLocal_; }; + +template +class MatmulTensorInfo::value && + (INPUT_TYPE::TAG == InputTypeTag::scaleA || INPUT_TYPE::TAG == InputTypeTag::scaleB)>> { + using SrcT = float8_e8m0_t; + + MATMUL_USE_MODULE(MatmulShapeInfo); +public: + __aicore__ inline MatmulTensorInfo() = default; + __aicore__ inline ~MatmulTensorInfo() = default; + + __aicore__ inline LocalTensor GetLocalTensor() const + { + LocalTensor localMatrix; + localMatrix.SetAddr(localMatrix_.address_); + return localMatrix; + } + + template + __aicore__ inline GlobalTensor GetGlobalTensor() const + { + GlobalTensor globalMatrix; + globalMatrix.SetGlobalBuffer(globalMatrix_); + return globalMatrix; + } + + template + __aicore__ inline void SetGlobalTensor(const GlobalTensor& globalMatrix, bool isTranspose) + { + globalMatrix_ = globalMatrix.address_; + if constexpr (INPUT_TYPE::TAG == InputTypeTag::scaleA) { + CheckMatrixScaleA(isTranspose); + MATMUL_MODULE(MatmulShapeInfo)->SetTransposeScaleA(isTranspose); + } else { + CheckMatrixScaleB(isTranspose); + MATMUL_MODULE(MatmulShapeInfo)->SetTransposeScaleB(isTranspose); + } + } + + __aicore__ inline void SetLocalTensor(const LocalTensor& localMatrix, bool isTranspose) + { + localMatrix_.address_ = localMatrix.address_; + if constexpr (INPUT_TYPE::TAG == InputTypeTag::scaleA) { + CheckMatrixScaleA(isTranspose); + CheckMatrixScaleAFromLocalMemory(); + MATMUL_MODULE(MatmulShapeInfo)->SetTransposeScaleA(isTranspose); + } else { + CheckMatrixScaleB(isTranspose); + 
CheckMatrixScaleBFromLocalMemory(); + MATMUL_MODULE(MatmulShapeInfo)->SetTransposeScaleB(isTranspose); + } + } + + template + __aicore__ inline int GetBaseUseHeight() const + { + if constexpr (!isTrans) { + return MATMUL_CONST_PARAM_VAR.baseUseM_; + } else { + return MATMUL_CONST_PARAM_VAR.baseUseK_; + } + } + +private: + __aicore__ inline void CheckMatrixScaleA(bool isTransposeScaleA) + { + ASCENDC_ASSERT((isTransposeScaleA <= INPUT_TYPE::isTrans), { + KERNEL_LOG(KERNEL_ERROR, "It is not allowed to set matrix scaleA transpose when matmul scaleA transpose is not defined."); + }); +#if defined(__DAV_C310__) + if constexpr (IsSameType::value) { + ASCENDC_ASSERT(!isTransposeScaleA, { KERNEL_LOG(KERNEL_ERROR, + "When matrix scaleA DType is float8_e8m0_t, matrix scaleA should not be transposed");}); + } +#endif + } + + __aicore__ inline void CheckMatrixScaleAFromLocalMemory() {} + + __aicore__ inline void CheckMatrixScaleB(bool isTransposeScaleB) + { + ASCENDC_ASSERT((isTransposeScaleB <= INPUT_TYPE::isTrans), { + KERNEL_LOG(KERNEL_ERROR, "It is not allowed to set matrix scaleB transpose when matmul scaleB transpose is not defined."); + }); +#if defined(__DAV_C310__) + if constexpr (IsSameType::value) { + ASCENDC_ASSERT(isTransposeScaleB, { KERNEL_LOG(KERNEL_ERROR, + "When matrix scaleB DType is float8_e8m0_t, matrix scaleB should be transposed");}); + } +#endif + } + + __aicore__ inline void CheckMatrixScaleBFromLocalMemory() {} + + LocalTensor> localMatrix_; + __gm__ SrcT* globalMatrix_; +}; + } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // IMPL_MATMUL_PARAM_MATMUL_TENSOR_INFO_H +#endif // IMPL_MATMUL_PARAM_MATMUL_TENSOR_INFO_H \ No newline at end of file diff --git a/impl/matmul/param/matmul_usr_define_info.h b/impl/matmul/param/matmul_usr_define_info.h index c846649a..9472615c 100644 --- a/impl/matmul/param/matmul_usr_define_info.h +++ b/impl/matmul/param/matmul_usr_define_info.h @@ -18,31 +18,31 @@ namespace AscendC { namespace Impl { namespace Detail { -template +template class MatmulUserDefineInfo { public: - __aicore__ inline void SetSelfDefineData(uint64_t dataPtr) + __aicore__ inline void SetSelfDefineData(USER_DEF_DATA_TYPE dataPtr) {} __aicore__ inline void SetUserDefineInfo(uint64_t tilingPtr) {} - __aicore__ inline uint64_t GetSelfDefineData() const + __aicore__ inline USER_DEF_DATA_TYPE GetSelfDefineData() const {} __aicore__ inline uint64_t GetUserDefineInfo() const {} private: - uint64_t dataPtr_; + USER_DEF_DATA_TYPE dataPtr_; uint64_t tilingPtr_; }; -template -class MatmulUserDefineInfo +class MatmulUserDefineInfo::IsSupportUserDefine()>> { public: - __aicore__ inline void SetSelfDefineData(uint64_t dataPtr) + __aicore__ inline void SetSelfDefineData(USER_DEF_DATA_TYPE dataPtr) { dataPtr_ = dataPtr; } @@ -52,7 +52,7 @@ public: tilingPtr_ = tilingPtr; } - __aicore__ inline uint64_t GetSelfDefineData() const + __aicore__ inline USER_DEF_DATA_TYPE GetSelfDefineData() const { return dataPtr_; } @@ -63,7 +63,7 @@ public: } private: - uint64_t dataPtr_; + USER_DEF_DATA_TYPE dataPtr_; uint64_t tilingPtr_; }; } // namespace Detail -- Gitee From fe57f48fcca974e835a1e0864f08b36fdcb849bc Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:36:26 +0000 Subject: [PATCH 08/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/policy/matmul_policy.h | 54 ++++++++++++++++++--- impl/matmul/policy/matmul_private_modules.h | 43 +++++++++++----- 2 files changed, 78 insertions(+), 19 deletions(-) diff --git 
a/impl/matmul/policy/matmul_policy.h b/impl/matmul/policy/matmul_policy.h index 34dec5d0..8b26e9a1 100644 --- a/impl/matmul/policy/matmul_policy.h +++ b/impl/matmul/policy/matmul_policy.h @@ -1,5 +1,5 @@ /** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ #define IMPL_MATMUL_POLICY_MATMUL_POLICY_H #include "../context/context.h" -#include "../feature_trait/matmul_feature_trait.h" +#include "../utils/matmul_utils.h" #include "../resource/cube_in_buffer/cube_in_buffer.h" #include "../resource/cube_out_buffer/cube_out_buffer.h" #include "../scheduler/bias/bias_scheduler.h" @@ -36,17 +36,33 @@ struct MatmulPolicy { public: constexpr static PolicyType POLICY_TYPE = PolicyType::MATMUL_DEFAULT; - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using Context = MatmulContext; using CubeOutBuffer = AscendC::Impl::Detail::CubeOutBuffer; using CopyCubeOut = AscendC::Impl::Detail::CopyCubeOut; + + using TransT_B = decltype(GetTransBDataType()); using CopyCubeInA = AscendC::Impl::Detail::CopyCubeIn, MM_CFG>; - using CopyCubeInB = CopyCubeIn, MM_CFG>; - using CubeInBufferA = CubeInBuffer, MM_CFG>; - using CubeInBufferB = CubeInBuffer, MM_CFG>; - using Scheduler = MatmulScheduler; + using CopyCubeInB = AscendC::Impl::Detail::CopyCubeIn, MM_CFG>; + using CubeInBufferA = AscendC::Impl::Detail::CubeInBuffer, MM_CFG>; + using CubeInBufferB = AscendC::Impl::Detail::CubeInBuffer, MM_CFG>; + + using Scheduler = AscendC::Impl::Detail::MatmulScheduler; using BatchScheduler = AscendC::Impl::Detail::BatchScheduler; using BiasScheduler = AscendC::Impl::Detail::BiasScheduler; + + using UserDefDataType = uint64_t; +}; + +template +struct MatmulWithScalePolicy : public MatmulPolicy +{ +public: + // for scale + using CubeInBufferScaleA = AscendC::Impl::Detail::CubeInBuffer, MM_CFG>; + using CopyCubeInScaleA = AscendC::Impl::Detail::CopyCubeIn, MM_CFG>; + using CubeInBufferScaleB = AscendC::Impl::Detail::CubeInBuffer, MM_CFG>; + using CopyCubeInScaleB = AscendC::Impl::Detail::CopyCubeIn, MM_CFG>; }; /* @@ -88,6 +104,30 @@ public: constexpr static PolicyType POLICY_TYPE = PolicyType::MATMUL_LOWER_TRIANGULAR; using Scheduler = MatmulScheduler; }; + +/* + SplitMMatmulPolicy is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + SplitMMatmulPolicy is only for internal usage, does not support extension or customized specialization! +*/ +template +struct SplitMMatmulPolicy : public MatmulPolicy +{ +public: + using CopyCubeOut = AscendC::Impl::Detail::CopyCubeOut; +}; + +/* + SplitNMatmulPolicy is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + SplitNMatmulPolicy is only for internal usage, does not support extension or customized specialization! 
+*/ +template +struct SplitNMatmulPolicy : public MatmulPolicy +{ +public: + using CopyCubeOut = AscendC::Impl::Detail::CopyCubeOut; +}; } // namespace Detail } // namespace Impl } // namespace AscendC diff --git a/impl/matmul/policy/matmul_private_modules.h b/impl/matmul/policy/matmul_private_modules.h index c52cc851..8f245b58 100644 --- a/impl/matmul/policy/matmul_private_modules.h +++ b/impl/matmul/policy/matmul_private_modules.h @@ -15,6 +15,7 @@ #ifndef IMPL_MATMUL_POLICY_MATMUL_PRIVATE_MODULES_H #define IMPL_MATMUL_POLICY_MATMUL_PRIVATE_MODULES_H +#include "../param/matmul_cross_core_sync.h" #include "../param/matmul_shape_info.h" #include "../param/matmul_shape_tiling.h" #include "../param/matmul_subblock_info.h" @@ -48,38 +49,56 @@ namespace AscendC { namespace Impl { namespace Detail { -template +template struct MatmulPrivateModules { + using TRANS_B_TYPE = decltype(GetTransBDataType()); using CopyCubeInParamsA = CopyCubeInParams>; - using CopyCubeInParamsB = CopyCubeInParams>; + using CopyCubeInParamsB = CopyCubeInParams>; using MatmulTensorInfoA = MatmulTensorInfo>; - using MatmulTensorInfoB = MatmulTensorInfo>; + using MatmulTensorInfoB = MatmulTensorInfo>; using MatmulSubBlockInfo = AscendC::Impl::Detail::MatmulSubBlockInfo; using MatmulShapeTiling = AscendC::Impl::Detail::MatmulShapeTiling; + using MatmulCrossCoreSync = AscendC::Impl::Detail::MatmulCrossCoreSync; using DataCopyUtilsA = CopyTileToCubeWrapper>; - using DataCopyUtilsB = CopyTileToCubeWrapper>; + using DataCopyUtilsB = CopyTileToCubeWrapper>; using DataCopyWrapperA = DataCopyWrapper>; - using DataCopyWrapperB = DataCopyWrapper>; + using DataCopyWrapperB = DataCopyWrapper>; using BatchCopyCubeInParamsA = BatchCopyCubeInParams>; - using BatchCopyCubeInParamsB = BatchCopyCubeInParams>; + using BatchCopyCubeInParamsB = BatchCopyCubeInParams>; using BatchCopyCubeInA = BatchCopyCubeIn>; - using BatchCopyCubeInB = BatchCopyCubeIn>; + using BatchCopyCubeInB = BatchCopyCubeIn>; + + using CopyCubeInParamsScaleA = CopyCubeInParams>; + using CopyCubeInParamsScaleB = CopyCubeInParams>; + using MatmulTensorInfoScaleA = MatmulTensorInfo>; + using MatmulTensorInfoScaleB = MatmulTensorInfo>; + using DataCopyUtilsScaleA = CopyTileToCubeWrapper>; + using DataCopyUtilsScaleB = CopyTileToCubeWrapper>; + using DataCopyWrapperScaleA = DataCopyWrapper>; + using DataCopyWrapperScaleB = DataCopyWrapper>; + using LocalWorkspace = MatmulLocalWorkspace; using MatmulShapeInfo = AscendC::Impl::Detail::MatmulShapeInfo; using MatmulQuantProcessor = AscendC::Impl::Detail::MatmulQuantProcessor; using MatmulAntiQuantProcessor = AscendC::Impl::Detail::MatmulAntiQuantProcessor; - using MatmulUserDefineInfo = AscendC::Impl::Detail::MatmulUserDefineInfo; + + using UserDefDataTypeFromPolicy = typename MATMUL_POLICY_::UserDefDataType; + using UserDefDataType = typename Conditional::IsSupportUBToL1Singleshape(), UserDefDataTypeFromPolicy, uint64_t>::type; + using MatmulUserDefineInfo = AscendC::Impl::Detail::MatmulUserDefineInfo; + using MatmulUnitFlag = AscendC::Impl::Detail::MatmulUnitFlag; using BatchLoop = AscendC::Impl::Detail::BatchLoop, MM_CFG>; using CopyCubeOutUtils = AscendC::Impl::Detail::CopyCubeOutWrapper; using PartialOutUtils = AscendC::Impl::Detail::PartialOutUtils; // using compute modules - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; + using L0aT = typename Conditional::value, typename GetL0DataType::Type, typename GetL0DataType::Type>::type; + using L0bT = typename Conditional::value, typename 
GetL0DataType::Type, typename GetL0DataType::Type>::type; using LoadToA2 = LoadToL0A; - using LoadToB2 = LoadToL0B, MM_CFG>; + using LoadToB2 = LoadToL0B, MM_CFG>; using TBufPoolL0 = AscendC::Impl::Detail::TBufPoolL0; - using MmadCompute = AscendC::Impl::Detail::MmadCompute; + using MmadCompute = AscendC::Impl::Detail::MmadCompute; // using Bias modules using CopyBiasIn = AscendC::Impl::Detail::CopyBiasIn; @@ -95,4 +114,4 @@ struct MatmulPrivateModules { } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // _MATMUL_PRIVATE_MODULES_H_ \ No newline at end of file +#endif // _MATMUL_PRIVATE_MODULES_H_ -- Gitee From d9d115ba22ea0abde9528438f16f0d484a433d49 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:38:21 +0000 Subject: [PATCH 09/56] add Signed-off-by: jiangchengcheng-on --- .../bias_buffer/c1_buffer/c1_buffer.h | 23 +++++++++++++++++++ .../bias_buffer/c1_buffer/c1_buffer_intf.h | 10 ++++++++ 2 files changed, 33 insertions(+) diff --git a/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer.h b/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer.h index 6ad00126..c8c8a195 100644 --- a/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer.h +++ b/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer.h @@ -68,8 +68,31 @@ public: qidBias_.FreeAllEvent(); } + __aicore__ inline uint64_t GetBufferHeadAddr() + { +#if defined(__DAV_C310__) + return GetTQueHeadAddr(qidBias_); +#else + return 0; +#endif + } + private: + +#if defined(__DAV_C310__) + static constexpr TQueConfig staticL1Evt = { .nd2nz = false, + .nz2nd = false, + .scmBlockGroup = false, + .bufferLen = 0, + .bufferNumber = 1, + .consumerSize = 0, + .consumer = {}, + .enableStaticEvtId = true }; + TQue qidBias_; +#else TQue qidBias_; +#endif + }; } // namespace Detail diff --git a/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer_intf.h b/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer_intf.h index 1a6bbee3..a65d5c26 100644 --- a/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer_intf.h +++ b/impl/matmul/resource/bias_buffer/c1_buffer/c1_buffer_intf.h @@ -76,6 +76,16 @@ public: * @return: void */ __aicore__ inline void Destroy() {} + + /** + * @description: Get buffer head address + * @param: void + * @return: Buffer head address + */ + __aicore__ inline uint64_t GetBufferHeadAddr() + { + return 0; + } }; } // namespace Detail -- Gitee From 855a74bacb74da3279b6421033ed69b05ac3da28 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:40:26 +0000 Subject: [PATCH 10/56] add Signed-off-by: jiangchengcheng-on --- .../resource/cube_in_buffer/cube_in_buffer.h | 3 +++ .../cube_in_buffer_double_buffer.h | 27 ++++++++++++++----- .../cube_in_buffer_double_buffer_sparse.h | 2 +- .../cube_in_buffer_double_global_buffer.h | 2 +- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer.h index eb75ed42..d27f12ca 100644 --- a/impl/matmul/resource/cube_in_buffer/cube_in_buffer.h +++ b/impl/matmul/resource/cube_in_buffer/cube_in_buffer.h @@ -23,5 +23,8 @@ #include "cube_in_buffer_double_buffer_sparse.h" #include "cube_in_buffer_n_buffer.h" #endif +#if defined(__DAV_C310__) +#include "cube_in_buffer_n_buffer.h" +#endif #endif // _CUBE_IN_BUFFER_H_ \ No newline at end of file diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer.h index 6e75f064..003d1297 100644 --- 
a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer.h +++ b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer.h @@ -27,8 +27,9 @@ namespace Detail { */ template class CubeInBuffer() == CubeInBufferType::DOUBLE_BUFFER && - !(POLICY_TYPE == PolicyType::MATMUL_NBUFFER_33 && INPUT_TYPE::TAG == InputTypeTag::A)>> + (GetCubeInBufferType() == CubeInBufferType::DOUBLE_BUFFER && + !(POLICY_TYPE == PolicyType::MATMUL_NBUFFER_33 && INPUT_TYPE::TAG == InputTypeTag::A)) || + GetCubeInBufferType() == CubeInBufferType::DOUBLE_BUFFER_MX>> { MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(MatmulShapeInfo); @@ -42,7 +43,11 @@ public: int32_t stepSize = GetTotalCacheNum(); cacheFactor_ = (cacheNum / stepSize - 1) & 1; int32_t queDepth = cacheFactor_ == 0 ? SINGLE_QUE : DOUBLE_QUE; - GetTPipePtr()->InitBuffer(qid_, queDepth, matrixByteSize * stepSize + GetBankConflictSize()); + if constexpr(INPUT_TYPE::TAG == InputTypeTag::scaleA || INPUT_TYPE::TAG == InputTypeTag::scaleB) { + GetTPipePtr()->InitBuffer(qid_, queDepth, matrixByteSize * stepSize); + } else { + GetTPipePtr()->InitBuffer(qid_, queDepth, matrixByteSize * stepSize + GetBankConflictSize()); + } #if __CCE_AICORE__ == 200 if (IsFromUB()) { eventIDMte3ToMte1_ = static_cast(GetTPipePtr()->AllocEventID()); @@ -142,10 +147,20 @@ public: private: __aicore__ inline int32_t GetTotalCacheNum() { - if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { - return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM(); + constexpr bool isScaleTag = (INPUT_TYPE::TAG == InputTypeTag::scaleA || + INPUT_TYPE::TAG == InputTypeTag::scaleB); + if constexpr (isScaleTag) { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::scaleA) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetScaleFactorA(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetScaleFactorB(); + } } else { - return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); + } } } diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer_sparse.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer_sparse.h index f16312e9..df8ec0f4 100644 --- a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer_sparse.h +++ b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_buffer_sparse.h @@ -200,4 +200,4 @@ private: } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // _CUBE_IN_BUFFER_DOUBLE_BUFFER_SPARSE_H_ \ No newline at end of file +#endif // _CUBE_IN_BUFFER_DOUBLE_BUFFER_SPARSE_H_ diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h index a3ba9cf6..9233487a 100644 --- a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h +++ 
b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h @@ -156,4 +156,4 @@ private: } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // _CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H_ \ No newline at end of file +#endif // _CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H_ -- Gitee From 6e0936e7e5810e92840519641bdd05e577caecf9 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:41:18 +0000 Subject: [PATCH 11/56] add Signed-off-by: jiangchengcheng-on --- .../resource/cube_in_buffer/cube_in_buffer_intf.h | 10 ++++++++++ .../resource/cube_in_buffer/cube_in_buffer_normal.h | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_intf.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_intf.h index a78c4305..1f644235 100644 --- a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_intf.h +++ b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_intf.h @@ -114,6 +114,16 @@ public: * @return: void */ __aicore__ inline void DeQue() {} + + /** + * @description: Get buffer head address + * @param: void + * @return: Buffer head address + */ + __aicore__ inline uint64_t GetBufferHeadAddr() + { + return 0; + } }; } // namespace Detail diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_normal.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_normal.h index 43ad23a2..a5c4123c 100644 --- a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_normal.h +++ b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_normal.h @@ -27,7 +27,8 @@ namespace Detail { */ template class CubeInBuffer() == CubeInBufferType::NORMAL>> { +(GetCubeInBufferType() == CubeInBufferType::NORMAL || +GetCubeInBufferType() == CubeInBufferType::NORMAL_MX)>> { MATMUL_USE_MODULE(MatmulShapeInfo); MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(KLoop); -- Gitee From 1126f600f36b6eaeaed917db636911be0d942304 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:41:54 +0000 Subject: [PATCH 12/56] add Signed-off-by: jiangchengcheng-on --- .../cube_in_buffer/cube_in_buffer_single_buffer.h | 10 ++++++++++ .../resource/cube_in_buffer/cube_in_buffer_utils.h | 10 +++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_single_buffer.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_single_buffer.h index e55e9302..68dd66dd 100644 --- a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_single_buffer.h +++ b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_single_buffer.h @@ -92,6 +92,16 @@ public: (void) qid_.DeQue(); } + __aicore__ inline uint64_t GetBufferHeadAddr() + { +// wait for GetTQueHeadAddr +#if defined(__DAV_C310__) + return GetTQueHeadAddr(qid_); +#else + return 0; +#endif + } + private: typename CubeInQueType::QUE qid_; LocalTensor cacheHead_; diff --git a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_utils.h b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_utils.h index 5075e03c..7a7fcade 100644 --- a/impl/matmul/resource/cube_in_buffer/cube_in_buffer_utils.h +++ b/impl/matmul/resource/cube_in_buffer/cube_in_buffer_utils.h @@ -43,6 +43,8 @@ enum class CubeInBufferType : uint8_t { SINGLE_GLOBAL_BUFFER, DOUBLE_GLOBAL_BUFFER, DOUBLE_BUFFER_SPARSE, + NORMAL_MX, + DOUBLE_BUFFER_MX, }; template @@ -67,8 +69,10 @@ __aicore__ inline constexpr bool IsSetNoDB() template __aicore__ inline constexpr CubeInBufferType GetCubeInBufferType() { - if constexpr (PhyPosIsL1(INPUT_TYPE::pos)) { + if constexpr 
(InputPhyPosIsL1()) { return CubeInBufferType::NONE; + } else if constexpr (MatmulFeatureTrait::IsSupportUBToL1Singleshape() && InputPhyPosIsUB()) { + return CubeInBufferType::SINGLE_BUFFER; } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) { if constexpr (IsSetDoubleGlobalQue()) { return CubeInBufferType::DOUBLE_GLOBAL_BUFFER; @@ -80,12 +84,16 @@ __aicore__ inline constexpr CubeInBufferType GetCubeInBufferType() } else if constexpr (DoMatmulNorm(MM_CFG)) { if constexpr (IsSetNoDB()) { return CubeInBufferType::SINGLE_BUFFER; + } else if (IsScaleTag()) { + return CubeInBufferType::NORMAL_MX; } else { return CubeInBufferType::NORMAL; } } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { if constexpr (HasSparseIndex()) { return CubeInBufferType::DOUBLE_BUFFER_SPARSE; + } else if (IsScaleTag()) { + return CubeInBufferType::DOUBLE_BUFFER_MX; } else { return CubeInBufferType::DOUBLE_BUFFER; } -- Gitee From 222d6df475084019d6c211c80a497307dc11b814 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:44:00 +0000 Subject: [PATCH 13/56] add Signed-off-by: jiangchengcheng-on --- .../cube_out_buffer/cube_out_buffer.h | 1 + .../cube_out_buffer_co1_shared.h | 92 +++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 impl/matmul/resource/cube_out_buffer/cube_out_buffer_co1_shared.h diff --git a/impl/matmul/resource/cube_out_buffer/cube_out_buffer.h b/impl/matmul/resource/cube_out_buffer/cube_out_buffer.h index 39d0ccaa..188924d1 100644 --- a/impl/matmul/resource/cube_out_buffer/cube_out_buffer.h +++ b/impl/matmul/resource/cube_out_buffer/cube_out_buffer.h @@ -16,6 +16,7 @@ #include "cube_out_buffer_no_unit_flag.h" #include "cube_out_buffer_unit_flag.h" +#include "cube_out_buffer_co1_shared.h" #include "cube_out_buffer_extend.h" #endif // IMPL_MATMUL_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_H \ No newline at end of file diff --git a/impl/matmul/resource/cube_out_buffer/cube_out_buffer_co1_shared.h b/impl/matmul/resource/cube_out_buffer/cube_out_buffer_co1_shared.h new file mode 100644 index 00000000..7723b6a1 --- /dev/null +++ b/impl/matmul/resource/cube_out_buffer/cube_out_buffer_co1_shared.h @@ -0,0 +1,92 @@ +/** +* Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! +* \file cube_out_buffer_co1_shared.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_CO1_SHARED_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_CO1_SHARED_H + +#include "cube_out_buffer_base.h" +#include "../../utils/matmul_utils.h" +#include "lib/matmul/tiling.h" +#include "../../feature_trait/matmul_feature_trait.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +/* + CubeOutBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeOutBuffer is only for internal usage, does not support extension or customized specialization! 
+*/ +template +class CubeOutBuffer> +{ +public: + __aicore__ inline CubeOutBuffer() {}; + __aicore__ inline ~CubeOutBuffer() {}; + __aicore__ inline void Init(int32_t cacheSize = 1, uint32_t lenFactor = 1) { + GetTPipePtr()->InitBuffer(*static_cast(gCO1Que), L0C_BUF_BLOCK_NUM, L0C_BUF_BLOCK_LEN); + }; + + __aicore__ inline LocalTensor AllocTensor() + { + cMatrix_ = (static_cast(gCO1Que))->AllocTensor(Ceil(ToMatmulConfig(MM_CFG).sharedCO1BufferSize, L0C_BUF_BLOCK_LEN)); + if constexpr (ToMatmulConfig(MM_CFG).iterateMode != IterateMode::ITERATE_MODE_ALL) { + co1UsedList[co1UsedNumber++] = cMatrix_.GetBufferHandle(); + } + return cMatrix_; + } + + __aicore__ inline LocalTensor GetTensor() + { + if constexpr (ToMatmulConfig(MM_CFG).iterateMode != IterateMode::ITERATE_MODE_ALL) { + TBuffAddr addr = (static_cast(gCO1Que))->GetBufferAddr(co1UsedList[0]); + cMatrix_.SetAddr(addr); + } + return cMatrix_; + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + { + (static_cast(gCO1Que))->EnQue(tensor); + } + + __aicore__ inline LocalTensor DeQue() + { + return (static_cast(gCO1Que))->DeQue(); + } + + __aicore__ inline void FreeTensor(LocalTensor &co1Local) + { + (static_cast(gCO1Que))->FreeTensor(co1Local); + if constexpr (ToMatmulConfig(MM_CFG).iterateMode != IterateMode::ITERATE_MODE_ALL) { + co1UsedNumber--; + for (int i = 0; i < co1UsedNumber; i++) { + co1UsedList[i] = co1UsedList[i + 1]; + } + } + } + + __aicore__ inline void Destroy() + {} + +private: + LocalTensor cMatrix_; + constexpr static int32_t MAX_CO1_BUFFER_NUM = 8; + TBufHandle co1UsedList[MAX_CO1_BUFFER_NUM]; + int co1UsedNumber = 0; +}; +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H \ No newline at end of file -- Gitee From 3bd4ef69c7cbd586f4d76839843af631eef0947a Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:50:19 +0000 Subject: [PATCH 14/56] del Signed-off-by: jiangchengcheng-on --- impl/matmul/resource/l0_buffer/tbuf_pool_l0.h | 2 +- impl/matmul/resource/l0_buffer/tbuf_pool_l0_base.h | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/impl/matmul/resource/l0_buffer/tbuf_pool_l0.h b/impl/matmul/resource/l0_buffer/tbuf_pool_l0.h index 903fa93d..c2e83040 100644 --- a/impl/matmul/resource/l0_buffer/tbuf_pool_l0.h +++ b/impl/matmul/resource/l0_buffer/tbuf_pool_l0.h @@ -20,4 +20,4 @@ #include "tbuf_pool_l0_cache.h" #include "tbuf_pool_l0_v200.h" -#endif // IMPL_MATMUL_RESOURCE_L0_BUFFER_TBUF_POOL_L0_H +#endif // IMPL_MATMUL_RESOURCE_L0_BUFFER_TBUF_POOL_L0_H \ No newline at end of file diff --git a/impl/matmul/resource/l0_buffer/tbuf_pool_l0_base.h b/impl/matmul/resource/l0_buffer/tbuf_pool_l0_base.h index 93837214..39613a74 100644 --- a/impl/matmul/resource/l0_buffer/tbuf_pool_l0_base.h +++ b/impl/matmul/resource/l0_buffer/tbuf_pool_l0_base.h @@ -37,6 +37,9 @@ public: bool isL0Db; if constexpr (NormInitScene && Impl::Detail::MatmulFeatureTrait().IsSupportMNL0DB()) { isL0Db = true; + } else if constexpr (IsBasic(MM_CFG) && NormInitScene && + Impl::Detail::MatmulFeatureTrait().IsSupportLoad2dV2()) { + isL0Db = false; } else { const auto& tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling(); isL0Db = (tiling.GetDbL0A() - 1) & (tiling.GetDbL0B() - 1); @@ -52,7 +55,7 @@ public: useL0PingPong_ = static_cast(isL0Db); } - template + template __aicore__ inline TBufPoolL0Base& Allocate() { if constexpr (!IS_INTRA_BLOCK) { WaitFlag(l0PingPongFlag_); @@ -60,14 +63,14 @@ public: 
return *this; } - template + template __aicore__ inline LocalTensor GetBuffer(uint8_t subIdx = 0) { LocalTensor tempTensor; if constexpr (Pos == TPosition::A2) { tempTensor = l0aBuf_.Get(); if (l0PingPongFlag_ != 0) { - if constexpr (IsSameType::value) { + if constexpr (IsSuppportB4()) { tempTensor = tempTensor[L0AUF_SIZE / sizeof(T)]; } else { tempTensor = tempTensor[L0AUF_SIZE / 2 / sizeof(T)]; @@ -81,7 +84,7 @@ public: } } else { if (l0PingPongFlag_ != 0) { - if constexpr (IsSameType::value) { + if constexpr (IsSuppportB4()) { tempTensor = tempTensor[L0BUF_SIZE / sizeof(T)]; } else { tempTensor = tempTensor[L0BUF_SIZE / 2 / sizeof(T)]; @@ -92,7 +95,7 @@ public: return tempTensor; } - template + template __aicore__ inline bool Hit(uint32_t pos = 0) { return false; } -- Gitee From a2adfb8b8704253f8d60ded4c67bdb8d9a03ff0c Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:52:32 +0000 Subject: [PATCH 15/56] add Signed-off-by: jiangchengcheng-on --- .../local_workspace/matmul_local_workspace.h | 1 - .../base/scheduler_mdl_partial_output.h | 164 ++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 impl/matmul/scheduler/base/scheduler_mdl_partial_output.h diff --git a/impl/matmul/resource/local_workspace/matmul_local_workspace.h b/impl/matmul/resource/local_workspace/matmul_local_workspace.h index b980f910..3cdc6327 100644 --- a/impl/matmul/resource/local_workspace/matmul_local_workspace.h +++ b/impl/matmul/resource/local_workspace/matmul_local_workspace.h @@ -323,7 +323,6 @@ private: int32_t nz2ndOffset_ = 0; int32_t co2Offset_ = 0; }; - } // namespace Detail } // namespace Impl } // namespace AscendC diff --git a/impl/matmul/scheduler/base/scheduler_mdl_partial_output.h b/impl/matmul/scheduler/base/scheduler_mdl_partial_output.h new file mode 100644 index 00000000..012619d3 --- /dev/null +++ b/impl/matmul/scheduler/base/scheduler_mdl_partial_output.h @@ -0,0 +1,164 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file scheduler_mdl_partial_output.h + * \brief partial output, only for aicore like 310 + */ +#ifndef IMPL_MATMUL_SCHEDULER_BASE_SCHEDULER_MDL_PARTIAL_OUTPUT_H +#define IMPL_MATMUL_SCHEDULER_BASE_SCHEDULER_MDL_PARTIAL_OUTPUT_H + +#include "scheduler_intf.h" +#include "scheduler_mdl_common.h" +namespace AscendC { +namespace Impl { +namespace Detail { + +/* + MatmulScheduler is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + MatmulScheduler is only for internal usage, does not support extension or customized specialization! 
+*/ +template +class MatmulScheduler::IsSupportL0CToUB() && ToMatmulConfig(MM_CFG).isPartialOutput)>> + : public MatmulMDLSchedulerCommon +{ + MATMUL_USE_MODULE(KLoop); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(LoadToA2); + MATMUL_USE_MODULE(LoadToB2); + MATMUL_USE_MODULE(TBufPoolL0); + MATMUL_USE_MODULE(BiasScheduler); + MATMUL_USE_MODULE(CubeOutBuffer); + MATMUL_USE_MODULE(CopyCubeOut); + MATMUL_USE_MODULE(Context); + + using TransAT = typename A_TYPE::T; + using TransBT = typename decltype(GetTransBDataType())::T; + using BiasT = typename BIAS_TYPE::T; + using DstT = typename C_TYPE::T; + using L0cT = typename GetMmDstType::Type; + +public: + using BASE_MODULE = + AscendC::Impl::Detail::MatmulMDLSchedulerCommon; + + __aicore__ inline bool ScheduleOnce(bool enPartialSum) + { + MATMUL_MODULE(BiasScheduler)->SetBias(MATMUL_MODULE(BiasScheduler)->IsBias() && !enPartialSum); + if (!MoveNext()) { + return false; + } + if (!enPartialSum) { + MATMUL_MODULE(CubeOutBuffer)->AllocTensor(); + } + PartialK(enPartialSum); + return true; + } + +private: + __aicore__ inline bool MoveNext() + { + if (unlikely(BASE_MODULE::isFirstIter_)) { + MATMUL_MODULE(KLoop)->InnerStart(); + return BASE_MODULE::MoveOnFirstIterate(); + } else { + if (MATMUL_MODULE(KLoop)->InnerNext()) { + return true; + } + + MATMUL_MODULE(KLoop)->InnerStart(); + if constexpr (ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::UNDEF) { + if (likely(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() == + static_cast(IterateOrder::ORDER_M))) { + return BASE_MODULE::MoveOnIterateOrderM(); + } else { + ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() == + static_cast(IterateOrder::ORDER_N)), { + KERNEL_LOG(KERNEL_ERROR, "iterateOrder is %d , which should be ORDER_N", + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder()); + }); + return BASE_MODULE::MoveOnIterateOrderN(); + } + } else if constexpr (ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_M) { + return BASE_MODULE::MoveOnIterateOrderM(); + } else { + return BASE_MODULE::MoveOnIterateOrderN(); + } + } + } + + __aicore__ inline void PartialK(bool enPartialSum) + { + PartialKMultiIter(enPartialSum); + } + + __aicore__ inline void PartialKMultiIter(bool enPartialSum) + { + // init split params for left and right matrix + SplitParams aL0Params = BASE_MODULE::InitSplitAParams(); + SplitParams bL0Params = BASE_MODULE::InitSplitBParams(); + bool isATranspose = MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA(); + bool isBTranspose = MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB(); + + // curKaOuterIdx and curKbOuterIdx are used to decide if left or right matrix need to clear its l1 buffer + int32_t curKaOuterIdx = MATMUL_MODULE(KLoop)->GetOuterKaIdx(); + int32_t curKbOuterIdx = MATMUL_MODULE(KLoop)->GetOuterKbIdx(); + + // CopyIn + LocalTensor a1; + LocalTensor b1; + BASE_MODULE::CopyIn(a1, b1); + LocalTensor bias = BASE_MODULE::SplitBias(bL0Params.axisL0Len); + Compute(a1, b1, bias, enPartialSum, isATranspose, isBTranspose, aL0Params, bL0Params); + + BASE_MODULE::DoPreloadAWait(); + BASE_MODULE::ClearL1BufferCache(curKaOuterIdx, curKbOuterIdx); + BASE_MODULE::ResetCopyInBuffer(); + } + + __aicore__ inline void Compute(const LocalTensor& a1, const LocalTensor& b1, + LocalTensor& bias, const bool enPartialSum, const bool isATranspose, const bool isBTranspose, + SplitParams& aL0Params, SplitParams& bL0Params) + { + // sL0CInit and sL0CLast are used for Split + bool sL0CInit 
= false; + bool sL0CLast = false; + BASE_MODULE::SplitPrepare(enPartialSum, isATranspose, isBTranspose, aL0Params, bL0Params, sL0CInit, sL0CLast); + // prepare for Split + int32_t kL1Stride = MATMUL_MODULE(KLoop)->GetBaseBlockShape() * BASE_MODULE::c0Size_; + aL0Params.kAxisL1Offset += MATMUL_MODULE(KLoop)->GetStepInnerIdx() * kL1Stride; + bL0Params.kAxisL1Offset += MATMUL_MODULE(KLoop)->GetStepInnerIdx() * kL1Stride; + ComputeKDB(a1, b1, aL0Params, bL0Params, isATranspose, isBTranspose, sL0CInit, sL0CLast); + } + + __aicore__ inline void ComputeKDB(const LocalTensor& a1, const LocalTensor& b1, + const SplitParams& aL0Params, const SplitParams& bL0Params, + const bool isATranspose, const bool isBTranspose, const bool sL0CInit, const bool sL0CLast) + { + MATMUL_MODULE(TBufPoolL0)->Allocate(); + LocalTensor a2 = BASE_MODULE::SplitA(a1, aL0Params, isATranspose); + LocalTensor b2 = BASE_MODULE::SplitB(b1, bL0Params, isBTranspose); + MATMUL_MODULE(TBufPoolL0)->EnQue(); + MATMUL_MODULE(TBufPoolL0)->DeQue(); + BASE_MODULE::CubeCompute(MATMUL_MODULE(CubeOutBuffer)->GetTensor(), a2, b2, aL0Params.axisL0Len, + bL0Params.axisL0Len, MATMUL_MODULE(KLoop)->GetBaseShape(), isATranspose, isBTranspose, sL0CInit, sL0CLast); + MATMUL_MODULE(TBufPoolL0)->Free(); + MATMUL_MODULE(BiasScheduler)->Free(); + } +}; +} // namespace Detail +} // namespace Impl +} // namespace AscendC + +#endif // IMPL_MATMUL_SCHEDULER_BASE_SCHEDULER_MDL_PARTIAL_OUTPUT_H \ No newline at end of file -- Gitee From c160ab1e5cd0d1232652b2b87ae141a8584623bd Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:54:28 +0000 Subject: [PATCH 16/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/scheduler/base/scheduler_base.h | 33 ++++--- .../scheduler/base/scheduler_intrablock.h | 2 +- impl/matmul/scheduler/base/scheduler_mdl.h | 3 +- .../scheduler/base/scheduler_mdl_base.h | 76 +++++++++------- .../scheduler/base/scheduler_mdl_common.h | 89 ++++++++++--------- impl/matmul/scheduler/base/scheduler_norm.h | 54 ++++++----- 6 files changed, 146 insertions(+), 111 deletions(-) diff --git a/impl/matmul/scheduler/base/scheduler_base.h b/impl/matmul/scheduler/base/scheduler_base.h index c843384b..32be63cf 100644 --- a/impl/matmul/scheduler/base/scheduler_base.h +++ b/impl/matmul/scheduler/base/scheduler_base.h @@ -50,9 +50,10 @@ public: MATMUL_USE_MODULE(MatmulQuantProcessor); MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulCrossCoreSync); using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using SrcT = typename A_TYPE::T; __aicore__ inline void Init(const TCubeTiling *__restrict cubeTiling, TPipe *tpipe) @@ -118,7 +119,9 @@ public: var.tpipe_->InitBuffer(var.qidB2_, 1, L0BSize_); #endif } - InitShareBufEnd(var.tpipe_); + if constexpr (!HasScalePosition::value) { + InitShareBufEnd(var.tpipe_); + } } __aicore__ inline void End() { @@ -132,6 +135,9 @@ public: MATMUL_MODULE(TBufPoolL0)->ResetCache(); MATMUL_MODULE(CubeOutBuffer)->Destroy(); MATMUL_MODULE(MatmulQuantProcessor)->Destroy(); + if constexpr (MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { + MATMUL_MODULE(MatmulCrossCoreSync)->End(); + } } __aicore__ inline int64_t GetL0cOffset() @@ -185,7 +191,8 @@ public: #endif protected: - __aicore__ inline void CheckSupportTrianMatmul() { + __aicore__ inline void CheckSupportTrianMatmul() + { if constexpr (!MatmulFeatureTrait::IsSupportTrianMatmul()) { ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, 
"Triangular Matmul is not supported on current device."); }); @@ -290,21 +297,23 @@ protected: #else __aicore__ inline void SetAtomic(uint8_t enAtomic) { - if (enAtomic == ATOMIC_ADD) { - SetAtomicAdd(); - } else if (enAtomic == ATOMIC_MAX) { - SetAtomicMax(); - } else if (enAtomic == ATOMIC_MIN) { - SetAtomicMin(); + if constexpr (SupportType()) { + if (enAtomic == ATOMIC_ADD) { + SetAtomicAdd(); + } else if (enAtomic == ATOMIC_MAX) { + SetAtomicMax(); + } else if (enAtomic == ATOMIC_MIN) { + SetAtomicMin(); + } } } __aicore__ inline void GetResultImpl(const GlobalTensor& gm, uint8_t enAtomic, bool enSequentialWrite) { if constexpr (C_TYPE::format != CubeFormat::ND && C_TYPE::format != CubeFormat::ND_ALIGN && - C_TYPE::format != CubeFormat::NZ) { + C_TYPE::format != CubeFormat::NZ && C_TYPE::format != CubeFormat::COLUMN_MAJOR) { ASCENDC_ASSERT((false), { - KERNEL_LOG(KERNEL_ERROR, "Data format of C matrix should be ND, ND_ALIGN or NZ."); }); + KERNEL_LOG(KERNEL_ERROR, "Data format of C matrix should be ND, ND_ALIGN, COLUMN_MAJOR or NZ."); }); } // remove dependency conflicts only for scene which is not db auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor(); @@ -345,4 +354,4 @@ protected: } // namespace Detail } // namespace Impl } // namespace AscendC -#endif \ No newline at end of file +#endif diff --git a/impl/matmul/scheduler/base/scheduler_intrablock.h b/impl/matmul/scheduler/base/scheduler_intrablock.h index 930144b8..38f416aa 100644 --- a/impl/matmul/scheduler/base/scheduler_intrablock.h +++ b/impl/matmul/scheduler/base/scheduler_intrablock.h @@ -269,4 +269,4 @@ private: } // namespace Impl } // namespace AscendC -#endif \ No newline at end of file +#endif diff --git a/impl/matmul/scheduler/base/scheduler_mdl.h b/impl/matmul/scheduler/base/scheduler_mdl.h index f647f0e6..139656dc 100644 --- a/impl/matmul/scheduler/base/scheduler_mdl.h +++ b/impl/matmul/scheduler/base/scheduler_mdl.h @@ -29,7 +29,8 @@ namespace Detail { template class MatmulScheduler> + enable_if_t + && !(MatmulFeatureTrait::IsSupportL0CToUB() && ToMatmulConfig(MM_CFG).isPartialOutput)>> : public MatmulMDLSchedulerCommon { MATMUL_USE_MODULE(BiasScheduler); diff --git a/impl/matmul/scheduler/base/scheduler_mdl_base.h b/impl/matmul/scheduler/base/scheduler_mdl_base.h index 09254d23..7a1a6c8c 100644 --- a/impl/matmul/scheduler/base/scheduler_mdl_base.h +++ b/impl/matmul/scheduler/base/scheduler_mdl_base.h @@ -46,6 +46,7 @@ class MatmulMDLSchedulerBase : public MatmulSchedulerBase())::T; using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; public: using BASE_MODULE = @@ -120,7 +122,7 @@ protected: MATMUL_MODULE(NLoop)->InnerStart(); // when M inner loop is finished, clear left matrix's data in L1 buffer if (!MATMUL_MODULE(MLoop)->InnerNext()) { - if constexpr (!PhyPosIsL1(A_TYPE::pos)) { + if constexpr (!PhyPosIsL1OrUB(A_TYPE::pos)) { if (MATMUL_MODULE(KLoop)->IsAKL1FullLoad()) { MATMUL_MODULE(CopyCubeInA)->ClearLoadData(); } @@ -128,7 +130,7 @@ protected: // when M outer loop is finished, clear right matrix's data in L1 buffer, // and restart M outer and inner loop if (!MATMUL_MODULE(MLoop)->OuterNext()) { - if constexpr (!PhyPosIsL1(B_TYPE::pos)) { + if constexpr (!PhyPosIsL1OrUB(B_TYPE::pos)) { if (MATMUL_MODULE(KLoop)->IsBKL1FullLoad()) { MATMUL_MODULE(CopyCubeInB)->ClearLoadData(); } @@ -154,7 +156,7 @@ protected: MATMUL_MODULE(MLoop)->InnerStart(); // when N inner loop is finished, clear right matrix's data in L1 buffer if 
(!MATMUL_MODULE(NLoop)->InnerNext()) { - if constexpr (!PhyPosIsL1(B_TYPE::pos)) { + if constexpr (!PhyPosIsL1OrUB(B_TYPE::pos)) { if (MATMUL_MODULE(KLoop)->IsBKL1FullLoad()) { MATMUL_MODULE(CopyCubeInB)->ClearLoadData(); } @@ -162,7 +164,7 @@ protected: // when N outer loop is finished, clear left matrix's data in L1 buffer, // and restart N outer and inner loop if (!MATMUL_MODULE(NLoop)->OuterNext()) { - if constexpr (!PhyPosIsL1(A_TYPE::pos)) { + if constexpr (!PhyPosIsL1OrUB(A_TYPE::pos)) { if (MATMUL_MODULE(KLoop)->IsAKL1FullLoad()) { MATMUL_MODULE(CopyCubeInA)->ClearLoadData(); } @@ -181,7 +183,7 @@ protected: return true; } - __aicore__ inline void CopyIn(LocalTensor& a1, LocalTensor& b1) + __aicore__ inline void CopyIn(LocalTensor& a1, LocalTensor& b1) { a1 = MATMUL_MODULE(CopyCubeInA)->LoadData( MATMUL_MODULE(MLoop)->GetInnerIdx(), MATMUL_MODULE(KLoop)->GetInnerStartIdx(), @@ -189,6 +191,11 @@ protected: b1 = MATMUL_MODULE(CopyCubeInB)->LoadData( MATMUL_MODULE(KLoop)->GetInnerStartIdx(), MATMUL_MODULE(NLoop)->GetInnerIdx(), MATMUL_MODULE(KLoop)->GetTileShapeB(), MATMUL_MODULE(NLoop)->GetTileShape()); + + if constexpr (MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { + MATMUL_MODULE(MatmulCrossCoreSync)->WaitL1Ready(); + } + DoPreloadLoad(); } @@ -201,11 +208,11 @@ protected: MATMUL_MODULE(LoadToB2)->Prepare(isBTranspose, bL0Params.kAxisL1Len); } - __aicore__ inline LocalTensor SplitA(const LocalTensor& a1, + __aicore__ inline LocalTensor SplitA(const LocalTensor& a1, const SplitParams& aL0Params, const bool isATranspose) { if constexpr (DoMatmulSpecialMDL(MM_CFG) || MatmulFeatureTrait().IsSupportMNL0DB()) { - LocalTensor a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + LocalTensor a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); MATMUL_MODULE(LoadToA2)->Load(a2, a1, aL0Params.axisL1Len, aL0Params.kAxisL1Len, aL0Params.axisL0Len, MATMUL_MODULE(KLoop)->GetBaseShape(), aL0Params.axisL1Offset, aL0Params.kAxisL1Offset, isATranspose); @@ -216,21 +223,21 @@ protected: int32_t kL0Len = MATMUL_MODULE(KLoop)->GetBaseShape(); // Split if (!(MATMUL_MODULE(TBufPoolL0)->template Hit(posA))) { - LocalTensor a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + LocalTensor a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); MATMUL_MODULE(LoadToA2)->Load(a2, a1, aL0Params.axisL1Len, aL0Params.kAxisL1Len, aL0Params.axisL0Len, kL0Len, aL0Params.axisL1Offset, aL0Params.kAxisL1Offset, isATranspose); return a2; } else { - return MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + return MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); } } } - __aicore__ inline LocalTensor SplitB(const LocalTensor& b1, + __aicore__ inline LocalTensor SplitB(const LocalTensor& b1, const SplitParams& bL0Params, const bool isBTranspose) { if constexpr (DoMatmulSpecialMDL(MM_CFG) || MatmulFeatureTrait().IsSupportMNL0DB()) { - LocalTensor b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + LocalTensor b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); // Split b2 MATMUL_MODULE(LoadToB2)->Load(b2, b1, bL0Params.axisL1Len, bL0Params.kAxisL1Len, bL0Params.axisL0Len, MATMUL_MODULE(KLoop)->GetBaseShape(), bL0Params.axisL1Offset, @@ -242,30 +249,30 @@ protected: int32_t kL0Len = MATMUL_MODULE(KLoop)->GetBaseShape(); if constexpr (HasSparseIndex()) { if (!(MATMUL_MODULE(TBufPoolL0)->template Hit(posB))) { - LocalTensor b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + LocalTensor b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); MATMUL_MODULE(LoadToB2)->Load(b2, b1, bL0Params.axisL1Len, 
bL0Params.kAxisL1Len, bL0Params.axisL0Len, MATMUL_MODULE(KLoop)->GetBaseShape(), bL0Params.axisL1Offset, bL0Params.kAxisL1Offset, isBTranspose, MATMUL_MODULE(CopyCubeInB)->GetSparseIndex()); return b2; } else { - return MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + return MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); } } else { if (!(MATMUL_MODULE(TBufPoolL0)->template Hit(posB))) { - LocalTensor b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + LocalTensor b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); MATMUL_MODULE(LoadToB2)->Load(b2, b1, bL0Params.axisL1Len, bL0Params.kAxisL1Len, bL0Params.axisL0Len, MATMUL_MODULE(KLoop)->GetBaseShape(), bL0Params.axisL1Offset, bL0Params.kAxisL1Offset, isBTranspose); return b2; } else { - return MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + return MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); } } } } - __aicore__ inline void CubeCompute(const LocalTensor& cMatrix, const LocalTensor& a2, - const LocalTensor& b2, const uint16_t madM, const uint16_t madN, const uint16_t madK, + __aicore__ inline void CubeCompute(const LocalTensor& cMatrix, const LocalTensor& a2, + const LocalTensor& b2, const uint16_t madM, const uint16_t madN, const uint16_t madK, const bool isATranspose, const bool isBTranspose, const bool sL0CInit, const bool sL0CLast) { int32_t kInnerStartIdx; @@ -291,12 +298,12 @@ protected: __aicore__ inline void ResetCopyInBuffer() { // clear L1 buffers - if constexpr (!PhyPosIsL1(A_TYPE::pos)) { + if constexpr (!PhyPosIsL1OrUB(A_TYPE::pos)) { if (!MATMUL_MODULE(KLoop)->IsAKL1FullLoad()) { MATMUL_MODULE(CopyCubeInA)->Reset(); } } - if constexpr (!PhyPosIsL1(B_TYPE::pos)) { + if constexpr (!PhyPosIsL1OrUB(B_TYPE::pos)) { if (!MATMUL_MODULE(KLoop)->IsBKL1FullLoad()) { MATMUL_MODULE(CopyCubeInB)->Reset(); } @@ -325,7 +332,7 @@ protected: __aicore__ inline void ClearL1BufferCache(int32_t& curKaOuterIdx, int32_t& curKbOuterIdx) { - if constexpr (!PhyPosIsL1(A_TYPE::pos) && ToMatmulConfig(MM_CFG).doMTE2Preload != PRELOAD_M && + if constexpr (!PhyPosIsL1OrUB(A_TYPE::pos) && ToMatmulConfig(MM_CFG).doMTE2Preload != PRELOAD_M && ToMatmulConfig(MM_CFG).doMTE2Preload != PRELOAD_N) { int32_t curKaIdx = MATMUL_MODULE(KLoop)->GetNextOuterKaIdx(); // if next outerKaIdx is not equal to curKaOuterIdx, clear left matrix's data in L1 buffer @@ -334,7 +341,7 @@ protected: curKaOuterIdx = curKaIdx; } } - if constexpr (!PhyPosIsL1(B_TYPE::pos) && ToMatmulConfig(MM_CFG).doMTE2Preload != PRELOAD_M && + if constexpr (!PhyPosIsL1OrUB(B_TYPE::pos) && ToMatmulConfig(MM_CFG).doMTE2Preload != PRELOAD_M && ToMatmulConfig(MM_CFG).doMTE2Preload != PRELOAD_N) { // if next outerKbIdx is not equal to curKbOuterIdx, clear right matrix's data in L1 buffer int32_t curKbIdx = MATMUL_MODULE(KLoop)->GetNextOuterKbIdx(); @@ -349,10 +356,12 @@ protected: { SplitParams aL0Params; int32_t tilingBaseM = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); - if constexpr (PhyPosIsL1(A_TYPE::pos)) { + if constexpr (PhyPosIsL1OrUB(A_TYPE::pos)) { aL0Params.axisL1Offset = MATMUL_MODULE(MLoop)->GetInnerIdx() * tilingBaseM; - if constexpr (IsFullStaticTiling(MM_CFG)) { - aL0Params.axisL1Len = CeilAlign(MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM(), c0Size_); + // ds && 82 mdl support multi singleshape in l1 + if constexpr (IsFullStaticTiling(MM_CFG) || MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { + aL0Params.axisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() != -1 ? 
+ MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() : MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM(); aL0Params.kAxisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetOrgKa() != -1 ? MATMUL_MODULE(MatmulShapeInfo)->GetOrgKa() : MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(); } else { @@ -401,18 +410,21 @@ protected: bL0Params.axisL1Len = MATMUL_MODULE(NLoop)->GetTileBlockShape() * BLOCK_CUBE; bL0Params.axisL0Len = MATMUL_MODULE(NLoop)->GetBaseShape(); } - if constexpr (PhyPosIsL1(B_TYPE::pos)) { - bL0Params.axisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + if constexpr (PhyPosIsL1OrUB(B_TYPE::pos)) { bL0Params.axisL1Offset = MATMUL_MODULE(NLoop)->GetInnerIdx() * tilingBaseN; - if constexpr (IsFullStaticTiling(MM_CFG)) { + // ds && 82 mdl support multi singleshape in l1 + if constexpr (IsFullStaticTiling(MM_CFG) || MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { + bL0Params.axisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetOrgN() != -1 ? + MATMUL_MODULE(MatmulShapeInfo)->GetOrgN() : MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); bL0Params.kAxisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetOrgKb() != -1 ? MATMUL_MODULE(MatmulShapeInfo)->GetOrgKb() : MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(); } else { + bL0Params.axisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); bL0Params.kAxisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(); } } else { if constexpr (!MatmulFeatureTrait::IsNeedUB()) { - if constexpr (IsSameTypeV) { + if constexpr (IsSameTypeV) { int32_t stepN = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); if (tilingBaseN % c0Size_ == 0 || stepN == 1) { bL0Params.axisL1Offset = (MATMUL_MODULE(NLoop)->GetInnerIdx() - @@ -437,14 +449,14 @@ protected: { int32_t tilingBaseK = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); int32_t kInnerIdx = MATMUL_MODULE(KLoop)->GetInnerStartIdx(); - if constexpr (PhyPosIsL1(A_TYPE::pos)) { + if constexpr (PhyPosIsL1OrUB(A_TYPE::pos)) { aL0Params.kAxisL1Offset = kInnerIdx * tilingBaseK; } else { aL0Params.kAxisL1Len = MATMUL_MODULE(KLoop)->GetTileBlockShapeA() * c0Size_; int32_t tilingStepKa = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa(); aL0Params.kAxisL1Offset = (kInnerIdx - kInnerIdx / tilingStepKa * tilingStepKa) * tilingBaseK; } - if constexpr (PhyPosIsL1(B_TYPE::pos)) { + if constexpr (PhyPosIsL1OrUB(B_TYPE::pos)) { bL0Params.kAxisL1Offset = kInnerIdx * tilingBaseK; } else { bL0Params.kAxisL1Len = MATMUL_MODULE(KLoop)->GetTileBlockShapeB() * c0Size_; diff --git a/impl/matmul/scheduler/base/scheduler_mdl_common.h b/impl/matmul/scheduler/base/scheduler_mdl_common.h index fca20d7f..d1bce97d 100644 --- a/impl/matmul/scheduler/base/scheduler_mdl_common.h +++ b/impl/matmul/scheduler/base/scheduler_mdl_common.h @@ -45,16 +45,17 @@ class MatmulMDLSchedulerCommon MATMUL_USE_MODULE(Context); MATMUL_USE_MODULE(PartialOutUtils); - using TransT = typename A_TYPE::T; + using TransAT = typename A_TYPE::T; + using TransBT = typename decltype(GetTransBDataType())::T; using BiasT = typename BIAS_TYPE::T; using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; public: using BASE_MODULE = AscendC::Impl::Detail::MatmulMDLSchedulerBase; - __aicore__ inline void GetResult(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false) + __aicore__ inline void GetResult(const GlobalTensor& gm, uint8_t enAtomic = 0, bool enSequentialWrite = false) { if constexpr (MatmulFeatureTrait().IsSupportMNL0DB()) { GetResultImpl(gm, enAtomic, 
enSequentialWrite); @@ -87,6 +88,25 @@ public: } } + __aicore__ inline LocalTensor SplitBias(const int32_t dataLen) + { + if constexpr (MatmulFeatureTrait().IsSupportMNL0DB()) { + auto bias = MATMUL_MODULE(BiasScheduler)->CopyIn(MATMUL_MODULE(NLoop)->GetBaseShape(), 1, + MATMUL_MODULE(NLoop)->GetInnerIdx() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + // ORDER_N(M db): split load bias here, ORDER_M(N db): split load bias in each dbLoop + if constexpr(ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_N) { + MATMUL_MODULE(BiasScheduler)->SplitLoad(bias, dataLen); + } + return bias; + } else { + auto bias = MATMUL_MODULE(BiasScheduler)->CopyIn(MATMUL_MODULE(NLoop)->GetBaseShape(), 1, + MATMUL_MODULE(NLoop)->GetInnerIdx() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + MATMUL_MODULE(BiasScheduler)->SplitLoad(bias, dataLen); + MATMUL_MODULE(BiasScheduler)->Free(bias); + return {}; + } + } + private: __aicore__ inline void ReduceKOneIter(bool enPartialSum) { @@ -95,8 +115,8 @@ private: SplitParams bL0Params = BASE_MODULE::InitSplitBParams(); MATMUL_MODULE(KLoop)->OuterStart(); // CopyIn - LocalTensor a1; - LocalTensor b1; + LocalTensor a1; + LocalTensor b1; BASE_MODULE::CopyIn(a1, b1); bool isATranspose = MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA(); bool isBTranspose = MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB(); @@ -143,8 +163,8 @@ private: int32_t curKbOuterIdx = MATMUL_MODULE(KLoop)->GetOuterKbIdx(); do { // CopyIn - LocalTensor a1; - LocalTensor b1; + LocalTensor a1; + LocalTensor b1; BASE_MODULE::CopyIn(a1, b1); LocalTensor bias = SplitBias(bL0Params.axisL0Len); Compute(a1, b1, bias, enPartialSum, isATranspose, isBTranspose, aL0Params, bL0Params); @@ -158,26 +178,7 @@ private: MATMUL_MODULE(PartialOutUtils)->Destroy(); } - __aicore__ inline LocalTensor SplitBias(const int32_t dataLen) - { - if constexpr (MatmulFeatureTrait().IsSupportMNL0DB()) { - auto bias = MATMUL_MODULE(BiasScheduler)->CopyIn(MATMUL_MODULE(NLoop)->GetBaseShape(), 1, - MATMUL_MODULE(NLoop)->GetInnerIdx() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); - // ORDER_N(M db): split load bias here, ORDER_M(N db): split load bias in each dbLoop - if constexpr(ToMatmulConfig(MM_CFG).iterateOrder == IterateOrder::ORDER_N) { - MATMUL_MODULE(BiasScheduler)->SplitLoad(bias, dataLen); - } - return bias; - } else { - auto bias = MATMUL_MODULE(BiasScheduler)->CopyIn(MATMUL_MODULE(NLoop)->GetBaseShape(), 1, - MATMUL_MODULE(NLoop)->GetInnerIdx() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); - MATMUL_MODULE(BiasScheduler)->SplitLoad(bias, dataLen); - MATMUL_MODULE(BiasScheduler)->Free(bias); - return {}; - } - } - - __aicore__ inline void Compute(const LocalTensor& a1, const LocalTensor& b1, + __aicore__ inline void Compute(const LocalTensor& a1, const LocalTensor& b1, LocalTensor& bias, const bool enPartialSum, const bool isATranspose, const bool isBTranspose, SplitParams& aL0Params, SplitParams& bL0Params) { @@ -205,13 +206,13 @@ private: } while (MATMUL_MODULE(KLoop)->InnerNext()); } - __aicore__ inline void ComputeKDB(const LocalTensor& a1, const LocalTensor& b1, + __aicore__ inline void ComputeKDB(const LocalTensor& a1, const LocalTensor& b1, const SplitParams& aL0Params, const SplitParams& bL0Params, const bool isATranspose, const bool isBTranspose, const bool sL0CInit, const bool sL0CLast) { MATMUL_MODULE(TBufPoolL0)->Allocate(); - LocalTensor a2 = BASE_MODULE::SplitA(a1, aL0Params, isATranspose); - LocalTensor b2 = BASE_MODULE::SplitB(b1, bL0Params, 
isBTranspose); + LocalTensor a2 = BASE_MODULE::SplitA(a1, aL0Params, isATranspose); + LocalTensor b2 = BASE_MODULE::SplitB(b1, bL0Params, isBTranspose); MATMUL_MODULE(TBufPoolL0)->EnQue(); MATMUL_MODULE(TBufPoolL0)->DeQue(); BASE_MODULE::CubeCompute(MATMUL_MODULE(CubeOutBuffer)->GetTensor(), a2, b2, aL0Params.axisL0Len, @@ -220,19 +221,19 @@ private: MATMUL_MODULE(BiasScheduler)->Free(); } - __aicore__ inline void ComputeMDB(const LocalTensor& a1, const LocalTensor& b1, + __aicore__ inline void ComputeMDB(const LocalTensor& a1, const LocalTensor& b1, LocalTensor& bias, SplitParams& aL0Params, SplitParams& bL0Params, bool isATranspose, - bool isBTranspose, bool sL0CInit, bool sL0CLast) + bool isBTranspose, bool sL0CInit, bool sL0CLast) { // Split b2 - LocalTensor b2 = BASE_MODULE::SplitB(b1, bL0Params, isBTranspose); + LocalTensor b2 = BASE_MODULE::SplitB(b1, bL0Params, isBTranspose); uint32_t l0aDBLoop = MATMUL_MODULE(MLoop)->GetL0DBLoopNum(); for (uint32_t idx = 0; idx < l0aDBLoop; ++idx) { // allocate L0 buffer MATMUL_MODULE(TBufPoolL0)->Allocate(); // Split a2 aL0Params.axisL1Offset += (idx * aL0Params.axisL0Len); - LocalTensor a2 = BASE_MODULE::SplitA(a1, aL0Params, isATranspose); + LocalTensor a2 = BASE_MODULE::SplitA(a1, aL0Params, isATranspose); MATMUL_MODULE(TBufPoolL0)->EnQue(); MATMUL_MODULE(TBufPoolL0)->DeQue(); // prepare params and compute @@ -245,18 +246,18 @@ private: MATMUL_MODULE(BiasScheduler)->Free(); } - __aicore__ inline void ComputeNDB(const LocalTensor& a1, const LocalTensor& b1, LocalTensor& bias, - SplitParams& aL0Params, SplitParams& bL0Params, bool isATranspose, bool isBTranspose, bool sL0CInit, bool sL0CLast) + __aicore__ inline void ComputeNDB(const LocalTensor& a1, const LocalTensor& b1, LocalTensor& bias, + SplitParams& aL0Params, SplitParams& bL0Params, bool isATranspose, bool isBTranspose, bool sL0CInit, bool sL0CLast) { // Split a2 - LocalTensor a2 = BASE_MODULE::SplitA(a1, aL0Params, isATranspose); + LocalTensor a2 = BASE_MODULE::SplitA(a1, aL0Params, isATranspose); uint32_t l0bDBLoop = MATMUL_MODULE(NLoop)->GetL0DBLoopNum(); for (uint32_t idx = 0; idx < l0bDBLoop; ++idx) { // allocate L0 buffer MATMUL_MODULE(TBufPoolL0)->Allocate(); // Split b2 bL0Params.axisL1Offset += (idx * bL0Params.axisL0Len); - LocalTensor b2 = BASE_MODULE::SplitB(b1, bL0Params, isBTranspose); + LocalTensor b2 = BASE_MODULE::SplitB(b1, bL0Params, isBTranspose); MATMUL_MODULE(TBufPoolL0)->EnQue(); MATMUL_MODULE(TBufPoolL0)->DeQue(); // load bias @@ -276,18 +277,18 @@ private: { aL0Params.kAxisL1Offset = 0; bL0Params.kAxisL1Offset = 0; - if constexpr (PhyPosIsL1(A_TYPE::pos) && IsStaticPaddingEnable(MM_CFG)) { + if constexpr (PhyPosIsL1OrUB((A_TYPE::pos)) && IsStaticPaddingEnable(MM_CFG)) { aL0Params.kAxisL1Len = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); - } else if constexpr (PhyPosIsL1(A_TYPE::pos) && IsFullStaticTiling(MM_CFG)) { + } else if constexpr (PhyPosIsL1OrUB((A_TYPE::pos)) && (IsFullStaticTiling(MM_CFG) || MatmulFeatureTrait::IsSupportUBToL1Singleshape())) { aL0Params.kAxisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetOrgKa() != -1 ? 
MATMUL_MODULE(MatmulShapeInfo)->GetOrgKa() : MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); } else { aL0Params.kAxisL1Len = MATMUL_MODULE(KLoop)->GetTileBlockShapeA() * BASE_MODULE::c0Size_; } - if constexpr (PhyPosIsL1(B_TYPE::pos) && IsStaticPaddingEnable(MM_CFG)) { + if constexpr (PhyPosIsL1OrUB((B_TYPE::pos)) && IsStaticPaddingEnable(MM_CFG)) { bL0Params.kAxisL1Len = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); - } else if constexpr (PhyPosIsL1(B_TYPE::pos) && IsFullStaticTiling(MM_CFG)) { + } else if constexpr (PhyPosIsL1OrUB((B_TYPE::pos)) && (IsFullStaticTiling(MM_CFG) || MatmulFeatureTrait::IsSupportUBToL1Singleshape())) { bL0Params.kAxisL1Len = MATMUL_MODULE(MatmulShapeInfo)->GetOrgKb() != -1 ? MATMUL_MODULE(MatmulShapeInfo)->GetOrgKb() : MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); @@ -326,9 +327,9 @@ private: int32_t curM, int32_t curN, uint8_t enAtomic, bool enSequentialWrite) { if constexpr (C_TYPE::format != CubeFormat::ND && C_TYPE::format != CubeFormat::ND_ALIGN && - C_TYPE::format != CubeFormat::NZ) { + C_TYPE::format != CubeFormat::NZ && C_TYPE::format != CubeFormat::COLUMN_MAJOR) { ASCENDC_ASSERT((false), { - KERNEL_LOG(KERNEL_ERROR, "Data format of C matrix should be ND, ND_ALIGN or NZ."); }); + KERNEL_LOG(KERNEL_ERROR, "Data format of C matrix should be ND, ND_ALIGN, COLUMN_MAJOR or NZ."); }); } if (enAtomic == ATOMIC_ADD) { SetAtomicAdd(); diff --git a/impl/matmul/scheduler/base/scheduler_norm.h b/impl/matmul/scheduler/base/scheduler_norm.h index ed784500..21f0c794 100644 --- a/impl/matmul/scheduler/base/scheduler_norm.h +++ b/impl/matmul/scheduler/base/scheduler_norm.h @@ -33,7 +33,8 @@ template class MatmulScheduler || - isSingleLargeBMM || IsBasicBlockEnable) && !MatmulFeatureTrait().IsSupportMNL0DB()>> + isSingleLargeBMM || IsBasicBlockEnable) && !MatmulFeatureTrait().IsSupportMNL0DB() + && !isMxMatmul>> : public MatmulNormSchedulerBase { MATMUL_USE_MODULE(MLoop); @@ -50,9 +51,11 @@ class MatmulScheduler::Type; + using TransAT = typename A_TYPE::T; + using TransBT = typename decltype(GetTransBDataType())::T; + using L0cT = typename GetMmDstType::Type; public: using BASE_MODULE = @@ -94,11 +97,15 @@ private: // start K outer loop MATMUL_MODULE(KLoop)->OuterStart(); // CopyIn - LocalTensor a1 = MATMUL_MODULE(CopyCubeInA)->LoadData( + LocalTensor a1 = MATMUL_MODULE(CopyCubeInA)->LoadData( 0, 0, MATMUL_MODULE(MLoop)->GetTileShape(), MATMUL_MODULE(KLoop)->GetTileShapeA()); - LocalTensor b1 = MATMUL_MODULE(CopyCubeInB)->LoadData( + LocalTensor b1 = MATMUL_MODULE(CopyCubeInB)->LoadData( 0, 0, MATMUL_MODULE(KLoop)->GetTileShapeB(), MATMUL_MODULE(NLoop)->GetTileShape()); + if constexpr (MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { + MATMUL_MODULE(MatmulCrossCoreSync)->WaitL1Ready(); + } + BASE_MODULE::SplitBias(bL0Params.axisL0Len); // prepare for Split @@ -107,8 +114,8 @@ private: // update some params in SplitParams which is related to k loop BASE_MODULE::SplitPrepare(isATranspose, isBTranspose, aL0Params, bL0Params); MATMUL_MODULE(TBufPoolL0)->Allocate(); - LocalTensor a2 = SplitA(a1, aL0Params, isATranspose); - LocalTensor b2 = SplitB(b1, bL0Params, isBTranspose); + LocalTensor a2 = SplitA(a1, aL0Params, isATranspose); + LocalTensor b2 = SplitB(b1, bL0Params, isBTranspose); MATMUL_MODULE(TBufPoolL0)->EnQue(); MATMUL_MODULE(TBufPoolL0)->DeQue(); @@ -131,12 +138,17 @@ private: do { int32_t kOuterIdx = MATMUL_MODULE(KLoop)->GetOuterIdx(); // CopyIn - LocalTensor a1 = MATMUL_MODULE(CopyCubeInA)->LoadData( + 
LocalTensor a1 = MATMUL_MODULE(CopyCubeInA)->LoadData( MATMUL_MODULE(MLoop)->GetInnerIdx(), kOuterIdx, MATMUL_MODULE(MLoop)->GetTileShape(), MATMUL_MODULE(KLoop)->GetTileShapeA()); - LocalTensor b1 = MATMUL_MODULE(CopyCubeInB)->LoadData( + LocalTensor b1 = MATMUL_MODULE(CopyCubeInB)->LoadData( kOuterIdx, MATMUL_MODULE(NLoop)->GetInnerIdx(), MATMUL_MODULE(KLoop)->GetTileShapeB(), MATMUL_MODULE(NLoop)->GetTileShape()); + + if constexpr (MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { + MATMUL_MODULE(MatmulCrossCoreSync)->WaitL1Ready(); + } + // update some params in SplitParams which is related to k loop bool sL0CInit = false; bool sL0CLast = false; @@ -147,8 +159,8 @@ private: BASE_MODULE::SplitPrepare(isATranspose, isBTranspose, aL0Params, bL0Params); // allocate L0 buffer MATMUL_MODULE(TBufPoolL0)->Allocate(); - LocalTensor a2 = SplitA(a1, aL0Params, isATranspose); - LocalTensor b2 = SplitB(b1, bL0Params, isBTranspose); + LocalTensor a2 = SplitA(a1, aL0Params, isATranspose); + LocalTensor b2 = SplitB(b1, bL0Params, isBTranspose); MATMUL_MODULE(TBufPoolL0)->EnQue(); MATMUL_MODULE(TBufPoolL0)->DeQue(); // prepare params and compute @@ -162,41 +174,41 @@ private: private: - __aicore__ inline LocalTensor SplitA(const LocalTensor& a1, + __aicore__ inline LocalTensor SplitA(const LocalTensor& a1, const SplitParams& aL0Params, const bool isATranspose) { auto posA = MATMUL_MODULE(MLoop)->GetInnerIdx() * MATMUL_MODULE(KLoop)->GetTotalIter() + MATMUL_MODULE(KLoop)->GetInnerIdx(); - LocalTensor a2; + LocalTensor a2; // Split if (!(MATMUL_MODULE(TBufPoolL0)->template Hit(posA))) { - a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); MATMUL_MODULE(LoadToA2)->Load(a2, a1, aL0Params.axisL1Len, aL0Params.kAxisL1Len, aL0Params.axisL0Len, MATMUL_MODULE(KLoop)->GetTileShapeA(), aL0Params.axisL1Offset, aL0Params.kAxisL1Offset, isATranspose); } else { - a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + a2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); } return a2; } - __aicore__ inline LocalTensor SplitB(const LocalTensor& b1, + __aicore__ inline LocalTensor SplitB(const LocalTensor& b1, const SplitParams& bL0Params, const bool isBTranspose) { auto posB = MATMUL_MODULE(NLoop)->GetInnerIdx() * MATMUL_MODULE(KLoop)->GetTotalIter() + MATMUL_MODULE(KLoop)->GetInnerIdx(); - LocalTensor b2; + LocalTensor b2; if (!(MATMUL_MODULE(TBufPoolL0)->template Hit(posB))) { - b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); MATMUL_MODULE(LoadToB2)->Load(b2, b1, bL0Params.axisL1Len, bL0Params.kAxisL1Len, bL0Params.axisL0Len, MATMUL_MODULE(KLoop)->GetTileShapeA(), bL0Params.axisL1Offset, bL0Params.kAxisL1Offset, isBTranspose); } else { - b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); + b2 = MATMUL_MODULE(TBufPoolL0)->template GetBuffer(); } return b2; } - __aicore__ inline void CubeCompute(const LocalTensor& cMatrix, const LocalTensor& a2, - const LocalTensor& b2, const uint16_t madM, const uint16_t madN, const uint16_t madK, + __aicore__ inline void CubeCompute(const LocalTensor& cMatrix, const LocalTensor& a2, + const LocalTensor& b2, const uint16_t madM, const uint16_t madN, const uint16_t madK, const bool isATranspose, const bool isBTranspose, const bool enPartialSum, const bool sL0CInit, const bool sL0CLast) { uint8_t unitFlag = MATMUL_MODULE(MatmulUnitFlag)->GetUnitFlag(sL0CLast); -- Gitee From 4f7c52d6123157ce09043743db2cd47c437d2ff1 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on 
Date: Mon, 19 May 2025 02:54:34 +0000 Subject: [PATCH 17/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/scheduler/base/scheduler_norm_base.h | 10 +++++----- .../scheduler/base/scheduler_norm_outer_product.h | 6 +++--- impl/matmul/scheduler/base/scheduler_special_mdl.h | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/impl/matmul/scheduler/base/scheduler_norm_base.h b/impl/matmul/scheduler/base/scheduler_norm_base.h index 7a1b1290..f12635fe 100644 --- a/impl/matmul/scheduler/base/scheduler_norm_base.h +++ b/impl/matmul/scheduler/base/scheduler_norm_base.h @@ -202,7 +202,7 @@ protected: aL0Params.axisL0Len = GetFixedMadM(aL0Params.axisL0Len); aL0Params.kAxisL1Offset = 0; // if input is from L1, update related params - if constexpr (PhyPosIsL1(A_TYPE::pos)) { + if constexpr (PhyPosIsL1OrUB(A_TYPE::pos)) { if constexpr (IsBasic(MM_CFG)) { aL0Params.axisL1Offset = 0; } else { @@ -244,7 +244,7 @@ protected: } bL0Params.kAxisL1Offset = 0; // if input is from L1, update related params - if constexpr (PhyPosIsL1(B_TYPE::pos)) { + if constexpr (PhyPosIsL1OrUB(B_TYPE::pos)) { if constexpr (IsBasic(MM_CFG)) { bL0Params.axisL1Offset = 0; } else { @@ -278,7 +278,7 @@ protected: __aicore__ inline void UpdateSplitParams(SplitParams& aL0Params, SplitParams& bL0Params) { // update Split params related to K loop - if constexpr (PhyPosIsL1(A_TYPE::pos)) { + if constexpr (PhyPosIsL1OrUB(A_TYPE::pos)) { aL0Params.kAxisL1Offset = MATMUL_MODULE(KLoop)->GetOuterIdx() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); } else if constexpr (!IsStaticPaddingEnable(MM_CFG)) { @@ -288,7 +288,7 @@ protected: aL0Params.kAxisL1Len = MATMUL_MODULE(KLoop)->GetTileBlockShapeA() * c0Size_; } } - if constexpr (PhyPosIsL1(B_TYPE::pos)) { + if constexpr (PhyPosIsL1OrUB(B_TYPE::pos)) { bL0Params.kAxisL1Offset = MATMUL_MODULE(KLoop)->GetOuterIdx() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); } else if constexpr (!IsStaticPaddingEnable(MM_CFG)) { @@ -371,4 +371,4 @@ protected: } // namespace Impl } // namespace AscendC -#endif \ No newline at end of file +#endif diff --git a/impl/matmul/scheduler/base/scheduler_norm_outer_product.h b/impl/matmul/scheduler/base/scheduler_norm_outer_product.h index 55d35113..15020d95 100644 --- a/impl/matmul/scheduler/base/scheduler_norm_outer_product.h +++ b/impl/matmul/scheduler/base/scheduler_norm_outer_product.h @@ -53,7 +53,7 @@ class MatmulScheduler::Type; + using L0cT = typename GetMmDstType::Type; public: using BASE_MODULE = @@ -337,9 +337,9 @@ private: int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) { if constexpr (C_TYPE::format != CubeFormat::ND && C_TYPE::format != CubeFormat::ND_ALIGN && - C_TYPE::format != CubeFormat::NZ) { + C_TYPE::format != CubeFormat::NZ && C_TYPE::format != CubeFormat::COLUMN_MAJOR) { ASCENDC_ASSERT((false), { - KERNEL_LOG(KERNEL_ERROR, "Data format of C matrix should be ND, ND_ALIGN or NZ."); }); + KERNEL_LOG(KERNEL_ERROR, "Data format of C matrix should be ND, ND_ALIGN ,COLUMN_MAJOR or NZ."); }); } BASE_MODULE::SetAtomic(enAtomic); if (enSequentialWrite) { diff --git a/impl/matmul/scheduler/base/scheduler_special_mdl.h b/impl/matmul/scheduler/base/scheduler_special_mdl.h index 610a393c..0d0d8707 100644 --- a/impl/matmul/scheduler/base/scheduler_special_mdl.h +++ b/impl/matmul/scheduler/base/scheduler_special_mdl.h @@ -104,9 +104,9 @@ private: __aicore__ void GetResultImpl(const GlobalTensor& gm, uint8_t enAtomic, bool enSequentialWrite) { if constexpr 
(C_TYPE::format != CubeFormat::ND && C_TYPE::format != CubeFormat::ND_ALIGN && - C_TYPE::format != CubeFormat::NZ) { + C_TYPE::format != CubeFormat::NZ && C_TYPE::format != CubeFormat::COLUMN_MAJOR) { ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, - "Data format of C matrix should be ND, ND_ALIGN or NZ."); }); + "Data format of C matrix should be ND, ND_ALIGN, COLUMN_MAJOR or NZ."); }); } // remove dependency conflicts only for scene which is not db auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor(); @@ -197,4 +197,4 @@ private: } // namespace Impl } // namespace AscendC -#endif \ No newline at end of file +#endif -- Gitee From 47a94221d8070dffa5c671949b3aa3f686422f77 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:56:27 +0000 Subject: [PATCH 18/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/scheduler/batch/batch_scheduler.h | 57 +++++++++++---- .../scheduler/batch/batch_scheduler_base.h | 70 +++++++++++-------- 2 files changed, 83 insertions(+), 44 deletions(-) diff --git a/impl/matmul/scheduler/batch/batch_scheduler.h b/impl/matmul/scheduler/batch/batch_scheduler.h index 809cbb79..ce933692 100644 --- a/impl/matmul/scheduler/batch/batch_scheduler.h +++ b/impl/matmul/scheduler/batch/batch_scheduler.h @@ -29,8 +29,8 @@ namespace Detail { template class BatchScheduler::IsNeedUB() && DoMatmulNorm(MM_CFG) && - ((A_TYPE::layout != LayoutMode::NONE && ToMatmulConfig(MM_CFG).batchMode == BatchMode::BATCH_LESS_THAN_L1) || - (A_TYPE::layout == LayoutMode::NORMAL && ToMatmulConfig(MM_CFG).batchMode == BatchMode::BATCH_LARGE_THAN_L1))>> + (A_TYPE::layout != LayoutMode::NONE && (A_TYPE::layout != LayoutMode::NORMAL || + ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1))>> : public BatchSchedulerBase { MATMUL_USE_MODULE(BatchLoop); @@ -60,24 +60,24 @@ public: __aicore__ inline BatchScheduler() = default; __aicore__ inline ~BatchScheduler() = default; - __aicore__ inline void Schedule(const LocalTensor& dst, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, - const uint32_t matrixStrideA, const uint32_t matrixStrideB, const uint32_t matrixStrideC) - {} - - __aicore__ inline void Schedule(const GlobalTensor& dst, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, + template + __aicore__ inline void Schedule(const T& dst, bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA, const uint32_t matrixStrideB, const uint32_t matrixStrideC) { // loop unrelated calculation MATMUL_MODULE(BiasScheduler)->SetBias(MATMUL_MODULE(BiasScheduler)->IsBias() && !enPartialSum); auto batchOffsetInfo = PrepareOffset(); auto ctx = BASE_MODULE::PrepareContext(); + const auto batchLoop = MATMUL_MODULE(BatchLoop); + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + batchLoop->SetBatchOutCacheNum(0); + batchLoop->SetBatchOutOffsetNum(0); + } LocalTensor bias; // load bias to l1 if constexpr (!ToMatmulConfig(MM_CFG).isBiasBatch) { bias = MATMUL_MODULE(BiasScheduler)->CopyIn(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN()); } - - const auto batchLoop = MATMUL_MODULE(BatchLoop); for (batchLoop->OuterStart(); !batchLoop->OuterEnd(); batchLoop->OuterNext()) { if constexpr (ToMatmulConfig(MM_CFG).isBiasBatch) { bias = MATMUL_MODULE(BiasScheduler)->CopyIn( @@ -102,15 +102,15 @@ public: if (batchOffsetInfo.setBiasFlag && (batchLoop->GetBatchIndex() % batchOffsetInfo.divisorBias == 1)) { MATMUL_MODULE(BiasScheduler)->StopBias(bias); } - UpdateOffset(batchOffsetInfo, ctx); + 
UpdateOffset(batchOffsetInfo, ctx); while (BASE_MODULE::MoveNext()) { // iterate MATMUL_MODULE(CubeOutBuffer)->AllocTensor(); ComputeBatch(a1, b1, bias, enPartialSum, ctx); - BASE_MODULE::GetBatchResult(dst[batchLoop->GetDstOffset()], ctx, enAtomic, - enSequentialWrite); + BatchScheduler::GetBatchResultImpl(dst, ctx, enAtomic, enSequentialWrite); SetFlag(eventIDMToMte1); WaitFlag(eventIDMToMte1); } + EndIterate(); } BASE_MODULE::End(); } @@ -124,7 +124,7 @@ public: if constexpr (!ToMatmulConfig(MM_CFG).isBiasBatch) { MATMUL_MODULE(BiasScheduler)->Destroy(bias); - } + } } private: @@ -147,6 +147,9 @@ private: (batchIndex % batchOffsetInfo.modB + batchIndex / batchOffsetInfo.divisorB); ctx.offsetBias = batchOffsetInfo.alignBias * (batchIndex % batchOffsetInfo.modBias + batchIndex / batchOffsetInfo.divisorBias); + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + MATMUL_MODULE(BatchLoop)->SetBatchOutCacheNum(MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum() + 1); + } } __aicore__ inline void ComputeBatch(LocalTensor& a1, LocalTensor& b1, LocalTensor& bias, @@ -321,6 +324,34 @@ private: MATMUL_MODULE(BiasScheduler)->Free(); } } + + template + __aicore__ inline void GetBatchResultImpl(const T &dst, const BatchSchedulerContext& ctx, + uint8_t enAtomic, bool enSequentialWrite) + { + const auto batchLoop = MATMUL_MODULE(BatchLoop); + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + if (batchLoop->IsNeedNBatchOut()) { + BASE_MODULE::GetBatchResult(dst, ctx, enAtomic, enSequentialWrite); + } else { + auto co1Local = MATMUL_MODULE(CubeOutBuffer)->GetTensor(); + MATMUL_MODULE(CubeOutBuffer)->FreeTensor(co1Local); + } + } else { + BASE_MODULE::GetBatchResult(dst[batchLoop->GetDstOffset()], ctx, enAtomic, enSequentialWrite); + } + } + + __aicore__ inline void EndIterate() + { + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + const auto batchLoop = MATMUL_MODULE(BatchLoop); + if (batchLoop->IsNeedNBatchOut()) { + batchLoop->SetBatchOutOffsetNum(batchLoop->GetBatchOutOffsetNum() + batchLoop->GetBatchOutCacheNum()); + batchLoop->SetBatchOutCacheNum(0); + } + } + } }; } // namespace Detail diff --git a/impl/matmul/scheduler/batch/batch_scheduler_base.h b/impl/matmul/scheduler/batch/batch_scheduler_base.h index 02518971..db64ed2d 100644 --- a/impl/matmul/scheduler/batch/batch_scheduler_base.h +++ b/impl/matmul/scheduler/batch/batch_scheduler_base.h @@ -32,7 +32,7 @@ class BatchSchedulerBase using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; MATMUL_USE_MODULE(MLoop); MATMUL_USE_MODULE(NLoop); @@ -91,7 +91,11 @@ public: lenFactor = DOUBLE_SIZE; } - MATMUL_MODULE(CubeOutBuffer)->Init(tiling.GetBaseM() * tiling.GetBaseN(), lenFactor); + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + MATMUL_MODULE(CubeOutBuffer)->Init(tiling.GetBaseM() * tiling.GetBaseN() * nBatchOutNum_, lenFactor); + } else { + MATMUL_MODULE(CubeOutBuffer)->Init(tiling.GetBaseM() * tiling.GetBaseN(), lenFactor); + } if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::BATCH_LARGE_THAN_L1) { MATMUL_MODULE(BiasScheduler)->Init(MATMUL_MODULE(BatchLoop)->GetBatchNum()); } else if constexpr (ToMatmulConfig(MM_CFG).batchMode == BatchMode::BATCH_LESS_THAN_L1) { @@ -338,7 +342,13 @@ public: bool cmatrixSource; bool cmatrixInitVal; UpdateMmadComputeParams(sL0CInit, cmatrixSource, cmatrixInitVal); - MATMUL_MODULE(MmadCompute)->Compute(MATMUL_MODULE(CubeOutBuffer)->GetTensor(), a2, b2, + 
int32_t offsetC = 0; + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + offsetC = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * + (MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum() - 1); + } + MATMUL_MODULE(MmadCompute)->Compute(MATMUL_MODULE(CubeOutBuffer)->GetTensor()[offsetC], a2, b2, ctx.aL0Params.axisL0Len, kL0Len, ctx.bL0Params.axisL0Len, isATranspose, isBTranspose, MATMUL_MODULE(MatmulUnitFlag)->GetUnitFlag(sL0CLast), cmatrixSource, cmatrixInitVal, false); bufferPool.Free(); @@ -356,22 +366,19 @@ public: } // Copyout related - __aicore__ inline void GetBatchResult(const GlobalTensor &cGlobal, const BatchSchedulerContext& ctx, + template + __aicore__ inline void GetBatchResult(const T &dst, const BatchSchedulerContext& ctx, uint8_t enAtomic, bool enSequentialWriteIn) { // supports continuous, discontinuous and reduce transfer on the GM. (three layout types are supported) - uint32_t iBatch = ctx.isReduceG ? (MATMUL_MODULE(BatchLoop)->GetBatchIndex() / ctx.reduceGNum) : - MATMUL_MODULE(BatchLoop)->GetBatchIndex(); if (ctx.isReduceG) { SetAtomicAdd(); } - if ((C_TYPE::layout == LayoutMode::BSNGD) || (C_TYPE::layout == LayoutMode::SBNGD)) { - ASSERT(enSequentialWriteIn == false && "Layout BSNGD or SBNGD can not be SequentialWrite"); - } - // Scenario 1: Continuous copy + int32_t stride = 0; const auto tiling = MATMUL_MODULE(MatmulShapeTiling)->GetTiling(); if constexpr (C_TYPE::layout == LayoutMode::BNGS1S2 || C_TYPE::layout == LayoutMode::NORMAL) { + // Scenario 1: Continuous copy int32_t alignedSingleCoreN = CeilAlign(tiling.GetSingleCoreN(), AscendCUtils::GetC0Count(sizeof(DstT))); if constexpr (PhyPosIsGM(C_TYPE::pos)) { alignedSingleCoreN = tiling.GetSingleCoreN(); @@ -379,15 +386,21 @@ public: if constexpr (C_TYPE::format == CubeFormat::NZ) { alignedSingleCoreN = CeilAlign(tiling.GetSingleCoreN(), BLOCK_CUBE); } - uint64_t offset = iBatch * tiling.GetSingleCoreM() * alignedSingleCoreN; - CopyOut(cGlobal[offset], enAtomic, enSequentialWriteIn); - } else { + stride = tiling.GetSingleCoreM() * alignedSingleCoreN; + } else if constexpr (C_TYPE::layout == LayoutMode::BSNGD || C_TYPE::layout == LayoutMode::SBNGD) { + ASSERT(enSequentialWriteIn == false && "Layout BSNGD or SBNGD can not be SequentialWrite"); // Scenario 2: disconsecutive copy - if constexpr (!(C_TYPE::layout == LayoutMode::BSNGD || C_TYPE::layout == LayoutMode::SBNGD)) { - ASSERT(false && "Can not support other Layout"); - } - uint64_t offset = iBatch * tiling.GetSingleCoreN(); - CopyOut(cGlobal[offset], enAtomic, enSequentialWriteIn); + stride = tiling.GetSingleCoreN(); + } else { + ASSERT(false && "Can not support other Layout"); + } + + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + CopyOut(dst[MATMUL_MODULE(BatchLoop)->GetBatchOutOffsetNum() * stride], enAtomic, enSequentialWriteIn); + } else { + uint32_t iBatch = ctx.isReduceG ? 
(MATMUL_MODULE(BatchLoop)->GetBatchIndex() / ctx.reduceGNum) : + MATMUL_MODULE(BatchLoop)->GetBatchIndex(); + CopyOut(dst[iBatch * stride], enAtomic, enSequentialWriteIn); } if (ctx.isReduceG) { @@ -395,15 +408,6 @@ public: } } - __aicore__ inline void GetBatchResult(const LocalTensor &dst, const BatchSchedulerContext& ctx, - uint8_t enAtomic, bool enSequentialWrite) - { - uint64_t offset = MATMUL_MODULE(BatchLoop)->GetBatchIndex() * MATMUL_MODULE(MatmulShapeTiling)-> - GetTiling().GetSingleCoreM() * CeilAlign(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN(), - AscendCUtils::GetC0Count(sizeof(DstT))); - CopyOut(dst[offset], enAtomic, enSequentialWrite); - } - __aicore__ inline void End() { MATMUL_MODULE(BatchCopyCubeInA)->Destroy(); @@ -412,16 +416,19 @@ public: MATMUL_MODULE(CubeOutBuffer)->Destroy(); } + __aicore__ inline void SetNBatchOutNum(int32_t nBatchOutNum) + { + nBatchOutNum_ = nBatchOutNum; + } private: __aicore__ inline void InitL0Params(BatchSchedulerContext& ctx) { const auto matmulShapeInfo = MATMUL_MODULE(MatmulShapeInfo); ctx.aL0Params.axisL1Len = CeilAlign(matmulShapeInfo->GetSingleCoreM(), BLOCK_CUBE); - ctx.aL0Params.kAxisL1Len = matmulShapeInfo->IsTransposeA() ? CeilAlign( + ctx.aL0Params.kAxisL1Len = (matmulShapeInfo->IsTransposeA() && !IsSuppportB8()) ? CeilAlign( matmulShapeInfo->GetSingleCoreK(), BLOCK_CUBE) : CeilAlign(matmulShapeInfo->GetSingleCoreK(), c0Size_); - ctx.bL0Params.axisL1Len = CeilAlign(matmulShapeInfo->GetSingleCoreN(), BLOCK_CUBE); - ctx.bL0Params.kAxisL1Len = matmulShapeInfo->IsTransposeB() ? CeilAlign( + ctx.bL0Params.kAxisL1Len = (matmulShapeInfo->IsTransposeB() || IsSuppportB8()) ? CeilAlign( matmulShapeInfo->GetSingleCoreK(), c0Size_) : CeilAlign(matmulShapeInfo->GetSingleCoreK(), BLOCK_CUBE); } @@ -524,7 +531,7 @@ private: __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, const LocalTensor& co1Local, int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, - int32_t baseBlockWidth,uint8_t enAtomic, bool enSequentialWrite) + int32_t baseBlockWidth, uint8_t enAtomic, bool enSequentialWrite) { SetAtomic(enAtomic); // remove dependency conflicts only for scene which is not db GetTensorCImplCore(gm, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, @@ -578,6 +585,7 @@ public: private: constexpr static int32_t c0Size_ = AuxGetC0Size(); + int32_t nBatchOutNum_; }; } // namespace Detail } // namespace Impl -- Gitee From d61281fe00eb924cdd4cff4459c8e06caca50864 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 02:59:15 +0000 Subject: [PATCH 19/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/scheduler/bias/bias_scheduler_base.h | 4 ++-- impl/matmul/scheduler/bias/bias_scheduler_intf.h | 4 ++-- impl/matmul/scheduler/bias/bias_scheduler_v220.h | 8 +++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/impl/matmul/scheduler/bias/bias_scheduler_base.h b/impl/matmul/scheduler/bias/bias_scheduler_base.h index aaf50955..295aaee8 100644 --- a/impl/matmul/scheduler/bias/bias_scheduler_base.h +++ b/impl/matmul/scheduler/bias/bias_scheduler_base.h @@ -32,8 +32,8 @@ class BiasSchedulerBase { MATMUL_USE_MODULE(MatmulShapeTiling); using BiasT = typename BIAS_TYPE::T; - using TensorT = typename Conditional<(PhyPosIsGM(BIAS_TYPE::pos) || !MatmulFeatureTrait::IsSupportUBToL1()), - GlobalTensor, LocalTensor>::type; + using TensorT = typename Conditional<(PhyPosIsGM(BIAS_TYPE::pos) || (!MatmulFeatureTrait::IsSupportUBToL1() + && 
!MatmulFeatureTrait::IsSupportUBToL1Singleshape())), GlobalTensor, LocalTensor>::type; public: __aicore__ inline BiasSchedulerBase() = default; diff --git a/impl/matmul/scheduler/bias/bias_scheduler_intf.h b/impl/matmul/scheduler/bias/bias_scheduler_intf.h index b32bf64b..40ff3fa2 100644 --- a/impl/matmul/scheduler/bias/bias_scheduler_intf.h +++ b/impl/matmul/scheduler/bias/bias_scheduler_intf.h @@ -28,8 +28,8 @@ namespace Detail { template class BiasScheduler { using BiasT = typename BIAS_TYPE::T; - using TensorT = typename Conditional<(PhyPosIsGM(BIAS_TYPE::pos) || !MatmulFeatureTrait::IsSupportUBToL1()), - GlobalTensor, LocalTensor>::type; + using TensorT = typename Conditional<(PhyPosIsGM(BIAS_TYPE::pos) || (!MatmulFeatureTrait::IsSupportUBToL1() + && !MatmulFeatureTrait::IsSupportUBToL1Singleshape())), GlobalTensor, LocalTensor>::type; public: __aicore__ inline BiasScheduler() = default; diff --git a/impl/matmul/scheduler/bias/bias_scheduler_v220.h b/impl/matmul/scheduler/bias/bias_scheduler_v220.h index 9598640c..5042e259 100644 --- a/impl/matmul/scheduler/bias/bias_scheduler_v220.h +++ b/impl/matmul/scheduler/bias/bias_scheduler_v220.h @@ -33,7 +33,7 @@ class BiasScheduler::IsNeedUB() && ToMatmulConfig(MM_CFG).enableSetBias && (A_TYPE::layout == LayoutMode::NONE || ToMatmulConfig(MM_CFG).batchMode == BatchMode::SINGLE_LARGE_THAN_L1) && - (PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsGM(BIAS_TYPE::pos)) && + (PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsGM(BIAS_TYPE::pos) || PhyPosIsL1(BIAS_TYPE::pos)) && (DoMatmulMDL(MM_CFG) || isNormEnableScheduler || IsBmmEnableScheduler || DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable || DoMatmulIBShareNorm(MM_CFG))>> @@ -70,7 +70,9 @@ public: __aicore__ inline LocalTensor CopyIn(int32_t dataLen, int32_t dataNum = 1, int32_t srcOffset = 0) { LocalTensor biasC1; - if (BASE_MODULE::enableBias_ && MATMUL_MODULE(KLoop)->FirstOuterIter()) { + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + biasC1 = (BASE_MODULE::srcTensor_)[srcOffset]; + } else if (BASE_MODULE::enableBias_ && MATMUL_MODULE(KLoop)->FirstOuterIter()) { biasC1 = MATMUL_MODULE(C1Buffer)->AllocTensor(); MATMUL_MODULE(CopyBiasIn)->Copy(biasC1, BASE_MODULE::srcTensor_, dataLen, dataNum, srcOffset + BASE_MODULE::singleOffset_); @@ -83,7 +85,7 @@ public: __aicore__ inline void Free(LocalTensor &biasC1) { // biasC1 use end, free it - if (BASE_MODULE::enableBias_ && MATMUL_MODULE(KLoop)->FirstOuterIter()) { + if (!PhyPosIsL1(BIAS_TYPE::pos) && BASE_MODULE::enableBias_ && MATMUL_MODULE(KLoop)->FirstOuterIter()) { MATMUL_MODULE(C1Buffer)->FreeTensor(biasC1); } } -- Gitee From 3402a7198bee2ac91adba8d08cdb533a667ea05a Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:00:14 +0000 Subject: [PATCH 20/56] add Signed-off-by: jiangchengcheng-on --- .../iterator/batch_loop/batch_loop_multi.h | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h index 8b9cc583..1a8379e1 100644 --- a/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h +++ b/impl/matmul/scheduler/iterator/batch_loop/batch_loop_multi.h @@ -53,6 +53,11 @@ public: UpdateBatchNumParams(); } + __aicore__ inline void SetNBatchOutNum(int32_t nBatchOutNum) + { + nBatchOutNum_ = nBatchOutNum; + } + // Outer Loop __aicore__ inline void OuterStart() { @@ -169,10 +174,41 @@ public: return innerBatchIdx_; } + __aicore__ inline bool IsNeedNBatchOut() const + { + if (batchOutCacheNum_ == 
nBatchOutNum_) { + return true; + } else if (batchOutOffsetNum_ + batchOutCacheNum_ == totalBatchNum_) { + return true; + } + return false; + } + + __aicore__ inline int32_t GetBatchOutCacheNum() const + { + return batchOutCacheNum_; + } + + __aicore__ inline void SetBatchOutCacheNum(int32_t cacheNum) + { + batchOutCacheNum_ = cacheNum; + } + + __aicore__ inline int32_t GetBatchOutOffsetNum() const + { + return batchOutOffsetNum_; + } + + __aicore__ inline void SetBatchOutOffsetNum(int32_t offsetNum) + { + batchOutOffsetNum_ = offsetNum; + } + private: __aicore__ inline void CalcBatchNum(int32_t layoutBatchNumA, int32_t layoutBatchNumB, int32_t batchNumA, int32_t batchNumB) { + totalBatchNum_ = batchNumA > batchNumB ? batchNumA : batchNumB; if constexpr (ToMatmulConfig(MM_CFG).batchMode != BatchMode::BATCH_LARGE_THAN_L1) { ASSERT(batchNumA > 0 && batchNumB > 0 && (batchNumA % batchNumB == 0 || batchNumB % batchNumA == 0)); @@ -285,6 +321,12 @@ private: // inner loop params uint32_t innerIdx_; uint32_t innerBatchIdx_; // global view batch index within inner loop + + // nBatchOut params + int32_t totalBatchNum_; + int32_t nBatchOutNum_ = 1; + int32_t batchOutCacheNum_ = 0; + int32_t batchOutOffsetNum_ = 0; }; } // namespace Detail } // namespace Impl -- Gitee From 0df53ddc369c2741b73733c1d4bd9c4bc49d125e Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:01:50 +0000 Subject: [PATCH 21/56] add Signed-off-by: jiangchengcheng-on --- .../matmul/scheduler/iterator/k_loop/k_loop.h | 3 + .../iterator/k_loop/k_loop_intrablock.h | 2 +- .../iterator/k_loop/k_loop_partial_output.h | 77 +++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 impl/matmul/scheduler/iterator/k_loop/k_loop_partial_output.h diff --git a/impl/matmul/scheduler/iterator/k_loop/k_loop.h b/impl/matmul/scheduler/iterator/k_loop/k_loop.h index b6447b21..6b496522 100644 --- a/impl/matmul/scheduler/iterator/k_loop/k_loop.h +++ b/impl/matmul/scheduler/iterator/k_loop/k_loop.h @@ -18,5 +18,8 @@ #include "k_loop_mdl.h" #include "k_loop_norm.h" #include "k_loop_intrablock.h" +#if defined(__DAV_C310__) +#include "k_loop_mdl_partial_output.h" +#endif #endif // _K_LOOP_H_ \ No newline at end of file diff --git a/impl/matmul/scheduler/iterator/k_loop/k_loop_intrablock.h b/impl/matmul/scheduler/iterator/k_loop/k_loop_intrablock.h index 92ec051e..81184101 100644 --- a/impl/matmul/scheduler/iterator/k_loop/k_loop_intrablock.h +++ b/impl/matmul/scheduler/iterator/k_loop/k_loop_intrablock.h @@ -94,7 +94,7 @@ public: } template - __aicore__ inline bool FirstOuterIter() const + __aicore__ inline bool FirstOuterIter() const { return GetOuterIdx() == 0; } diff --git a/impl/matmul/scheduler/iterator/k_loop/k_loop_partial_output.h b/impl/matmul/scheduler/iterator/k_loop/k_loop_partial_output.h new file mode 100644 index 00000000..a0f2aeb6 --- /dev/null +++ b/impl/matmul/scheduler/iterator/k_loop/k_loop_partial_output.h @@ -0,0 +1,77 @@ +/** +* Copyright (c) 2025 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! +* \file k_loop_mdl_partial_output.h +* \brief partial output, only for aicore like 310 +*/ +#ifndef IMPL_MATMUL_SCHEDULER_ITERATOR_K_LOOP_K_LOOP_MDL_PARTIAL_OUTPUT_H +#define IMPL_MATMUL_SCHEDULER_ITERATOR_K_LOOP_K_LOOP_MDL_PARTIAL_OUTPUT_H + +#include "k_loop_intf.h" +#include "k_loop_mdl_base.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +/* + KLoop is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + KLoop is only for internal usage, does not support extension or customized specialization! +*/ +template +class KLoop::IsSupportL0CToUB() && ToMatmulConfig(MM_CFG).isPartialOutput)>> + : public KLoopMDLBase +{ +public: + using BASE_MODULE = AscendC::Impl::Detail::KLoopMDLBase; + __aicore__ inline KLoop() = default; + __aicore__ inline ~KLoop() = default; + + __aicore__ inline void InnerStart() + { + BASE_MODULE::outerIdx_ = 0; + BASE_MODULE::UpdateOuterParams(); + + BASE_MODULE::innerIdx_ = 0; + BASE_MODULE::UpdateInnerParams(AscendC::Std::integral_constant{}); + } + + __aicore__ inline bool InnerNext() + { + BASE_MODULE::innerIdx_++; + if (InnerEnd()) { + return false; + } else { + // compute outer from inner + BASE_MODULE::outerIdx_ = BASE_MODULE::innerIdx_ / BASE_MODULE::minStepK_; + BASE_MODULE::UpdateOuterParams(); + + BASE_MODULE::UpdateInnerParams(AscendC::Std::integral_constant{}); + return true; + } + } + + __aicore__ inline bool InnerEnd() + { + return BASE_MODULE::innerIdx_ >= BASE_MODULE::kIter_; + } + + __aicore__ inline int32_t GetStepInnerIdx() const + { + // loop index within each step + return BASE_MODULE::innerIdx_ % BASE_MODULE::minStepK_; + } +}; + +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // _K_LOOP_MDL_H_ -- Gitee From 0501ee1c0901d5df6792f0921570c225eb773fe2 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:09:20 +0000 Subject: [PATCH 22/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/scheduler/iterator/k_loop/k_loop_mdl.h | 4 +++- impl/matmul/scheduler/iterator/k_loop/k_loop_mdl_reorder.h | 3 ++- impl/matmul/scheduler/iterator/m_loop/m_loop.h | 2 +- impl/matmul/scheduler/iterator/m_loop/m_loop_norm.h | 3 +-- .../scheduler/iterator/m_loop/m_loop_norm_outer_product.h | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl.h b/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl.h index d7c69c32..bfcd8628 100644 --- a/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl.h +++ b/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl.h @@ -27,7 +27,9 @@ namespace Detail { */ template class KLoop) || DoMatmulSpecialMDL(MM_CFG)>> + enable_if_t<(DoMatmulMDL(MM_CFG) && !IsKdimReorderLoad && !HasScalePosition::value + && !(MatmulFeatureTrait::IsSupportL0CToUB() && ToMatmulConfig(MM_CFG).isPartialOutput)) + || DoMatmulSpecialMDL(MM_CFG)>> : public KLoopMDLBase { MATMUL_USE_MODULE(MatmulShapeTiling); diff --git a/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl_reorder.h b/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl_reorder.h index 46b1877b..de770752 100644 --- a/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl_reorder.h +++ b/impl/matmul/scheduler/iterator/k_loop/k_loop_mdl_reorder.h @@ -26,7 +26,8 @@ namespace Detail { KLoop is only for internal usage, does not support extension or customized specialization! 
*/ template -class KLoop::IsNeedUB() && DoMatmulMDL(MM_CFG) && IsKdimReorderLoad>> +class KLoop::IsNeedUB() && DoMatmulMDL(MM_CFG) && IsKdimReorderLoad + && !HasScalePosition::value>> : public KLoopMDLBase { MATMUL_USE_MODULE(MatmulShapeTiling); diff --git a/impl/matmul/scheduler/iterator/m_loop/m_loop.h b/impl/matmul/scheduler/iterator/m_loop/m_loop.h index a5cb261b..b457ca45 100644 --- a/impl/matmul/scheduler/iterator/m_loop/m_loop.h +++ b/impl/matmul/scheduler/iterator/m_loop/m_loop.h @@ -19,9 +19,9 @@ #include "m_loop_norm.h" #include "m_loop_batch.h" #include "m_loop_mdl_outer_product.h" +#include "m_loop_batch_db.h" #include "m_loop_basic.h" #include "m_loop_intrablock.h" -#include "m_loop_batch_db.h" #include "m_loop_norm_outer_product.h" #endif // IMPL_MATMUL_SCHEDULER_ITERATOR_M_LOOP_M_LOOP_H diff --git a/impl/matmul/scheduler/iterator/m_loop/m_loop_norm.h b/impl/matmul/scheduler/iterator/m_loop/m_loop_norm.h index d86fcb6f..152018cf 100644 --- a/impl/matmul/scheduler/iterator/m_loop/m_loop_norm.h +++ b/impl/matmul/scheduler/iterator/m_loop/m_loop_norm.h @@ -30,8 +30,7 @@ namespace Detail { */ template class MLoop || DoMatmulIBShareNorm(MM_CFG)) - && !MatmulFeatureTrait().IsSupportMNL0DB()>> : - public MLoopNormBase + && !MatmulFeatureTrait().IsSupportMNL0DB()>> : public MLoopNormBase { MATMUL_USE_MODULE(MatmulShapeTiling); public: diff --git a/impl/matmul/scheduler/iterator/m_loop/m_loop_norm_outer_product.h b/impl/matmul/scheduler/iterator/m_loop/m_loop_norm_outer_product.h index 9564386e..bc6cbe1b 100644 --- a/impl/matmul/scheduler/iterator/m_loop/m_loop_norm_outer_product.h +++ b/impl/matmul/scheduler/iterator/m_loop/m_loop_norm_outer_product.h @@ -28,7 +28,7 @@ namespace Detail { We retain the freedom to make incompatible changes, but do not guarantee the stability. MLoop is only for internal usage, does not support extension or customized specialization! */ -template +template class MLoop && MatmulFeatureTrait().IsSupportMNL0DB()>> : public MLoopNormBase { -- Gitee From 7e8083d04352b98e55c2c841bdd9fa87ba41e010 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:12:15 +0000 Subject: [PATCH 23/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/scheduler/iterator/n_loop/n_loop.h | 2 +- impl/matmul/scheduler/iterator/n_loop/n_loop_mdl_base.h | 2 +- impl/matmul/scheduler/iterator/n_loop/n_loop_norm.h | 3 +-- .../scheduler/iterator/n_loop/n_loop_norm_outer_product.h | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/impl/matmul/scheduler/iterator/n_loop/n_loop.h b/impl/matmul/scheduler/iterator/n_loop/n_loop.h index 5474f225..78f1b6dc 100644 --- a/impl/matmul/scheduler/iterator/n_loop/n_loop.h +++ b/impl/matmul/scheduler/iterator/n_loop/n_loop.h @@ -19,9 +19,9 @@ #include "n_loop_norm.h" #include "n_loop_batch.h" #include "n_loop_mdl_outer_product.h" +#include "n_loop_batch_db.h" #include "n_loop_basic.h" #include "n_loop_intrablock.h" -#include "n_loop_batch_db.h" #include "n_loop_norm_outer_product.h" #endif // IMPL_MATMUL_SCHEDULER_ITERATOR_N_LOOP_N_LOOP_H diff --git a/impl/matmul/scheduler/iterator/n_loop/n_loop_mdl_base.h b/impl/matmul/scheduler/iterator/n_loop/n_loop_mdl_base.h index d6061218..5813683b 100644 --- a/impl/matmul/scheduler/iterator/n_loop/n_loop_mdl_base.h +++ b/impl/matmul/scheduler/iterator/n_loop/n_loop_mdl_base.h @@ -27,7 +27,7 @@ namespace Detail { We retain the freedom to make incompatible changes, but do not guarantee the stability. 
NLoopMDLBase is only for internal usage, does not support extension or customized specialization! */ -template +template class NLoopMDLBase { MATMUL_USE_MODULE(MatmulShapeTiling); diff --git a/impl/matmul/scheduler/iterator/n_loop/n_loop_norm.h b/impl/matmul/scheduler/iterator/n_loop/n_loop_norm.h index 225f38fb..a78f8f84 100644 --- a/impl/matmul/scheduler/iterator/n_loop/n_loop_norm.h +++ b/impl/matmul/scheduler/iterator/n_loop/n_loop_norm.h @@ -30,8 +30,7 @@ namespace Detail { */ template class NLoop || DoMatmulIBShareNorm(MM_CFG)) - && !MatmulFeatureTrait().IsSupportMNL0DB()>> : - public NLoopNormBase + && !MatmulFeatureTrait().IsSupportMNL0DB()>> : public NLoopNormBase { MATMUL_USE_MODULE(MatmulShapeTiling); public: diff --git a/impl/matmul/scheduler/iterator/n_loop/n_loop_norm_outer_product.h b/impl/matmul/scheduler/iterator/n_loop/n_loop_norm_outer_product.h index 0f8cb7bc..f0e26294 100644 --- a/impl/matmul/scheduler/iterator/n_loop/n_loop_norm_outer_product.h +++ b/impl/matmul/scheduler/iterator/n_loop/n_loop_norm_outer_product.h @@ -28,7 +28,7 @@ namespace Detail { We retain the freedom to make incompatible changes, but do not guarantee the stability. NLoop is only for internal usage, does not support extension or customized specialization! */ -template +template class NLoop && MatmulFeatureTrait().IsSupportMNL0DB()>> : public NLoopNormBase { -- Gitee From 260183f7822f41518dc1f8a4ff9a7fd0fe5d9f0d Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:17:24 +0000 Subject: [PATCH 24/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/scheduler/scheduler.h | 3 +++ impl/matmul/stage/compute/mmad_compute.h | 4 +++- .../copy_cube_in/antiquant/antiquant_processor_using_ub.h | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/impl/matmul/scheduler/scheduler.h b/impl/matmul/scheduler/scheduler.h index e7254d77..5a63c0af 100644 --- a/impl/matmul/scheduler/scheduler.h +++ b/impl/matmul/scheduler/scheduler.h @@ -27,5 +27,8 @@ #include "base/scheduler_norm_outer_product.h" #include "base/scheduler_special_mdl.h" #include "base/scheduler_n_buffer.h" +#if defined(__DAV_C310__) +#include "base/scheduler_mdl_partial_output.h" +#endif #endif diff --git a/impl/matmul/stage/compute/mmad_compute.h b/impl/matmul/stage/compute/mmad_compute.h index ebbb4fc4..2286392b 100644 --- a/impl/matmul/stage/compute/mmad_compute.h +++ b/impl/matmul/stage/compute/mmad_compute.h @@ -38,7 +38,8 @@ public: if constexpr (MatmulFeatureTrait::IsUnitFlagEnabled()) { mmadParams.unitFlag = unitFlag; } - if constexpr (IsSameType::value && IsSameType::value) { + if constexpr (IsSameType::value && IsSameType::value + && !MatmulFeatureTrait::IsSupportLoad2dV2()) { if (isATrans) { mmadParams.kDirectionAlign = 1; } @@ -51,6 +52,7 @@ public: mmadParams.isBias = isBias; } if constexpr (hasSpIdx) { + // C310 Not support mmadwithsparse MmadWithSparse(cMatrix, l0A, l0B, mmadParams); } else { Mmad(cMatrix, l0A, l0B, mmadParams); diff --git a/impl/matmul/stage/copy_cube_in/antiquant/antiquant_processor_using_ub.h b/impl/matmul/stage/copy_cube_in/antiquant/antiquant_processor_using_ub.h index b70be18c..23d005dc 100644 --- a/impl/matmul/stage/copy_cube_in/antiquant/antiquant_processor_using_ub.h +++ b/impl/matmul/stage/copy_cube_in/antiquant/antiquant_processor_using_ub.h @@ -161,4 +161,4 @@ private: } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // IMPL_MATMUL_STAGE_COPY_CUBE_OUT_QUANT_QUANT_PROCESSOR_USING_UB_H +#endif // 
IMPL_MATMUL_STAGE_COPY_CUBE_OUT_QUANT_QUANT_PROCESSOR_USING_UB_H \ No newline at end of file -- Gitee From c39397a6c80241377fb6620d40867ac9ef256fb9 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:18:23 +0000 Subject: [PATCH 25/56] add Signed-off-by: jiangchengcheng-on --- .../copy_cube_in/base/copy_cube_in_atop.h | 129 ++++++++++++++++++ .../copy_cube_in/base/copy_cube_in_base.h | 2 +- .../copy_cube_in/base/copy_cube_in_from_l1.h | 2 +- 3 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 impl/matmul/stage/copy_cube_in/base/copy_cube_in_atop.h diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_atop.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_atop.h new file mode 100644 index 00000000..54e18324 --- /dev/null +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_atop.h @@ -0,0 +1,129 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! +* \file copy_cube_in_atop.h +* \brief +*/ + +#ifndef IMPL_MATMUL_STAGE_COPY_CUBE_IN_BASE_COPY_CUBE_IN_ATOP_H +#define IMPL_MATMUL_STAGE_COPY_CUBE_IN_BASE_COPY_CUBE_IN_ATOP_H + +#include "../copy_tile_to_cube/copy_tile_to_cube.h" +#include "copy_cube_in_intf.h" +#include "copy_cube_in_base.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +/* + CopyCubeIn is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeIn is only for internal usage, does not support extension or customized specialization! 
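+    A sketch of what this specialization does, as read from the code below (not a stability contract):
+    LoadData/AsyncLoadData first query CubeInBuffer with the iteration index and the buffer position
+    obtained from CopyCubeInParams, and only on a miss copy a fresh tile into L1 via CopyTensor;
+    AsyncLoadData leaves the matching DeQue to AwaitLoadData.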
+*/ +template class CopyIn> +class CopyCubeIn + : public CopyCubeInBase +{ + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeTiling); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + using BASE_MODULE = AscendC::Impl::Detail::CopyCubeInBase; + __aicore__ inline CopyCubeIn() = default; + __aicore__ inline ~CopyCubeIn() = default; + + template + __aicore__ inline LocalTensor LoadData(int32_t curRow, int32_t curCol, int32_t tileHeight, + int32_t tileWidth, const ScheduleContext &context = 0) + { + LocalTensor l1; + int32_t posL1 = BASE_MODULE::GetIterIndex(curRow, curCol); + int32_t bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); + if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) { + l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos); + } else { + l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos); + CopyTensor(l1, curRow, curCol, tileHeight, tileWidth); + MATMUL_MODULE(CubeInBuffer)->EnQue(l1); + MATMUL_MODULE(CubeInBuffer)->DeQue(); + } + return l1; + } + + template + __aicore__ inline LocalTensor AsyncLoadData(int32_t curRow, int32_t curCol, int32_t tileHeight, + int32_t tileWidth, const ScheduleContext &context = 0) + { + if constexpr (PhyPosIsL1(INPUT_TYPE::pos) || INPUT_TYPE::layout != LayoutMode::NONE) { + ASCENDC_ASSERT((false), { + KERNEL_LOG(KERNEL_ERROR, "Matching error. MDL AsyncLoadData doesn't support BMM && Src L1"); + }); + } + + LocalTensor localTensor; + int32_t posL1 = BASE_MODULE::GetIterIndex(curRow, curCol); + int32_t bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); + if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) { + return MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos); + } else { + localTensor = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos); + CopyTensor(localTensor, curRow, curCol, tileHeight, tileWidth); + MATMUL_MODULE(CubeInBuffer)->EnQue(localTensor); + return localTensor; + } + } + + __aicore__ inline void ClearLoadData(const LocalTensor &tensor = NULL_TENSOR, int32_t curRow = 0, + int32_t curCol = 0) + { + auto bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); + MATMUL_MODULE(CubeInBuffer)->FreeTensor(bufferPos); + } + + __aicore__ inline void AwaitLoadData() + { + MATMUL_MODULE(CubeInBuffer)->DeQue(); + } + +private: + __aicore__ inline void CopyTensor(const LocalTensor &l1, int32_t curRow, int32_t curCol, int32_t tileHeight, + int32_t tileWidth) + { + CopyIn copyIn; + if constexpr (!INPUT_TYPE::isTrans) { + copyIn(l1, MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor(), + curRow, curCol, tileHeight, tileWidth, + MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth(), + MATMUL_MODULE(CopyCubeInParams)->template IsKRowDirec()); + } else { + copyIn(l1, MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor(), + curCol, curRow, tileWidth, tileHeight, + MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth(), + MATMUL_MODULE(CopyCubeInParams)->template IsKRowDirec()); + } + } 
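+
+    // Note on CopyTensor above: for transposed inputs the (row, col) and (height, width) arguments are
+    // swapped before invoking CopyIn, so the copy functor always receives the tile in its
+    // source-memory orientation.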
+}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_base.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_base.h index 333e879c..6115778d 100644 --- a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_base.h +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_base.h @@ -30,8 +30,8 @@ class CopyCubeInBase MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(MatmulShapeInfo); - using SrcT = typename INPUT_TYPE::T; using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename Conditional::value, float8_e8m0_t, typename INPUT_TYPE::T>::type; public: __aicore__ inline void Init() diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_from_l1.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_from_l1.h index 00035af0..8379dee9 100644 --- a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_from_l1.h +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_from_l1.h @@ -34,7 +34,7 @@ GetCopyCubeInType() == CopyCubeInType::FROM_L1>> { MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; + using SrcT = typename Conditional::value, float8_e8m0_t, typename INPUT_TYPE::T>::type; public: __aicore__ inline CopyCubeIn() = default; -- Gitee From 4bd60f18a18f9cdc8ad0901d86e3b4d097e27337 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:19:57 +0000 Subject: [PATCH 26/56] add Signed-off-by: jiangchengcheng-on --- .../copy_cube_in/base/copy_cube_in_intf.h | 19 +- .../copy_cube_in/base/copy_cube_in_norm.h | 5 - .../copy_cube_in/base/copy_cube_in_params.h | 417 +++++++++++++++++- .../base/copy_cube_in_ubtol1_singleshape.h | 100 +++++ .../copy_cube_in/base/copy_cube_in_utils.h | 15 +- 5 files changed, 522 insertions(+), 34 deletions(-) create mode 100644 impl/matmul/stage/copy_cube_in/base/copy_cube_in_ubtol1_singleshape.h diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_intf.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_intf.h index b14c9112..51668720 100644 --- a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_intf.h +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_intf.h @@ -1,12 +1,12 @@ /** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ /*! 
* \file copy_cube_in_intf.h * \brief @@ -24,7 +24,8 @@ namespace Detail { We retain the freedom to make incompatible changes, but do not guarantee the stability. CopyCubeIn is only for internal usage, does not support extension or customized specialization! */ -template +template class...> class CopyCubeIn { using TransT = typename INPUT_TYPE::TRANS_T; diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_norm.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_norm.h index 93799ce6..240a336b 100644 --- a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_norm.h +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_norm.h @@ -124,11 +124,6 @@ public: __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, int32_t curRow = 0, int32_t curCol = 0) { -#if __CCE_AICORE__ == 310 - if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { - return; - } -#endif auto posL1 = GetIterIndex(curRow, curCol); MATMUL_MODULE(CubeInBuffer)->FreeTensor(posL1, tensor); } diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_params.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_params.h index 8eb93d2b..5485f013 100644 --- a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_params.h +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_params.h @@ -21,6 +21,13 @@ namespace AscendC { namespace Impl { namespace Detail { + +template +__aicore__ inline constexpr static bool IsNeedC0Align() +{ + return IsSuppportB8() || IsSuppportB4(); +} + template class CopyCubeInParams { @@ -120,15 +127,11 @@ public: __aicore__ inline int32_t GetBufferSize() { -#if __CCE_AICORE__ == 310 - if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos) && MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { return GetOrgSize(); } else { return GetBaseSize(); } -#else - return GetBaseSize(); -#endif } __aicore__ inline int GetDepth() const @@ -249,7 +252,7 @@ private: { if constexpr (IsSameTypeV) { return Align(GetBaseHeight(), BLOCK_CUBE); - } else if constexpr (IsTypeOneOfV && INPUT_TYPE::isTrans) { + } else if constexpr (IsNeedC0Align() && INPUT_TYPE::isTrans) { return Align(GetBaseHeight(), c0Size_); } else { return GetBaseHeight(); @@ -258,9 +261,9 @@ private: __aicore__ inline int32_t GetBaseWidthAlign() const { - if constexpr (INPUT_TYPE::isTrans && IsSameTypeV) { + if constexpr (IsSameTypeV && INPUT_TYPE::isTrans) { return Align(GetBaseWidth(), BLOCK_CUBE); - } else if constexpr (IsTypeOneOfV) { + } else if constexpr (IsTypeOneOfV || IsNeedC0Align()) { return Align(GetBaseWidth(), c0Size_); } else { return GetBaseWidth(); @@ -279,7 +282,7 @@ private: __aicore__ inline int32_t GetBaseSize() { - if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !IsTypeOneOfV) { + if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !(IsTypeOneOfV || IsNeedC0Align())) { return GetBaseWidth(); } else { return GetBaseHeightAlign() * GetBaseWidthAlign(); @@ -387,15 +390,11 @@ public: __aicore__ inline int32_t GetBufferSize() { -#if __CCE_AICORE__ != 310 - return GetBaseSize(); -#else - if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos) && MatmulFeatureTrait::IsSupportUBToL1Singleshape()) { return GetOrgSize(); } else { return GetBaseSize(); } -#endif } __aicore__ inline int GetDepth() const @@ -514,9 +513,9 @@ private: __aicore__ inline int32_t GetBaseHeightAlign() const { - if constexpr (IsSameTypeV) { + if constexpr (IsSameTypeV && !INPUT_TYPE::isTrans) { return Align(GetBaseHeight(), BLOCK_CUBE); - } else if constexpr (IsTypeOneOfV) { + } 
else if constexpr (IsNeedC0Align()) { return Align(GetBaseHeight(), c0Size_); } else { return GetBaseHeight(); @@ -525,13 +524,197 @@ private: __aicore__ inline int32_t GetBaseWidthAlign() const { - if constexpr (IsSameTypeV || (IsTypeOneOfV && !INPUT_TYPE::isTrans)) { + if constexpr (IsSameTypeV || (IsNeedC0Align() && !INPUT_TYPE::isTrans)) { return Align(GetBaseWidth(), c0Size_); } else { return GetBaseWidth(); } } + __aicore__ inline int32_t GetBaseSize() + { + if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !(IsTypeOneOfV || IsNeedC0Align())) { + return GetBaseWidth(); + } else { + return GetBaseHeightAlign() * GetBaseWidthAlign(); + } + } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); +}; + +template +class CopyCubeInParams> +{ + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MLoop); + MATMUL_USE_MODULE(KLoop); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::TRANS_T; + +public: + // only support scaleA isTrans=false, [baseM, baseK/32] + __aicore__ inline bool IsTranspose() + { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeScaleA(); + } + + template + __aicore__ inline int32_t GetStepCol() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM(); + } + } + + __aicore__ inline int32_t GetBufferPos() + { + if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + return MATMUL_MODULE(KLoop)->IsScaleAKL1FullLoad() ? MATMUL_MODULE(MLoop)->GetOuterIdx() : + MATMUL_MODULE(KLoop)->GetOuterScaleKaIdx(); + } else { + return 0; + } + } + + __aicore__ inline int32_t GetBufferSize() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + return GetOrgSize(); + } else { + return GetBaseSize(); + } + } + + __aicore__ inline int GetDepth() const + { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDepthA1(); + } + + __aicore__ inline int GetScaleFactor() const + { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetScaleFactorA(); + } + + template + __aicore__ inline int32_t GetOrgHeight() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleHeight(), BLOCK_CUBE); + } else { + return GetSingleHeight(); + } + } else { + if constexpr (IS_TRANS) { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKa(), MX_K_FACTOR); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(); + } + } + } + + template + __aicore__ inline int32_t GetOrgWidth() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleWidth(), c0Size_); + } else { + return GetSingleWidth(); + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(); + } else { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKa(), MX_K_FACTOR); + } + } + } + + template + __aicore__ inline int32_t GetSingleHeight() const + { + // Constantized scenario + // You can set IS_BASIC to false, if you don't need to use constantized parameters + if constexpr (IS_TRANS) { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreK(), + MX_K_FACTOR); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreM(); + } + } + + template + __aicore__ inline int32_t GetSingleWidth() const + { + // Constantized scenario + // You can set IS_BASIC to false, if you don't need to use constantized 
parameters + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreM(); + } else { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreK(), + MX_K_FACTOR); + } + } + + template + __aicore__ inline int32_t GetBaseHeight() const + { + if constexpr (IS_TRANS) { + return Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(), MX_K_FACTOR); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } + } + + template + __aicore__ inline int32_t GetBaseWidth() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } else { + return Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(), MX_K_FACTOR); + } + } + + template + __aicore__ inline bool IsKRowDirec() const + { + return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeScaleA(); + } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); + + __aicore__ inline int32_t GetBaseHeightAlign() const + { + if constexpr (IsNeedC0Align()) { + return Align(GetBaseHeight(), c0Size_); + } else { + return GetBaseHeight(); + } + } + + __aicore__ inline int32_t GetBaseWidthAlign() const + { + return GetBaseWidth(); + } + + __aicore__ inline int32_t GetOrgHeightAlign() + { + return Align(GetOrgHeight(), BLOCK_CUBE); + } + + __aicore__ inline int32_t GetOrgWidthAlign() + { + return Align(GetOrgWidth(), c0Size_); + } + __aicore__ inline int32_t GetBaseSize() { if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !IsTypeOneOfV) { @@ -541,6 +724,206 @@ private: } } + __aicore__ inline int32_t GetOrgSize() + { + return GetOrgHeightAlign() * GetOrgWidthAlign(); + } +}; + +template +class CopyCubeInParams> +{ + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(NLoop); + MATMUL_USE_MODULE(KLoop); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::TRANS_T; + +public: + // only support scaleB isTrans=true, [baseN, baseK/32] + __aicore__ inline bool IsTranspose() + { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeScaleB(); + } + + template + __aicore__ inline int32_t GetStepCol() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); + } + } + + __aicore__ inline int32_t GetBufferPos() + { + if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + return MATMUL_MODULE(KLoop)->IsScaleBKL1FullLoad() ? 
MATMUL_MODULE(NLoop)->GetOuterIdx() : + MATMUL_MODULE(KLoop)->GetOuterScaleKbIdx(); + } else { + return 0; + } + } + + __aicore__ inline int32_t GetBufferSize() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + return GetOrgSize(); + } else { + return GetBaseSize(); + } + } + + __aicore__ inline int GetDepth() const + { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDepthB1(); + } + + __aicore__ inline int GetScaleFactor() const + { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetScaleFactorB(); + } + + template + __aicore__ inline int32_t GetOrgHeight() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleHeight(), BLOCK_CUBE); + } else { + return GetSingleHeight(); + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(); + } else { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKb(), MX_K_FACTOR); + } + } + } + + template + __aicore__ inline int32_t GetOrgWidth() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleWidth(), c0Size_); + } else { + return GetSingleWidth(); + } + } else { + if constexpr (IS_TRANS) { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKb(), MX_K_FACTOR); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(); + } + } + } + + template + __aicore__ inline int32_t GetSingleHeight() const + { + // Constantized scenario + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreN(); + } else { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreK(), + MX_K_FACTOR); + } + } + + template + __aicore__ inline int32_t GetBaseHeight() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } else { + return Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(), MX_K_FACTOR); + } + } + + template + __aicore__ inline int32_t GetSingleWidth() const + { + if constexpr (IS_TRANS) { + return Ceil(MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreK(), + MX_K_FACTOR); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreN(); + } + } + + template + __aicore__ inline int32_t GetBaseWidth() const + { + if constexpr (IS_TRANS) { + return Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(), MX_K_FACTOR); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } + } + + template + __aicore__ inline int32_t GetTotalRow() + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(NLoop)->GetTotalIter(); + } else { + return MATMUL_MODULE(KLoop)->GetTotalIter(); + } + } + + template + __aicore__ inline int32_t GetTotalCol() + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(KLoop)->GetTotalIter(); + } else { + return MATMUL_MODULE(NLoop)->GetTotalIter(); + } + } + + template + __aicore__ inline bool IsKRowDirec() const + { + return !MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeScaleB(); + } + +private: + __aicore__ inline int32_t GetOrgSize() + { + return Align(GetOrgHeight(), BLOCK_CUBE) * Align(GetOrgWidth(), c0Size_); + } + + __aicore__ inline int32_t GetBaseHeightAlign() const + { + if constexpr (IsNeedC0Align()) { + return Align(GetBaseHeight(), c0Size_); + } else { + return GetBaseHeight(); + } + } + + __aicore__ inline int32_t GetBaseWidthAlign() const + { + if constexpr (IsNeedC0Align() && !INPUT_TYPE::isTrans) { + return Align(GetBaseWidth(), c0Size_); + } else { + return 
GetBaseWidth(); + } + } + + __aicore__ inline int32_t GetBaseSize() + { + if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !(IsTypeOneOfV || IsNeedC0Align())) { + return GetBaseWidth(); + } else { + return GetBaseHeightAlign() * GetBaseWidthAlign(); + } + } + private: constexpr static int32_t c0Size_ = AuxGetC0Size(); }; diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_ubtol1_singleshape.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_ubtol1_singleshape.h new file mode 100644 index 00000000..341f6422 --- /dev/null +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_ubtol1_singleshape.h @@ -0,0 +1,100 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file copy_cube_in_ubtol1_singleshape.h + * \brief + */ + + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_UBTOL1_SINGLESHAPE_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_UBTOL1_SINGLESHAPE_H + +#include "../copy_tile_to_cube/copy_tile_to_cube.h" +#include "copy_cube_in_intf.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +/* + CopyCubeIn is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeIn is only for internal usage, does not support extension or customized specialization! 
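+    In this UB-to-L1 single-shape path the whole single-core tile is expected to be resident in L1 already:
+    LoadData merely rebinds the CubeInBuffer head address to an A1/B1 LocalTensor instead of issuing
+    per-tile copies, and AllocTensor/ClearLoadData/Destroy are intentionally empty.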
+*/ +template +class CopyCubeIn() == CopyCubeInType::UBTOL1_SINGLESHAPE>> +{ + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeInfo); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename Conditional::value, float8_e8m0_t, typename INPUT_TYPE::T>::type; + +public: + __aicore__ inline CopyCubeIn() = default; + __aicore__ inline ~CopyCubeIn() = default; + + __aicore__ inline void Init() { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(CopyCubeInParams)->GetBufferSize(), MATMUL_MODULE(CopyCubeInParams)->GetDepth()); + } + + __aicore__ inline void Reset() {} + + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose) + { + MATMUL_MODULE(MatmulTensorInfo)->SetLocalTensor(localMatrix, isTranspose); + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose) + { + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose); + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = {}) + { + LocalTensor l1; + + TBuffAddr tbuffTmp; + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + tbuffTmp.logicPos = (uint8_t)(TPosition::A1); + } else { + tbuffTmp.logicPos = (uint8_t)(TPosition::B1); + } + tbuffTmp.bufferAddr = MATMUL_MODULE(CubeInBuffer)->GetBufferHeadAddr(); + +#ifdef ASCENDC_CPU_DEBUG + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + tbuffTmp.dataLen = MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK() * sizeof(TransT); + } else { + tbuffTmp.dataLen = MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK() * MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN() * sizeof(TransT); + } + tbuffTmp.absAddr = GetTPipePtr()->GetBaseAddr(static_cast(TPosition::A1)) + tbuffTmp.bufferAddr; +#endif + + l1.SetAddr(tbuffTmp); + return l1; + } + + __aicore__ inline void AllocTensor(int32_t iterIndex = 0) {} + + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) {} + + __aicore__ inline void Destroy() {} +}; +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_UBTOL1_SINGLESHAPE_H diff --git a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_utils.h b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_utils.h index 1b89a86c..ea06801d 100644 --- a/impl/matmul/stage/copy_cube_in/base/copy_cube_in_utils.h +++ b/impl/matmul/stage/copy_cube_in/base/copy_cube_in_utils.h @@ -26,6 +26,9 @@ enum class CopyCubeInType : uint8_t { BMM, FROM_L1, SPARSE_MDL, + UBTOL1_SINGLESHAPE, + MX_NORM, + MX_MDL, }; template @@ -43,27 +46,33 @@ __aicore__ inline constexpr bool IsCopyFromUB() template __aicore__ inline constexpr bool IsBMMFromL1() { - return PhyPosIsL1(INPUT_TYPE::pos) && (INPUT_TYPE::layout == LayoutMode::NORMAL) && - (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1); + return PhyPosIsL1(INPUT_TYPE::pos) && (INPUT_TYPE::layout != LayoutMode::NORMAL || + ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1); } template __aicore__ inline constexpr CopyCubeInType GetCopyCubeInType() { - if constexpr (PhyPosIsL1(INPUT_TYPE::pos)) { + if constexpr (InputPhyPosIsL1()) { return CopyCubeInType::FROM_L1; + } else if constexpr 
(MatmulFeatureTrait::IsSupportUBToL1Singleshape() && InputPhyPosIsUB()) { + return CopyCubeInType::UBTOL1_SINGLESHAPE; } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) { return CopyCubeInType::NORMAL; } else if constexpr (DoMatmulNorm(MM_CFG)) { if constexpr (INPUT_TYPE::layout != LayoutMode::NONE && ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1) { return CopyCubeInType::BMM; + } else if constexpr (IsScaleTag()) { + return CopyCubeInType::MX_NORM; } else { return CopyCubeInType::NORMAL; } } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { if constexpr (HasSparseIndex()) { return CopyCubeInType::SPARSE_MDL; + } else if constexpr (IsScaleTag()) { + return CopyCubeInType::MX_MDL; } else { return CopyCubeInType::MDL; } -- Gitee From 04ac56f38095157b3eba114eef03f9b54f1761af Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:21:55 +0000 Subject: [PATCH 27/56] add Signed-off-by: jiangchengcheng-on --- .../copy_cube_in/batch/batch_copy_cube_in.h | 2 +- .../batch/batch_copy_cube_in_params.h | 2 +- .../copy_cube_in/bias/copy_bias_in_batch.h | 6 +++--- .../stage/copy_cube_in/bias/copy_bias_in_intf.h | 4 ++-- .../stage/copy_cube_in/bias/copy_bias_in_v220.h | 7 ++++--- .../stage/copy_cube_in/bias/load_bias_to_c2.h | 17 ++++++++++++++--- 6 files changed, 25 insertions(+), 13 deletions(-) diff --git a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h index f4e75e59..be535e2f 100644 --- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h +++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in.h @@ -261,7 +261,7 @@ private: int64_t dstOffset = batchNum * splitIdx * dstStride; // 3. loop copy NZ data by batch - bool iskRowDirec = IS_KROW && IsSameTypeV; + bool iskRowDirec = IS_KROW && IsSuppportB8(); auto batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * srcStride; GlobalTensor srcGlobal; srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_); diff --git a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h index d5ea033f..ea2979a1 100644 --- a/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h +++ b/impl/matmul/stage/copy_cube_in/batch/batch_copy_cube_in_params.h @@ -81,7 +81,7 @@ public: template __aicore__ inline int64_t GetSingleSizeAlign() const { - if constexpr (IS_KROW && IsSameTypeV) { + if constexpr (IS_KROW && IsSuppportB8()) { return CeilAlign(GetSingleHeight(), c0Size_) * CeilAlign(GetSingleWidth(), c0Size_); } else { diff --git a/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_batch.h b/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_batch.h index bef7999f..000a3fd4 100644 --- a/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_batch.h +++ b/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_batch.h @@ -55,17 +55,17 @@ private: __aicore__ inline void BiasCopy(LocalTensor& bias, TensorT& srcTensor, int32_t dataLen, int32_t dataNum, int32_t srcOffset) { + Nd2NzParams intriParams {1, 1, static_cast(dataLen), 0, static_cast(dataLen), 1, 1, 0}; // Check if the bias is batched or not if constexpr (!ToMatmulConfig(MM_CFG).isBiasBatch) { // Not batched, only copy the data once - DataCopy(bias, srcTensor, { 1, 1, static_cast(dataLen), 0, 1, 1, 1, 0 }); + DataCopy(bias, srcTensor, intriParams); } else { // Batched, copy the data one by one int32_t dstOffset = 0; auto dstStride = CeilAlign(dataLen, 
c0Size_); for (int32_t i = 0; i < dataNum; ++i) { - DataCopy(bias[dstOffset], srcTensor[srcOffset], - { 1, 1, static_cast(dataLen), 0, 1, 1, 1, 0 }); + DataCopy(bias[dstOffset], srcTensor[srcOffset], intriParams); srcOffset += dataLen; dstOffset += dstStride; } diff --git a/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_intf.h b/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_intf.h index 946682ef..38cbb34d 100644 --- a/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_intf.h +++ b/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_intf.h @@ -28,8 +28,8 @@ namespace Detail { template class CopyBiasIn { using BiasT = typename BIAS_TYPE::T; - using TensorT = typename Conditional<(PhyPosIsGM(BIAS_TYPE::pos) || !MatmulFeatureTrait::IsSupportUBToL1()), - GlobalTensor, LocalTensor>::type; + using TensorT = typename Conditional<(PhyPosIsGM(BIAS_TYPE::pos) || (!MatmulFeatureTrait::IsSupportUBToL1() + && !MatmulFeatureTrait::IsSupportUBToL1Singleshape())), GlobalTensor, LocalTensor>::type; public: __aicore__ inline CopyBiasIn() = default; __aicore__ inline ~CopyBiasIn() = default; diff --git a/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_v220.h b/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_v220.h index 20fc8cc6..49220347 100644 --- a/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_v220.h +++ b/impl/matmul/stage/copy_cube_in/bias/copy_bias_in_v220.h @@ -41,8 +41,8 @@ class CopyBiasIn::IsSupportUBToL1()), - GlobalTensor, LocalTensor>::type; + using TensorT = typename Conditional<(PhyPosIsGM(BIAS_TYPE::pos) || (!MatmulFeatureTrait::IsSupportUBToL1() + && !MatmulFeatureTrait::IsSupportUBToL1Singleshape())), GlobalTensor, LocalTensor>::type; public: __aicore__ inline CopyBiasIn() = default; @@ -71,7 +71,8 @@ public: } } } - DataCopy(bias, srcTensor[srcOffset], { 1, 1, static_cast(dataLen), 0, 1, 1, 1, 0 }); + Nd2NzParams intriParams {1, 1, static_cast(dataLen), 0, static_cast(dataLen), 1, 1, 0}; + DataCopy(bias, srcTensor[srcOffset], intriParams); } } diff --git a/impl/matmul/stage/copy_cube_in/bias/load_bias_to_c2.h b/impl/matmul/stage/copy_cube_in/bias/load_bias_to_c2.h index c752e58c..d95c4a58 100644 --- a/impl/matmul/stage/copy_cube_in/bias/load_bias_to_c2.h +++ b/impl/matmul/stage/copy_cube_in/bias/load_bias_to_c2.h @@ -19,6 +19,9 @@ namespace AscendC { namespace Impl { namespace Detail { +constexpr int32_t DOUBLE_NUM = 2; +constexpr int32_t LEN_CEIL_DIV_NUM = 32; + /** * LoadBias2C2: responsible for load bias data into C2 buffer. * This module provides ablities to copy bias data in C2 Buffer. @@ -27,7 +30,7 @@ namespace Detail { */ template class LoadBias2C2 { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using BiasT = typename BIAS_TYPE::T; public: @@ -52,7 +55,7 @@ class LoadBias2C2 || DoMatmulSpecialMDL(MM_CFG) || IsBasicBlockEnable || DoMatmulIBShareNorm(MM_CFG))>> { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using BiasT = typename BIAS_TYPE::T; public: @@ -61,7 +64,15 @@ public: __aicore__ inline void Load(const LocalTensor& biasC2, const LocalTensor& bias, int32_t dataLen) { - constexpr auto biasType = IsSameType::value ? 2 : 1; // 2:f32, 1:f16 + constexpr auto biasType = IsSameType::value ? 
2 : 1; // 2:f32, 1:f16/bf16 + if (MatmulFeatureTrait::IsSupportLoad2dV2()) { + uint16_t lenBurst = CeilDiv(dataLen * biasType * DOUBLE_NUM, LEN_CEIL_DIV_NUM); + if constexpr(IsSameType::value) { + lenBurst = CeilAlign(lenBurst, DOUBLE_NUM); + } + DataCopy(biasC2, bias, {1, lenBurst, 0, 0}); + return; + } uint16_t lenBurst = (dataLen * biasType * 2 + 63) / 64; DataCopy(biasC2, bias, {1, lenBurst, 0, 0}); } -- Gitee From 8ebbb72fc9ca43a1db364a4fdb0c307aa9d44ce7 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:23:30 +0000 Subject: [PATCH 28/56] add Signed-off-by: jiangchengcheng-on --- .../copy_tile_to_cube_common.h | 64 +++++++++++++++++-- .../copy_tile_to_cube/data_copy_wrapper.h | 3 + .../data_copy_wrapper_intf.h | 6 ++ .../copy_tile_to_cube/data_copy_wrapper_nd.h | 5 +- .../copy_tile_to_cube/data_copy_wrapper_nz.h | 3 +- .../data_copy_wrapper_using_ub_nd.h | 3 +- 6 files changed, 75 insertions(+), 9 deletions(-) diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/copy_tile_to_cube_common.h b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/copy_tile_to_cube_common.h index 321be52e..b52e9a10 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/copy_tile_to_cube_common.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/copy_tile_to_cube_common.h @@ -22,7 +22,8 @@ namespace Impl { namespace Detail { template -class CopyTileToCubeWrapper::IsNeedUB()>> { +class CopyTileToCubeWrapper::IsNeedUB() && + !(INPUT_TYPE::TAG == InputTypeTag::scaleA || INPUT_TYPE::TAG == InputTypeTag::scaleB)>> { using TransT = typename INPUT_TYPE::TRANS_T; using SrcT = typename INPUT_TYPE::T; @@ -50,7 +51,7 @@ public: CopyTileToCubeByCallBack(dst, curRow, curCol, tileHeight, tileWidth); } else { constexpr int32_t widthFactor = - IsSameTypeV && INPUT_TYPE::format == CubeFormat::ND ? INT4_TWO : 1; + IsSuppportB4() && INPUT_TYPE::format == CubeFormat::ND ? 
INT4_TWO : 1; if (IsTranspose()) { if constexpr (IsCopyFromUB()) { LocalTensor src; @@ -131,9 +132,17 @@ private: __aicore__ inline bool IsTranspose() { if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { - return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeA(); + if constexpr(INPUT_TYPE::format == CubeFormat::COLUMN_MAJOR) { + return !MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeA(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeA(); + } } else { - return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeB(); + if constexpr(INPUT_TYPE::format == CubeFormat::COLUMN_MAJOR) { + return !MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeB(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeB(); + } } } @@ -207,6 +216,41 @@ private: } } + __aicore__ inline void CopyDN2NZForInt8(const LocalTensor& dst, const GlobalTensor& src, + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, int32_t baseHeight, int32_t baseWidth, + int32_t orgHeight, int32_t orgWidth, int32_t stepCol, bool iskRowDirec) + { + if (tileWidth < baseWidth || baseWidth % c0Size_ == 0 || stepCol == 1) { + MATMUL_MODULE(DataCopyWrapper)->CopyDN2NZ(dst, src, curRow * baseHeight, curCol * baseWidth, tileHeight, + tileWidth, orgWidth, 1, 0, 0, iskRowDirec); + } else { + if ((stepCol - 1) * baseWidth > tileWidth) { + stepCol = Ceil(tileWidth, baseWidth); + } + int32_t dstNzMatrixStride = CeilAlign(baseWidth, c0Size_) * CeilAlign(tileHeight, c0Size_); + if (likely(dstNzMatrixStride <= UINT16_MAX)) { + MATMUL_MODULE(DataCopyWrapper)->CopyDN2NZ(dst, src, curRow * baseHeight, + curCol * baseWidth, tileHeight, baseWidth, orgWidth, stepCol - 1, baseWidth, + dstNzMatrixStride, iskRowDirec); + MATMUL_MODULE(DataCopyWrapper)->CopyDN2NZ(dst[(stepCol - 1) * dstNzMatrixStride], src, + curRow * baseHeight, (curCol + stepCol - 1) * baseWidth, tileHeight, + tileWidth - (stepCol - 1) * baseWidth, orgWidth, 1, 0, 0, iskRowDirec); + } else { + int32_t colIndex = curCol * baseWidth; + int32_t dstOffset = 0; + for (int i = 0; i < stepCol; ++i) { + if (i == stepCol - 1) { + baseWidth = tileWidth - (stepCol - 1) * baseWidth; + } + MATMUL_MODULE(DataCopyWrapper)->CopyDN2NZ(dst[dstOffset], src, curRow * baseHeight, colIndex, tileHeight, + baseWidth, orgWidth, 1, 0, 0, iskRowDirec); + colIndex += baseWidth; + dstOffset += dstNzMatrixStride; + } + } + } + } + template __aicore__ inline void CopyTileToCubeFromGM(const LocalTensor& dst, const GlobalTensor& src, int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, int32_t widthFactor) @@ -228,6 +272,16 @@ private: } else { MATMUL_MODULE(DataCopyWrapper)->CopyND2NZ(dst, src, curRow * baseHeight, curCol * baseWidth, tileHeight, tileWidth, orgWidth); } +#if defined(__DAV_C310__) + } else if constexpr (INPUT_TYPE::format == CubeFormat::COLUMN_MAJOR) { + if constexpr (sizeof(TransT) == sizeof(int8_t)) { + CopyDN2NZForInt8(dst, src, curRow, curCol, tileHeight, tileWidth, baseHeight, baseWidth, + orgHeight, orgWidth, stepCol, iskRowDirec); + } else { + MATMUL_MODULE(DataCopyWrapper)->CopyDN2NZ(dst, src, curRow * baseHeight, curCol * baseWidth, + tileHeight, tileWidth, orgWidth); + } +#endif } else if constexpr (INPUT_TYPE::format == CubeFormat::NZ) { MATMUL_MODULE(DataCopyWrapper)->CopyNZ2NZ(dst, src, curRow * baseHeight, curCol * baseWidth, tileHeight, tileWidth, orgHeight, iskRowDirec); @@ -237,7 +291,7 @@ private: return; } else { ASCENDC_ASSERT(false, - { KERNEL_LOG(KERNEL_ERROR, "MatmulApi only support input 
format ND/NZ/VECTOR/SCALAR."); }); + { KERNEL_LOG(KERNEL_ERROR, "Unsupported Matmul format type."); }); } } diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper.h b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper.h index bcec5724..fc179a23 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper.h @@ -19,5 +19,8 @@ #include "data_copy_wrapper_nz.h" #include "data_copy_wrapper_vec.h" #include "data_copy_wrapper_using_ub_nd.h" +#if defined(__DAV_C310__) +#include "data_copy_wrapper_dn.h" +#endif #endif // IMPL_MATMUL_STAGE_COPY_CUBE_IN_COPY_TILE_TO_CUBE_DATA_COPY_WRAPPER_H diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_intf.h b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_intf.h index 7ca01a1a..efcd4773 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_intf.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_intf.h @@ -47,6 +47,12 @@ public: __aicore__ inline void CopyND2NZ(const LocalTensor& dst, const LocalTensor& src, const int32_t row, const int32_t col, const int32_t height, const int32_t width, const int32_t gCol) {} + // CopyDN2NZ, support for C310 + __aicore__ inline void CopyDN2NZ(const LocalTensor& dst, const GlobalTensor& src, + const int32_t row, const int32_t col, const int32_t height, const int32_t width, const int32_t gCol, + const int32_t dnNum = 1, const int32_t srcDnMatrixStride = 0, const int32_t dstNzMatrixStride = 0, + const bool kAlignToC0Size = false) {} + // CopyNZ2NZ, support for V200/V220 __aicore__ inline void CopyNZ2NZ(const LocalTensor& dst, const LocalTensor& src, const int32_t row, const int32_t col, const int32_t height, const int32_t width, const int32_t gRow) {} diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nd.h b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nd.h index 95fe52c4..8639326b 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nd.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nd.h @@ -24,7 +24,8 @@ namespace Detail { template class DataCopyWrapper::IsNeedUB() && INPUT_TYPE::format == CubeFormat::ND>> { + enable_if_t::IsNeedUB() && INPUT_TYPE::format == CubeFormat::ND && + !(INPUT_TYPE::TAG == InputTypeTag::scaleA || INPUT_TYPE::TAG == InputTypeTag::scaleB)>> { MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(LocalWorkspace); @@ -62,7 +63,7 @@ public: } } int64_t srcOffset; - if constexpr (IsSameTypeV) { + if constexpr (IsSuppportB4()) { srcOffset = ((int64_t)row * (int64_t)gCol * INT4_TWO + (int64_t)col); } else { srcOffset = ((int64_t)row * (int64_t)gCol + (int64_t)col); diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nz.h b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nz.h index 3ba615c8..b26d9bed 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nz.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_nz.h @@ -23,7 +23,8 @@ namespace Impl { namespace Detail { template -class DataCopyWrapper> { +class DataCopyWrapper> { MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); MATMUL_USE_MODULE(LocalWorkspace); diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_using_ub_nd.h 
b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_using_ub_nd.h index 6e0b15f9..30228c84 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_using_ub_nd.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_using_ub_nd.h @@ -24,7 +24,8 @@ namespace Detail { template class DataCopyWrapper ::IsNeedUB() && INPUT_TYPE::format == CubeFormat::ND>> { + enable_if_t::IsNeedUB() && INPUT_TYPE::format == CubeFormat::ND && + !(INPUT_TYPE::TAG == InputTypeTag::scaleA || INPUT_TYPE::TAG == InputTypeTag::scaleB)>> { MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(LocalWorkspace); -- Gitee From 1a546f7e20cf1ea947d8137af623a1748fef1840 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:35:19 +0000 Subject: [PATCH 29/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/stage/copy_cube_in/copy_cube_in.h | 3 +- .../data_copy_wrapper_utils.h | 5 +- .../copy_tile_to_cube/data_copy_wrapper_vec.h | 3 +- .../stage/copy_cube_out/copy_cube_out.h | 1 + .../stage/copy_cube_out/copy_cube_out_atop.h | 123 +++ .../copy_cube_out/copy_cube_out_datacopy.h | 844 +++++++++--------- 6 files changed, 545 insertions(+), 434 deletions(-) create mode 100644 impl/matmul/stage/copy_cube_out/copy_cube_out_atop.h diff --git a/impl/matmul/stage/copy_cube_in/copy_cube_in.h b/impl/matmul/stage/copy_cube_in/copy_cube_in.h index ec9224ca..83e2c199 100644 --- a/impl/matmul/stage/copy_cube_in/copy_cube_in.h +++ b/impl/matmul/stage/copy_cube_in/copy_cube_in.h @@ -7,7 +7,6 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ - /*! 
* \file copy_cube_in.h * \brief @@ -16,7 +15,9 @@ #ifndef IMPL_MATMUL_STAGE_COPY_CUBE_IN_BASE_COPY_CUBE_IN_H #define IMPL_MATMUL_STAGE_COPY_CUBE_IN_BASE_COPY_CUBE_IN_H +#include "base/copy_cube_in_atop.h" #include "base/copy_cube_in_from_l1.h" +#include "base/copy_cube_in_ubtol1_singleshape.h" #include "base/copy_cube_in_mdl.h" #include "base/copy_cube_in_norm.h" #if __CCE_AICORE__ == 220 diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_utils.h b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_utils.h index e8471b4d..4ccd3b0d 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_utils.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_utils.h @@ -29,6 +29,7 @@ constexpr int32_t EACH_BLOCK_BYTES_MM_API = 32; constexpr int32_t CACHE_LINE_SIZE_MM_API = 512; constexpr int32_t TRANS_DATA_ARRAY_SIZE_MM_API = 16; constexpr int32_t MAX_BLOCK_COUNT_SIZE_MM_API = 4095; +constexpr int32_t MM_NUM_TWO = 2; template __aicore__ inline void NDPadZeroForWidth(LocalTensor& dst, const int height, const int calcWidth, @@ -288,7 +289,7 @@ __aicore__ inline void CopyNZ2NZImpl(const LocalTensor& dst, const Globa auto alignHeight = Ceil(height, BLOCK_CUBE) * BLOCK_CUBE; int32_t blockLen = alignHeight * c0Size_ * sizeof(TransT) / ONE_BLK_SIZE; int32_t srcStride = (alignedGRow - alignHeight) * (c0Size_ * sizeof(TransT) / ONE_BLK_SIZE); - if constexpr (IsSameTypeV) { + if constexpr (IsSuppportB4()) { blockLen /= INT4_TWO; srcStride /= INT4_TWO; } @@ -300,7 +301,7 @@ __aicore__ inline void CopyNZ2NZImpl(const LocalTensor& dst, const Globa } else { uint16_t nburst = Ceil(width, c0Size_); int32_t dstStride = 0; - if constexpr (IsSameTypeV) { + if constexpr (IsNeedC0Align()) { if (kAlignToC0Size) { auto alignHeightC0Size = Ceil(height, c0Size_) * c0Size_; dstStride = alignHeightC0Size - alignHeight; diff --git a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_vec.h b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_vec.h index cf73e0fb..1891d5cb 100644 --- a/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_vec.h +++ b/impl/matmul/stage/copy_cube_in/copy_tile_to_cube/data_copy_wrapper_vec.h @@ -22,7 +22,8 @@ namespace Impl { namespace Detail { template -class DataCopyWrapper> { +class DataCopyWrapper> { using TransT = typename INPUT_TYPE::TRANS_T; using SrcT = typename INPUT_TYPE::T; diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out.h b/impl/matmul/stage/copy_cube_out/copy_cube_out.h index 552a77e9..24947ed5 100644 --- a/impl/matmul/stage/copy_cube_out/copy_cube_out.h +++ b/impl/matmul/stage/copy_cube_out/copy_cube_out.h @@ -18,6 +18,7 @@ #if __CCE_AICORE__ >= 220 #include "copy_cube_out_fixpipe.h" + #include "copy_cube_out_atop.h" #else #include "copy_cube_out_datacopy.h" #endif diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out_atop.h b/impl/matmul/stage/copy_cube_out/copy_cube_out_atop.h new file mode 100644 index 00000000..eb6a623b --- /dev/null +++ b/impl/matmul/stage/copy_cube_out/copy_cube_out_atop.h @@ -0,0 +1,123 @@ +/* * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file copy_cube_out_atop.h + * \brief + */ + +#ifndef IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_ATOP_H +#define IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_ATOP_H + +#include "../../utils/matmul_module.h" +#include "quant/quant_processor_utils.h" +#include "copy_cube_out_intf.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +/* + CopyCubeOut is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeOut is only for internal usage, does not support extension or customized specialization! +*/ +template class CopyOut> +class CopyCubeOut()>, + CopyOut> { + using DstT = typename C_TYPE::T; + using SrcT = typename GetMmDstType::Type; + + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulSubBlockInfo); + +public: + __aicore__ inline CopyCubeOut() = default; + __aicore__ inline ~CopyCubeOut() = default; + + template + __aicore__ inline void Copy(const GlobalTensor &gm, const LocalTensor &co1Local, int curRow, int curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, + const ScheduleContext &context = 0) + { + CopyOut copyOut; + copyOut(gm, co1Local, curRow, curCol, baseHeight, baseWidth, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKc()); + } + + template + __aicore__ inline void Copy(const LocalTensor &co2Local, const LocalTensor &co1Local, int curRow, int curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, + const ScheduleContext &context = 0) + { + CopyOut copyOut; + copyOut(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKc(), + MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx()); + } +}; + +template class CopyOut> +class CopyCubeOut()>, + CopyOut> { + using DstT = typename C_TYPE::T; + using SrcT = typename GetMmDstType::Type; + + MATMUL_USE_MODULE(MatmulQuantProcessor); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulShapeTiling); + +public: + __aicore__ inline CopyCubeOut() = default; + __aicore__ inline ~CopyCubeOut() = default; + + template + __aicore__ inline void Copy(const GlobalTensor &gm, const LocalTensor &co1Local, int curRow, int curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, + const ScheduleContext &context = 0) + { + CopyOut copyOut; + if (MATMUL_MODULE(MatmulQuantProcessor)->IsPerChannelSenario()) { + LocalTensor quantTensor; + MATMUL_MODULE(MatmulQuantProcessor)->CopyQuantTensor(quantTensor, curCol, baseWidth); + copyOut(gm, co1Local, curRow, curCol, baseHeight, baseWidth, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), + 
MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKc(), quantTensor); + MATMUL_MODULE(MatmulQuantProcessor)->FreeQuantTensor(quantTensor); + } else { + copyOut(gm, co1Local, curRow, curCol, baseHeight, baseWidth, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(), + MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKc(), + MATMUL_MODULE(MatmulQuantProcessor)->GetQuantScalarValue()); + } + } +}; +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_ATOP_H \ No newline at end of file diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h b/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h index 2abc716c..f011f33e 100644 --- a/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h +++ b/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -9,542 +9,526 @@ */ /*! - * \file copy_cube_out_datacopy.h + * \file copy_cube_out_datacopy_wrapper.h * \brief */ -#ifndef IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H -#define IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H +#ifndef IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_WRAPPER_H +#define IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_WRAPPER_H #include "../../utils/matmul_module.h" #include "../../utils/matmul_param.h" -#include "copy_cube_out_intf.h" -#include "copy_cube_out_datacopy_wrapper.h" +#include "copy_cube_out_utils.h" namespace AscendC { namespace Impl { namespace Detail { + +constexpr int32_t DOUBLE_SPACE = 2; +constexpr int32_t TWO_TIMES = 2; +constexpr int32_t EIGHT_TIMES = 8; +constexpr int32_t SHIFT_16_BIT = 16; +constexpr int32_t SHIFT_32_BIT = 32; +constexpr int32_t SHIFT_48_BIT = 48; +constexpr uint32_t MAX_REPEAT_STRIDE = 255; +constexpr int32_t PATTERN_SIZE = 8; +constexpr int32_t PATTERN_OFFSET = 2; /* - CopyCubeOut is considered entirely experimental. + CopyCubeOutWrapper is considered entirely experimental. We retain the freedom to make incompatible changes, but do not guarantee the stability. - CopyCubeOut is only for internal usage, does not support extension or customized specialization! + CopyCubeOutWrapper is only for internal usage, does not support extension or customized specialization!
*/ template -class CopyCubeOut::IsNeedUB())>> +class CopyCubeOutWrapper { using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; - MATMUL_USE_MODULE(MatmulQuantProcessor); MATMUL_USE_MODULE(MatmulShapeInfo); MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(LocalWorkspace); - MATMUL_USE_MODULE(CopyCubeOutUtils); public: - template - __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co1Local, int32_t curRow, - int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, - int32_t baseBlockWidth, const ScheduleContext& context = 0) - { - CopyOutImpl(gm, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, - baseBlockWidth); - } + __aicore__ inline CopyCubeOutWrapper() = default; + __aicore__ inline ~CopyCubeOutWrapper() = default; - template - __aicore__ inline void Copy(const LocalTensor& co2Local, const LocalTensor& co1Local, int32_t curRow, - int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, - int32_t baseBlockWidth, const ScheduleContext& context = 0) + // get blockCount by DstT + __aicore__ inline constexpr int32_t GetBlockCount() { - CopyOutImpl(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, - baseBlockWidth); + return sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); } - template - __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co2Local, - const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, - int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, - const ScheduleContext& context = 0) + // if target is not aligned, must copy the unalign data to trans UB + __aicore__ inline bool IsNeedPadUnalignedToTrans(const int32_t baseWidth, const uint32_t dimN, + const bool isComputeLineByLine, const bool isTargetAligned) { - CopyOutImpl(gm, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, - baseBlockWidth); - } - -private: - template - __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co1Local, int32_t curRow, int32_t curCol, - int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, - int32_t baseBlockWidth) - { - if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - CopyOutNZ2ND(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, - baseBlockWidth); - } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - CopyOutNZ2NZ(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, - baseBlockWidth); + if constexpr (IsSameType::value) { + bool isOdd = false; + if constexpr (IsSameType::value || IsSameType::value) { + if (baseWidth % TWO_TIMES > 0) { + isOdd = true; + } + } + bool isSingleCore = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() <= MATMUL_MODULE(MatmulShapeInfo)-> + GetSingleCoreM()) && (dimN <= MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN()); + bool isMutiCoreNeedPad = !isSingleCore && !isComputeLineByLine; + return (!isTargetAligned && (isSingleCore || isMutiCoreNeedPad) && !isOdd); } else { - ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type."); }); + return (!isTargetAligned); } } - template - __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co2Local, - const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, - int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + // copy the unalign data to trans UB + template + 
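// --- Illustrative standalone C++ sketch (constants mirror ONE_BLK_SIZE = 32 bytes and
// BLOCK_CUBE = 16 used above; the helper name is an assumption): the element counts that
// GetBlockCount() produces per output type, which drive the unaligned-tail handling here. ---
#include <cstdint>

constexpr int32_t kOneBlkSize = 32;
constexpr int32_t kBlockCube = 16;

template <typename DstT>
constexpr int32_t BlockCountSketch()
{
    // 4-byte outputs align to the 16-element cube block; smaller types align to one 32-byte block.
    return sizeof(DstT) == 4 ? kBlockCube : kOneBlkSize / static_cast<int32_t>(sizeof(DstT));
}

static_assert(BlockCountSketch<float>() == 16, "fp32: cube block of 16 elements");
static_assert(BlockCountSketch<int16_t>() == 16, "fp16/int16: 32B / 2B = 16 elements");
static_assert(BlockCountSketch<int8_t>() == 32, "int8: 32B / 1B = 32 elements");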
__aicore__ inline void PadUnalignedToTrans(const LocalTensor& trans, const GlobalTensor& gm, + int32_t dstOffset, bool isComputeLineByLine, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) { - if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - CopyOutNZ2ND(dst, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, - baseBlockHeight, baseBlockWidth); - } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - CopyOutNZ2NZ(dst, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, - baseBlockHeight, baseBlockWidth); + int32_t alignedSize; + if constexpr (IsSameType::value || IsSameType::value) { + alignedSize = GetC0Size(); } else { - ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type."); }); + alignedSize = BLOCK_CUBE; } - } + int32_t baseUseN = CeilAlign(baseWidth, alignedSize); + int32_t gmTailOffset = dstOffset + baseUseN - GetBlockCount(); + int32_t transTailOffset = baseUseN - GetBlockCount(); - template - __aicore__ inline void CopyOutNZ2NZ(const LocalTensor& co2Local, const LocalTensor& co1Local, - int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, - int32_t baseBlockHeight, int32_t baseBlockWidth) - { - ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() >= MATMUL_MODULE(MatmulShapeTiling)-> - GetTiling().GetBaseM()), { KERNEL_LOG(KERNEL_ERROR, "M_ is %d , which should be not less than baseM %d", - MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(), MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()); - }); + auto enQueEvtID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); + SetFlag(enQueEvtID); + WaitFlag(enQueEvtID); - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = baseBlockWidth; - dataCopyInfo.blockLen = baseBlockHeight; - dataCopyInfo.srcStride = 0; - DataCopyEnhancedParams enhancedParams; - enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; - if constexpr (enSequentialWrite) { - dataCopyInfo.dstStride = 0; - CopyCo12Co2WithQuant(co2Local, co1Local, curCol, baseBlockHeight, baseBlockWidth, dataCopyInfo, - enhancedParams); + if (isComputeLineByLine) { + if constexpr (enSequentialWrite) { + PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, baseUseN, baseHeight); + } else { + PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, + MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(), baseHeight); + } } else { - dataCopyInfo.dstStride = (CeilAlign(MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM(), BLOCK_CUBE) - - baseBlockHeight * BLOCK_CUBE) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; - int32_t dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * BLOCK_CUBE + - curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * - MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(); - CopyCo12Co2WithQuant(co2Local[dstOffset], co1Local, curCol, baseBlockHeight, baseBlockWidth, - dataCopyInfo, enhancedParams); + PadUnalignedToTransWithStride(trans[transTailOffset], gm[gmTailOffset], baseHeight, baseWidth, baseBlockWidth); } + + // if copy gm to ub, must add the set/wait flag to wait the UB has be writed; + event_t eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); + SetFlag(eventIDMte2ToV); + WaitFlag(eventIDMte2ToV); } - template - __aicore__ inline void CopyOutNZ2NZ(const GlobalTensor& gm, const LocalTensor& co1Local, int32_t curRow, - int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, - 
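// --- Illustrative standalone C++ sketch (example values are assumed): the tail-offset
// arithmetic used by PadUnalignedToTrans above, for a half-precision tile with baseWidth = 30. ---
#include <cstdint>

constexpr int32_t CeilAlignSketch(int32_t x, int32_t align) { return (x + align - 1) / align * align; }

constexpr int32_t kBlockCount = 16;                       // GetBlockCount() for a 2-byte output
constexpr int32_t kBaseUseN = CeilAlignSketch(30, 16);    // 32 columns after alignment
constexpr int32_t kTransTail = kBaseUseN - kBlockCount;   // last block inside the trans buffer
// gmTailOffset addresses the same last block in the GM row: dstOffset + kBaseUseN - kBlockCount.
static_assert(kBaseUseN == 32 && kTransTail == 16, "tail block starts 16 elements into the padded row");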
int32_t baseBlockWidth) + // trans nz buffer to nd buffer + __aicore__ inline auto TransNZ2NDByVec(const LocalTensor& trans, const LocalTensor& localBuf, + int32_t blockHigh, int32_t blockWidth, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockWidth) { CopyOutEnQue(); - - LocalTensor localBuf = MATMUL_MODULE(LocalWorkspace)->GetTempWorkspace(); - CopyCo12Local(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth); - + ASCENDC_ASSERT(((blockWidth * GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) <= MAX_REPEAT_TIMES), { + KERNEL_LOG(KERNEL_ERROR, "blockWidth is %d, blockCount is %d, repeat time exceed max time %d", blockWidth, + GetBlockCount(), MAX_REPEAT_TIMES); + }); + if constexpr (IsSameType::value || IsSameType::value) { + TransNZ2NDByVecDstB8(trans, localBuf, blockHigh, blockWidth, baseHeight, baseWidth, baseBlockWidth); + } else { + TransNZ2NDByVecDstNotB8(trans, localBuf, blockHigh, blockWidth, baseHeight, baseWidth, baseBlockWidth); + } CopyOutDeQue(); - - CopyLocal2GMNZ2NZ(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); } + // copy trans buffer to gm template - __aicore__ inline void CopyOutNZ2NZ(const GlobalTensor& gm, const LocalTensor& co2Local, - const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, - int32_t baseBlockHeight, int32_t baseBlockWidth) + __aicore__ inline void CopyTrans2GM(const GlobalTensor& gm, const LocalTensor& trans, + int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, + int32_t dstOffset, int32_t offset, int32_t dstStride, bool isComputeLineByLine, bool isTargetAligned) { - CopyOutNZ2NZ(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); - CopyLocal2GMNZ2NZ(gm, co2Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); - } - - template - __aicore__ inline void CopyOutNZ2ND(const LocalTensor& co2Local, const LocalTensor& co1Local, - int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, - int32_t baseBlockHeight, int32_t baseBlockWidth) - { - if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() == 1), { KERNEL_LOG(KERNEL_ERROR, - "M_ is %d, which should be equal with 1.", MATMUL_MODULE(MatmulShapeInfo)->GetOrgM()); }); - - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = 1; - dataCopyInfo.blockLen = baseBlockHeight * baseBlockWidth; - DataCopyEnhancedParams enhancedParams; - enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; - - if constexpr (enSequentialWrite) { - DataCopy(co2Local, co1Local, dataCopyInfo, enhancedParams); - } else { - int32_t dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); - DataCopy(co2Local[dstOffset], co1Local, dataCopyInfo, enhancedParams); + int32_t blockLen = baseBlockWidth * (GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE); + if constexpr (IsSameType::value || IsSameType::value) { + blockLen = Ceil(blockLen, TWO_TIMES); + } + if (!isComputeLineByLine) { + DataCopy(gm[dstOffset], trans, { static_cast(baseHeight), static_cast(blockLen), 0, + static_cast(dstStride) }); + return; + } + if constexpr (IsSameType::value) { + if constexpr (!enSequentialWrite) { + dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * + MATMUL_MODULE(MatmulShapeInfo)->GetOrgN() + curCol * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + offset = 
MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); } - } else { - ASCENDC_ASSERT((!IsSameType::value && !IsSameType::value), - { KERNEL_LOG(KERNEL_ERROR, "Data format should be NZ if GetTensorC to UB when output is int8_t."); }); - - LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->GetTempWorkspace(); - - CopyCo12Co2WithoutQuant(trans, co1Local, curCol, baseBlockHeight, baseBlockWidth); - - if constexpr(enSequentialWrite) { - TransNZ2NDForDstUB(co2Local, trans, MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), - baseHeight, baseBlockWidth, baseBlockHeight); + int32_t newBlockCount; + if constexpr (IsSameType::value || IsSameType::value) { + newBlockCount = BLOCK_CUBE; } else { - uint32_t dimN = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() != 0) ? - MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() : MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); - int32_t dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + - curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); - TransNZ2NDForDstUB(co2Local[dstOffset], trans, CeilAlign(dimN, - MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()), baseHeight, baseBlockWidth, baseBlockHeight); + newBlockCount = ONE_BLK_SIZE / sizeof(DstT); } - } - } - - template - __aicore__ inline void CopyOutNZ2ND(const GlobalTensor& gm, const LocalTensor& co1Local, int32_t curRow, - int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, - int32_t baseBlockWidth) - { - CopyOutEnQue(); - - LocalTensor localBuf = MATMUL_MODULE(LocalWorkspace)->GetTempWorkspace(); - CopyCo12Local(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth); - - CopyOutDeQue(); - - if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - CopyLocal2GMNZ2NZ(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, - baseBlockWidth); - } else if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - if constexpr (!ToMatmulConfig(MM_CFG).enVecND2NZ || IsSameType::value && - IsSameType::value) { - CopyLocal2GMNZ2NDOnTheFly(gm, localBuf, curRow, curCol, baseHeight, baseWidth, - baseBlockHeight, baseBlockWidth); + if (isTargetAligned) { + CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, baseHeight, blockLen, newBlockCount, offset); + } else if (blockLen == 1) { + CopyTrans2GMByVecByLineUnalignOneBlock(gm[dstOffset], trans, baseHeight, baseWidth, blockLen, newBlockCount, offset); } else { - CopyLocal2GMNZ2NDByVec(gm, localBuf, curRow, curCol, baseHeight, baseWidth, - baseBlockHeight, baseBlockWidth); + if constexpr (IsSameType::value || IsSameType::value) { + CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, blockLen, newBlockCount, offset); + } else { + CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, blockLen, newBlockCount, offset); + } } + } else { + CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, baseHeight, blockLen, ONE_BLK_SIZE / sizeof(DstT), offset); } } - template - __aicore__ inline void CopyOutNZ2ND(const GlobalTensor& gm, const LocalTensor& co2Local, - const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, - int32_t baseBlockHeight, int32_t baseBlockWidth) + // if baseWidth is unaligned, then copy the tail data + __aicore__ inline void CopyLocal2GMNZ2NDOnTheFlyTail(const GlobalTensor& gm, const LocalTensor& localBuf, + int32_t baseHeight, int32_t baseWidth, int32_t iterIdx, int32_t calcWidth, const event_t& 
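// --- Illustrative standalone C++ sketch (example values are assumed): the per-row burst count
// (blockLen) computed at the top of CopyTrans2GM above, including the int8 halving. ---
#include <cstdint>

constexpr int32_t kOneBlkSize = 32;  // bytes, mirrors ONE_BLK_SIZE

template <typename DstT>
constexpr int32_t RowBlockLenSketch(int32_t baseBlockWidth, int32_t blockCount)
{
    // each NZ column block contributes blockCount * sizeof(DstT) bytes per row
    int32_t len = baseBlockWidth * (blockCount * static_cast<int32_t>(sizeof(DstT)) / kOneBlkSize);
    // 1-byte outputs pack two column blocks per 32-byte burst, so the burst count is halved (rounded up)
    return sizeof(DstT) == 1 ? (len + 1) / 2 : len;
}

static_assert(RowBlockLenSketch<int16_t>(4, 16) == 4, "half: 4 column blocks -> 4 bursts per row");
static_assert(RowBlockLenSketch<int8_t>(3, 32) == 2, "int8: 3 column blocks -> ceil(3/2) = 2 bursts");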
eventIDMte3ToMte2) { - CopyOutNZ2ND(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); - if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - CopyLocal2GMNZ2NZ(gm, co2Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); - } else if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - CopyOutNZ2ND(gm, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->GetNZ2NDWorkspace().template ReinterpretCast(); + trans.SetSize(GetBlockCount()); + int32_t srcTailOffset = iterIdx * GetBlockCount() + calcWidth * GetBlockCount() * CeilAlign(baseHeight, GetBlockCount()); + if (baseWidth * sizeof(DstT) > ONE_BLK_SIZE) { + CopyLocal2GMNZ2NDRegMov(gm, localBuf, trans, baseHeight, baseWidth, iterIdx, calcWidth, srcTailOffset); + } else { + if (iterIdx > 0) { + WaitFlag(eventIDMte3ToMte2); + } + if constexpr (IsSameType::value && + IsSameType::value) { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2)); + SetFlag(eventID); + WaitFlag(eventID); + } + DataCopy(trans, gm[baseWidth], { 1, 1, 0, 0 }); + event_t eventIDMte2ToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3)); + SetFlag(eventIDMte2ToMte3); + WaitFlag(eventIDMte2ToMte3); + DataCopy(gm, localBuf[srcTailOffset], { 1, 1, 0, 0 }); + PipeBarrier(); + DataCopy(gm[baseWidth], trans, { 1, 1, 0, 0 }); + if (iterIdx < baseHeight - 1) { + SetFlag(eventIDMte3ToMte2); + } } } - __aicore__ inline void CopyCo12Co2WithQuant(const LocalTensor& dst, const LocalTensor& src, - int32_t curCol, int32_t baseBlockHeight, int32_t baseBlockWidth, DataCopyParams& dataCopyInfo, DataCopyEnhancedParams& enhancedParams) +private: + __aicore__ inline void PadUnalignedToTransByLine(const LocalTensor& trans, const GlobalTensor& gm, + int32_t transStride, int32_t gmStride, int32_t baseHeight) { - if constexpr (IsSameType::value) { - MATMUL_MODULE(MatmulQuantProcessor)->UpdateDataCopyParamForQuant(enhancedParams, curCol); - uint64_t alignedHeight = baseBlockHeight * BLOCK_CUBE; - int32_t blockOffset = BLOCK_CUBE * alignedHeight; - if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8 || - MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::REQ8) { - dataCopyInfo.blockLen = baseBlockHeight; - uint64_t addr = enhancedParams.deqTensorAddr; - int32_t offset = ONE_BLK_SIZE * alignedHeight; - int32_t dstOffset = 0; - constexpr int32_t WIDTH_SIZE = ONE_BLK_SIZE * ONE_BYTE_BIT_SIZE; - constexpr int32_t STORE_SIZE = BLOCK_CUBE * ONE_BYTE_BIT_SIZE; - for (int32_t i = 0; i < Ceil(baseBlockWidth, TWO_TIMES); ++i) { - for (int32_t storeMode = 0; storeMode < TWO_TIMES; ++storeMode) { - if (baseBlockWidth % TWO_TIMES != 0 && - i == Ceil(baseBlockWidth, TWO_TIMES) - 1 && - storeMode == 1) { - continue; - } - if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8) { - enhancedParams.deqTensorAddr = addr + i * WIDTH_SIZE + storeMode * STORE_SIZE; - } - enhancedParams.sidStoreMode = (uint8_t)storeMode; - DataCopy(dst[dstOffset], src[dstOffset + storeMode * blockOffset], - dataCopyInfo, enhancedParams); - } - dstOffset += offset; - } - } else if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VDEQF16) { - dataCopyInfo.blockCount = 1; - dataCopyInfo.blockLen = baseBlockHeight; - dataCopyInfo.dstStride = 0; - uint64_t addr = enhancedParams.deqTensorAddr; - int32_t 
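// --- Illustrative standalone C++ sketch (array sizes and values are assumed): the save/restore
// idea behind the narrow-row branch of CopyLocal2GMNZ2NDOnTheFlyTail above. The block that sits
// right after the row in GM is read into a scratch tensor, the full tail block is stored (which
// clobbers it), and the saved neighbour block is then written back. ---
#include <array>

int main()
{
    std::array<int, 16> gm{};                   // gm[0..4]: this row; gm[5..]: neighbouring data
    for (int i = 5; i < 16; ++i) gm[i] = 7;
    constexpr int kBaseWidth = 5;               // row length in elements
    constexpr int kBlock = 8;                   // elements per full block store
    std::array<int, kBlock> saved{};            // "trans" analogue: DataCopy(trans, gm[baseWidth], ...)
    for (int j = 0; j < kBlock; ++j) saved[j] = gm[kBaseWidth + j];
    std::array<int, kBlock> row{};              // localBuf tail block for this row
    for (int j = 0; j < kBaseWidth; ++j) row[j] = 100 + j;
    for (int j = 0; j < kBlock; ++j) gm[j] = row[j];                 // full-block store clobbers gm[5..7]
    for (int j = 0; j < kBlock; ++j) gm[kBaseWidth + j] = saved[j];  // restore: DataCopy(gm[baseWidth], trans, ...)
    return (gm[4] == 104 && gm[6] == 7) ? 0 : 1;                     // 0 on success
}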
offset = 0; - constexpr int32_t DEQ_OFFSET = 128; - for (int32_t i = 0; i < baseBlockWidth; ++i) { - enhancedParams.deqTensorAddr = addr + i * DEQ_OFFSET; - DataCopy(dst[offset], src[offset], dataCopyInfo, enhancedParams); - offset += blockOffset; - } - } else { - DataCopy(dst, src, dataCopyInfo, enhancedParams); - } - } else { - DataCopy(dst, src, dataCopyInfo, enhancedParams); + // copy gm to trans one line by one line + int32_t dstOffset = 0; + int32_t srcOffset = 0; + int32_t blockLen = GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; + for (int32_t i = 0; i < baseHeight; ++i) { + DataCopy(trans[dstOffset], gm[srcOffset], { static_cast(1), + static_cast(blockLen), 0, 0 }); + dstOffset += transStride; + srcOffset += gmStride; } } - __aicore__ inline void CopyCo12Co2WithoutQuant(const LocalTensor& dst, const LocalTensor& src, int32_t curCol, - int32_t baseBlockHeight, int32_t baseBlockWidth) + __aicore__ inline void PadUnalignedToTransWithStride(const LocalTensor& trans, const GlobalTensor& gm, int32_t baseHeight, + int32_t baseWidth, int32_t baseBlockWidth) { - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = baseBlockWidth; - dataCopyInfo.blockLen = baseBlockHeight; - dataCopyInfo.srcStride = 0; - dataCopyInfo.dstStride = 0; - DataCopyEnhancedParams enhancedParams; - enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; - if constexpr (IsSameType::value) { - MATMUL_MODULE(MatmulQuantProcessor)->UpdateDataCopyParamForQuant(enhancedParams, curCol); - } - DataCopy(dst, src, dataCopyInfo, enhancedParams); + // copy gm to trans with stride + DataCopy(trans, gm, { static_cast(baseHeight), static_cast(1), + static_cast(MATMUL_MODULE(MatmulShapeInfo)->GetOrgN() / GetBlockCount() - 1), + static_cast(baseWidth / GetBlockCount()) }); } - __aicore__ inline void CopyCo12Local(const LocalTensor& localBuf, const LocalTensor& co1Local, int32_t curCol, int32_t baseBlockHeight, int32_t baseBlockWidth) + __aicore__ inline void TransNZ2NDByVecDstB8(const LocalTensor& trans, const LocalTensor& localBuf, + int32_t blockHigh, int32_t blockWidth, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockWidth) { - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = 1; - dataCopyInfo.blockLen = baseBlockHeight * baseBlockWidth; - DataCopyEnhancedParams enhancedParams; - if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; - } else { - enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; - ASCENDC_ASSERT((localBuf.GetSize() >= dataCopyInfo.blockLen * CUBE_MAX_SIZE), { - KERNEL_LOG(KERNEL_ERROR, "copy len is %d, which should be less than dst size %d", - dataCopyInfo.blockLen * CUBE_MAX_SIZE, localBuf.GetSize()); - }); + struct UnaryRepeatParams intriParams; + intriParams.dstBlkStride = Ceil(baseWidth, ONE_BLK_SIZE); + intriParams.srcBlkStride = 1; + uint32_t dstRepStride = Ceil(baseWidth * sizeof(DstT), ONE_BLK_SIZE) * EIGHT_TIMES; + intriParams.dstRepStride = dstRepStride; + intriParams.srcRepStride = (GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; + int32_t dstOffset = 0; + int32_t srcOffset = 0; + int32_t highBlocks = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES / MAX_REPEAT_TIMES; + int32_t highTail = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES % MAX_REPEAT_TIMES; + uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; + // mov src to dst width aligned + LocalTensor src = localBuf.template ReinterpretCast(); + LocalTensor dst = trans.template ReinterpretCast(); + SetVectorMask(mask[1], mask[0]); + constexpr int64_t srcOffsetStride = 
BLOCK_CUBE * EIGHT_TIMES; + const int64_t dstOffsetStride = baseBlockWidth * BLOCK_CUBE * EIGHT_TIMES / TWO_TIMES; + for (int32_t i = 0; i < Ceil(blockWidth, TWO_TIMES); ++i) { + if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { + // if the baseWidth is not aligned, set the mask value; + if (i == (Ceil(blockWidth, TWO_TIMES) - 1) && (baseWidth % GetBlockCount() != 0)) { + uint64_t masktail = (1 << (Ceil(baseWidth % GetBlockCount(), TWO_TIMES))) - 1; + mask[0] = + masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << SHIFT_48_BIT); + mask[1] = mask[0]; + SetVectorMask(mask[1], mask[0]); + } + } + int32_t dstMulsOffset = dstOffset; + for (int32_t j = 0; j < highBlocks; ++j) { + Muls( + dst[dstMulsOffset], src[srcOffset], (int16_t)1, mask, MAX_REPEAT_TIMES, intriParams); + srcOffset += MAX_REPEAT_TIMES * BLOCK_CUBE; + dstMulsOffset += blockWidth * GetBlockCount() * MAX_REPEAT_TIMES; + } + if (highTail > 0) { + if (dstRepStride > MAX_REPEAT_STRIDE) { + int32_t tmpSrcOffset = srcOffset; + for (int32_t j = 0; j < highTail; j++) { + Muls(dst[dstMulsOffset], src[tmpSrcOffset], (int16_t)1, mask, 1, intriParams); + dstMulsOffset += dstOffsetStride; + tmpSrcOffset += srcOffsetStride; + } + } else { + Muls(dst[dstMulsOffset], src[srcOffset], (int16_t)1, mask, highTail, intriParams); + } + srcOffset += highTail * BLOCK_CUBE * EIGHT_TIMES; + } + dstOffset += BLOCK_CUBE; } - CopyCo12Co2WithQuant(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth, dataCopyInfo, enhancedParams); } - __aicore__ inline void CopyLocal2GMNZ2NZNotSeq(const GlobalTensor& gm, const LocalTensor& localBuf, - int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, - int32_t baseBlockWidth) + __aicore__ inline void TransNZ2NDByVecDstNotB8(const LocalTensor& trans, const LocalTensor& localBuf, + int32_t blockHigh, int32_t blockWidth, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockWidth) { - int64_t alignM; - int alignBaseUseM; - if constexpr (C_TYPE::format == CubeFormat::NZ) { // nz2nz - alignM = Ceil(MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(), BLOCK_CUBE) * BLOCK_CUBE; - alignBaseUseM = Ceil(baseHeight, BLOCK_CUBE) * BLOCK_CUBE; - } else { // nz2nd A is vector - alignM = MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(); - alignBaseUseM = baseHeight; - } - - int64_t dstOffset; - int64_t dstStride; - int blockLen; - int blockCount; - if constexpr (IsSameType::value || IsSameType::value) { - dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * alignM + - curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * ONE_BLK_SIZE; - dstStride = (alignM - alignBaseUseM) * sizeof(DstT); - blockLen = baseBlockHeight * BLOCK_CUBE * sizeof(DstT); - blockCount = Ceil(baseBlockWidth, TWO_TIMES); + struct UnaryRepeatParams intriParams; + intriParams.srcBlkStride = 1; + int32_t dstOffset = 0; + int32_t srcOffset = 0; + int32_t highBlocks = 0; + int32_t highTail = 0; + int32_t srcStride = MAX_REPEAT_TIMES * GetBlockCount(); + int32_t dstStride = blockWidth * GetBlockCount() * MAX_REPEAT_TIMES; + bool isBeyondMaxStride = false; + uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; + + if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { + intriParams.dstBlkStride = 1; + intriParams.dstRepStride = blockWidth * GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; + intriParams.srcRepStride = GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; + highBlocks = (blockHigh * GetBlockCount()) / MAX_REPEAT_TIMES; + highTail = (blockHigh * GetBlockCount()) % 
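// --- Illustrative standalone C++ sketch (layout assumption: NZ stores column blocks of width c0
// contiguously down the aligned height, matching the alignedHeight * c0 pitch used in this file):
// the element mapping that the Muls-based NZ -> ND copies above realise through block and repeat
// strides. ---
#include <cstdint>

constexpr int64_t NzOffsetSketch(int64_t row, int64_t col, int64_t alignedHeight, int64_t c0)
{
    return (col / c0) * (alignedHeight * c0) + row * c0 + (col % c0);
}

constexpr int64_t NdOffsetSketch(int64_t row, int64_t col, int64_t width)
{
    return row * width + col;
}

// Example: a 32 x 48 half tile with c0 = 16; element (3, 20) lives in the second column block.
static_assert(NzOffsetSketch(3, 20, 32, 16) == 16 * 32 + 3 * 16 + 4, "NZ: block 1, row 3, lane 4");
static_assert(NdOffsetSketch(3, 20, 48) == 3 * 48 + 20, "ND: plain row-major offset");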
MAX_REPEAT_TIMES; + mask[0] = static_cast((1<< GetBlockCount()) - 1); + mask[1] = 0; } else { - dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * alignM + - curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * BLOCK_CUBE; - dstStride = (alignM - alignBaseUseM) * sizeof(DstT) * BLOCK_CUBE / ONE_BLK_SIZE; - blockLen = baseBlockHeight * BLOCK_CUBE * sizeof(DstT) * - BLOCK_CUBE / ONE_BLK_SIZE; - blockCount = baseBlockWidth; + intriParams.dstBlkStride = blockWidth; + uint32_t dstRepStride = (blockWidth * GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; + intriParams.dstRepStride = dstRepStride; + if (dstRepStride > MAX_REPEAT_STRIDE) { + isBeyondMaxStride = true; + } + intriParams.srcRepStride = (GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; + highBlocks = (blockHigh * GetBlockCount()) / EIGHT_TIMES / MAX_REPEAT_TIMES; + highTail = (blockHigh * GetBlockCount()) / EIGHT_TIMES % MAX_REPEAT_TIMES; + srcStride *= EIGHT_TIMES; + dstStride *= EIGHT_TIMES; } - - if (dstStride >= UINT16_MAX) { - int32_t srcOffset = 0; - int32_t srcStride; - if constexpr (IsSameType::value || IsSameType::value) { - dstStride = alignM * ONE_BLK_SIZE; - srcStride = baseHeight * ONE_BLK_SIZE; - } else { - dstStride = alignM * BLOCK_CUBE; - srcStride = baseHeight * BLOCK_CUBE; + SetVectorMask(mask[1], mask[0]); + const int64_t srcOffsetStride = GetBlockCount() * EIGHT_TIMES; + const int64_t dstOffsetStride = baseBlockWidth * BLOCK_CUBE * EIGHT_TIMES; + for (int32_t i = 0; i < blockWidth; ++i) { + if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { + // if the baseWidth is not aligned, set the mask value; + if (i == (blockWidth - 1) && (baseWidth % GetBlockCount() != 0)) { + uint64_t masktail = (1 << (baseWidth % GetBlockCount())) - 1; + mask[0] = masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << SHIFT_48_BIT); + mask[1] = mask[0]; + SetVectorMask(mask[1], mask[0]); + } } - for (int32_t i = 0; i < blockCount; ++i) { - DataCopy(gm[dstOffset], localBuf[srcOffset], { 1, static_cast(blockLen), 0, 0 }); - dstOffset += dstStride; + int32_t dstMulsOffset = dstOffset; + for (int32_t j = 0; j < highBlocks; ++j) { + Muls(trans[dstMulsOffset], localBuf[srcOffset], (DstT)1.0, mask, MAX_REPEAT_TIMES, intriParams); srcOffset += srcStride; + dstMulsOffset += dstStride; } - } else { - DataCopy(gm[dstOffset], localBuf, { static_cast(blockCount), static_cast(blockLen), 0, - static_cast(dstStride) }); + if (highTail) { + if (isBeyondMaxStride) { + for (int32_t j = 0; j < highTail; j++) { + Muls(trans[dstMulsOffset + j * dstOffsetStride], + localBuf[srcOffset + j * srcOffsetStride], (DstT)1.0, mask, 1, intriParams); + } + } else { + Muls(trans[dstMulsOffset], localBuf[srcOffset], (DstT)1.0, mask, highTail, intriParams); + } + if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { + srcOffset += highTail * GetBlockCount(); + } else { + srcOffset += highTail * srcOffsetStride; + } + } + dstOffset += GetBlockCount(); } } - __aicore__ inline void CopyLocal2GMNZ2NZSeq(const GlobalTensor& gm, const LocalTensor& localBuf, int32_t baseHeight, int32_t baseBlockWidth) + __aicore__ inline void CopyTrans2GMByVecByLineAlign(const GlobalTensor& gm, const LocalTensor& trans, int32_t baseHeight, + int32_t blockLen, int32_t blockCount, int32_t offset) { - int32_t blockLen = baseHeight * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; - DataCopy(gm, localBuf, { static_cast(baseBlockWidth), - static_cast(blockLen), 0, 0 }); + int32_t dstOffset = 0; + int32_t srcOffset 
= 0; + int32_t blockOffset = blockLen * blockCount; + for (int32_t i = 0; i < baseHeight; ++i) { + DataCopy(gm[dstOffset], trans[srcOffset], + { 1, static_cast(blockLen), 0, 0 }); + PipeBarrier(); + dstOffset += offset; + srcOffset += blockOffset; + } } - template - __aicore__ inline void CopyLocal2GMNZ2NZ(const GlobalTensor& gm, const LocalTensor& localBuf, - int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + __aicore__ inline void CopyTrans2GMByVecByLineUnalignOneBlock(const GlobalTensor& gm, const LocalTensor& trans, + int32_t baseHeight, int32_t baseWidth, int32_t blockLen, int32_t blockCount, int32_t offset) { - if constexpr (enSequentialWrite) { - CopyLocal2GMNZ2NZSeq(gm, localBuf, baseHeight, baseBlockWidth); - } else { - ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() >= baseHeight), { - KERNEL_LOG(KERNEL_ERROR, "M_ is %d, baseHeight is %d, M_ should be no less than baseHeight", - MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(), baseHeight); - }); - CopyLocal2GMNZ2NZNotSeq(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + CopyTrans2GMEnQue(); + int32_t padLen = (ONE_BLK_SIZE - baseWidth * sizeof(DstT)) / sizeof(DstT); + SetAtomicAdd(); + int32_t dstOffset = 0; + for (int32_t i = 0; i < baseHeight; ++i) { + LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset(0) + .template ReinterpretCast(); + int32_t transIndex = i * blockLen * blockCount; + for (int32_t j = 0; j < baseWidth; ++j) { + transAligin.SetValue(j, trans.GetValue(transIndex + j)); + } + for (int32_t j = baseWidth; j < blockCount; ++j) { + transAligin.SetValue(j, 0); + } + DataCopy(gm[dstOffset], transAligin, { 1, 1, 0, 0 }); + dstOffset += offset; + CopyLocal2GMNZ2NDDeQue(); } + SetAtomicNone(); } - __aicore__ inline void TransNZ2NDForDstUB(const LocalTensor& co2Local, const LocalTensor& trans, - int32_t dstStride, int32_t baseHeight, int32_t baseBlockWidth, int32_t baseBlockHeight) + template + __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const LocalTensor& trans, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, int32_t blockLen, + int32_t blockCount, int32_t offset) -> enable_if_t { - DataCopyParams dataCopyInfo { - static_cast(baseBlockWidth), - static_cast(MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE), - static_cast((baseBlockHeight * BLOCK_CUBE * MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() - - MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) * sizeof(DstT) / ONE_BLK_SIZE), - 0 - }; + LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset(0).template ReinterpretCast(); + int32_t remainLen = (baseWidth % blockCount) / TWO_TIMES; + CopyTrans2GMEnQue(); + LocalTensor src1Pattern; + src1Pattern = MATMUL_MODULE(LocalWorkspace)->template GetWorkspaceWithOffset< + ToMatmulConfig(MM_CFG).enableUBReuse>(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() + / TWO_TIMES).template ReinterpretCast(); + LocalTensor tmpSrc = trans.template ReinterpretCast(); + src1Pattern.SetSize(PATTERN_SIZE); + src1Pattern.SetValue(0, 0xFFFF << remainLen); + src1Pattern.SetValue(1, (1 << remainLen) - 1); + for (int32_t i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { + src1Pattern.SetValue(i, 0); + } + int32_t orinRemain = baseWidth % blockCount; + int32_t gmOffset = blockCount * (blockLen - PATTERN_OFFSET); int32_t dstOffset = 0; int32_t 
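// --- Illustrative standalone C++ sketch (values are assumed): one plausible reading of the
// zero padding plus SetAtomicAdd pairing in CopyTrans2GMByVecByLineUnalignOneBlock above --
// the full-block store must not disturb data beyond the valid tail, and an atomic add of a
// zero-padded block leaves those positions unchanged. ---
#include <array>

int main()
{
    std::array<int, 8> gm{0, 0, 0, 0, 0, 7, 7, 7};      // gm[5..7] already holds other data
    constexpr int kBaseWidth = 5;                       // valid tail elements
    std::array<int, 8> padded{};                        // transAligin analogue: valid data then zeros
    for (int j = 0; j < kBaseWidth; ++j) padded[j] = 100 + j;
    for (int j = 0; j < 8; ++j) gm[j] += padded[j];     // atomic-add analogue of the block store
    return (gm[4] == 104 && gm[5] == 7) ? 0 : 1;        // neighbour survives: its addend is 0
}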
srcOffset = 0; - for (int32_t i = 0; i < baseHeight; i++) { - DataCopy(co2Local[dstOffset], trans[srcOffset], dataCopyInfo); - dstOffset += dstStride; - srcOffset += MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); + int32_t blockOffset = blockLen * blockCount; + for (int32_t i = 0; i < baseHeight; ++i) { + DataCopy(gm[dstOffset], trans[srcOffset], { 1, static_cast(blockLen - 1), 0, 0 }); + if (baseWidth % TWO_TIMES == 0) { + CopyOutEnQue(); + GatherMaskParams gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); + uint64_t rsvdCnt = 0; + GatherMask(transAligin, tmpSrc[((i + 1) * blockLen - PATTERN_OFFSET) * BLOCK_CUBE], + src1Pattern, false, 0, gatherMaskParams, rsvdCnt); + LocalTensor tmpTrans = transAligin.template ReinterpretCast(); + DataCopy(gm[dstOffset + gmOffset + remainLen * DOUBLE_SPACE], tmpTrans, { 1, 1, 0, 0 }); + } else { + CopyLocal2GMNZ2NDDeQue(); + LocalTensor tmpTrans = transAligin.template ReinterpretCast(); + for (int32_t j = 0; j < ONE_BLK_SIZE; ++j) { + tmpTrans.SetValue(j, trans[srcOffset + gmOffset + orinRemain].GetValue(j)); + } + CopyLocal2GMNZ2NDEnQue(); + DataCopy(gm[dstOffset + gmOffset + orinRemain], tmpTrans, { 1, 1, 0, 0 }); + } + PipeBarrier(); + dstOffset += offset; + srcOffset += blockOffset; } } - template - __aicore__ inline void CopyLocal2GMNZ2NDByVec(const GlobalTensor& gm, const LocalTensor& localBuf, - int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + template + __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const LocalTensor& trans, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, int32_t blockLen, + int32_t blockCount, int32_t offset) -> enable_if_t { - uint32_t dimN = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() != 0) ? 
- MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() : MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); - - LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->template + LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset(0).template ReinterpretCast(); + int32_t remainLen = baseWidth % blockCount; + CopyTrans2GMEnQue(); + LocalTensor src1Pattern; + src1Pattern = MATMUL_MODULE(LocalWorkspace)->template GetWorkspaceWithOffset( - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()) - .template ReinterpretCast(); - int32_t transSize = localBuf.GetSize(); - if constexpr (IsSameType::value || IsSameType::value) { - if (baseBlockWidth % TWO_TIMES != 0) { - transSize += baseBlockHeight * CUBE_MAX_SIZE; - } + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() / TWO_TIMES).template ReinterpretCast(); + src1Pattern.SetSize(PATTERN_SIZE); + src1Pattern.SetValue(0, 0xFFFF << remainLen); + src1Pattern.SetValue(1, (1 << remainLen) - 1); + for (int32_t i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { + src1Pattern.SetValue(i, 0); } - trans.SetSize(transSize); - - int32_t dstOffset; - int32_t dstStride; - int32_t offset; - bool isGmAligned; - if constexpr (enSequentialWrite) { - dstOffset = 0; - dstStride = 0; - offset = baseWidth; - isGmAligned = ((baseWidth % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0); - } else { - int32_t width = baseBlockWidth * MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); - if constexpr (IsSameType::value || IsSameType::value) { - width = width / TWO_TIMES; - } - ASCENDC_ASSERT((dimN >= width), - { KERNEL_LOG(KERNEL_ERROR, "dimN is %d, width is %d, dimN should be no less than width", dimN, width); }); - if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { - isGmAligned = 1; - } else { - isGmAligned = ((dimN % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0 && - (MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN() % - MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0); - } - - dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + - curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); - dstStride = (dimN - width) * sizeof(DstT) / ONE_BLK_SIZE; - offset = dimN; - } - bool isTargetAligned = (baseWidth % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0; - const bool isComputeLineByLine = (!isGmAligned || dstStride >= UINT16_MAX); - - // 1 if target is not aligned, must copy the unalign data to trans UB - if (MATMUL_MODULE(CopyCubeOutUtils) - ->IsNeedPadUnalignedToTrans(baseWidth, dimN, isComputeLineByLine, isTargetAligned)) { - MATMUL_MODULE(CopyCubeOutUtils) - ->template PadUnalignedToTrans( - trans, gm, dstOffset, isComputeLineByLine, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + int32_t gmOffset = blockCount * (blockLen - PATTERN_OFFSET); + int32_t dstOffset = 0; + int32_t srcOffset = 0; + int32_t blockOffset = blockLen * blockCount; + for (int32_t i = 0; i < baseHeight; ++i) { + DataCopy(gm[dstOffset], trans[srcOffset], { 1, static_cast(blockLen - 1), 0, 0 }); + GatherMaskParams gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); + uint64_t rsvdCnt = 0; + CopyOutEnQue(); + GatherMask(transAligin, trans[srcOffset + gmOffset], + src1Pattern, false, 0, gatherMaskParams, rsvdCnt); + DataCopy(gm[dstOffset + gmOffset + remainLen], transAligin, { 1, 1, 0, 0 }); + PipeBarrier(); + dstOffset += offset; + srcOffset += blockOffset; } - - // 2. 
trans nz buffer to nd buffer - MATMUL_MODULE(CopyCubeOutUtils) - ->TransNZ2NDByVec(trans, localBuf, baseBlockHeight, baseBlockWidth, baseHeight, baseWidth, baseBlockWidth); - - // 3. copy trans buffer to gm - MATMUL_MODULE(CopyCubeOutUtils)->template CopyTrans2GM(gm, trans, curRow, curCol, baseHeight, - baseWidth, baseBlockHeight, baseBlockWidth, dstOffset, offset, dstStride, isComputeLineByLine, isTargetAligned); } - template - __aicore__ inline void CopyLocal2GMNZ2NDOnTheFly(const GlobalTensor& gm, const LocalTensor& localBuf, - int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, - int32_t baseBlockHeight, int32_t baseBlockWidth) + __aicore__ inline void CopyLocal2GMNZ2NDRegMov(const GlobalTensor& gm, const LocalTensor& localBuf, + LocalTensor& trans, int32_t baseHeight, int32_t baseWidth, int32_t iterIdx, int32_t calcWidth, + int32_t srcTailOffset) { - uint32_t dimN = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() != 0) ? - MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() : MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); - int32_t calcWidth = baseWidth / MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); - int32_t dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + - curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); - int32_t blockLen = MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; - int32_t srcRepeatGap = (baseBlockHeight * BLOCK_CUBE * MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() - - MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) * sizeof(DstT) / ONE_BLK_SIZE; - int32_t tail = baseWidth % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); - - int32_t offset = dimN; - if constexpr (enSequentialWrite) { - dstOffset = 0; - offset = baseWidth; + int32_t dstTailOffset = calcWidth * GetBlockCount(); + int32_t basicOffset = 0; + if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { + DataCopy(gm[dstTailOffset], localBuf[srcTailOffset], { 1, 1, 0, 0 }); + basicOffset = ONE_BLK_SIZE / sizeof(DstT); } - if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { - offset = CeilAlign(offset, MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()); - calcWidth = baseBlockWidth; - tail = 0; + // reg_mov + srcTailOffset = srcTailOffset + basicOffset - + GetBlockCount() * CeilAlign(baseHeight, GetBlockCount()) + baseWidth % GetBlockCount(); + dstTailOffset = dstTailOffset + basicOffset + baseWidth % GetBlockCount() - GetBlockCount(); + if constexpr (IsSameType::value && + IsSameType::value) { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_S)); + SetFlag(eventID); + WaitFlag(eventID); + } + int32_t j = 0; + for (int32_t k = 0; k < GetBlockCount() - baseWidth % GetBlockCount(); j++, k++) { + DstT scalar = localBuf.GetValue(srcTailOffset + k); + trans.SetValue(j, scalar); + } + srcTailOffset = iterIdx * GetBlockCount() + calcWidth * GetBlockCount() * CeilAlign(baseHeight, GetBlockCount()); + for (int32_t k = 0; k < baseWidth % GetBlockCount(); j++, k++) { + DstT scalar = localBuf.GetValue(srcTailOffset + k); + trans.SetValue(j, scalar); } - // Allocate MTE2_MTE3 eventId: eventIDMte3ToMte2 - event_t eventIDMte3ToMte2 = static_cast(GetTPipePtr()->AllocEventID()); - int32_t srcOffset = 0; - for (int32_t i = 0; i < baseHeight; i++) { - if (calcWidth > 0) { - DataCopy(gm[dstOffset], localBuf[srcOffset], { static_cast(calcWidth), - static_cast(blockLen), static_cast(srcRepeatGap), 0 }); - if constexpr (IsSameType::value && - IsSameType::value) { - PipeBarrier(); - } - } - - if (tail != 0) { - 
MATMUL_MODULE(CopyCubeOutUtils)->CopyLocal2GMNZ2NDOnTheFlyTail( - gm[dstOffset], localBuf, baseHeight, baseWidth, i, calcWidth, eventIDMte3ToMte2); - } - dstOffset += offset; - srcOffset += MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); + CopyLocal2GMNZ2NDEnQue(); + // copy the tail from ub to gm + DataCopy(gm[dstTailOffset], trans, { 1, 1, 0, 0 }); + if constexpr (IsSameType::value && + IsSameType::value) { + CopyLocal2GMNZ2NDDeQue(); } - event_t eventID = static_cast(GetTPipePtr()->FetchEventID()); - SetFlag(eventID); - WaitFlag(eventID); - // Release MTE2_MTE3 eventId: eventIDMte3ToMte2 - GetTPipePtr()->ReleaseEventID(eventIDMte3ToMte2); } }; } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H \ No newline at end of file +#endif // IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_WRAPPER_H \ No newline at end of file -- Gitee From 7c1c0ddcc6fc35b1ddbe111a656ea6b402d804a5 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:36:06 +0000 Subject: [PATCH 30/56] add Signed-off-by: jiangchengcheng-on --- .../copy_cube_out/copy_cube_out_datacopy.h | 846 +++++++++--------- .../copy_cube_out/copy_cube_out_fixpipe.h | 199 +++- .../stage/copy_cube_out/copy_cube_out_intf.h | 7 +- .../stage/copy_cube_out/copy_cube_out_utils.h | 122 ++- 4 files changed, 689 insertions(+), 485 deletions(-) diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h b/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h index f011f33e..5a84bf9f 100644 --- a/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h +++ b/impl/matmul/stage/copy_cube_out/copy_cube_out_datacopy.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -9,526 +9,542 @@ */ /*! - * \file copy_cube_out_datacopy_wrapper.h + * \file copy_cube_out_datacopy.h * \brief */ -#ifndef IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_WRAPPER_H -#define IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_WRAPPER_H +#ifndef IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H +#define IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H #include "../../utils/matmul_module.h" #include "../../utils/matmul_param.h" -#include "copy_cube_out_utils.h" +#include "copy_cube_out_intf.h" +#include "copy_cube_out_datacopy_wrapper.h" namespace AscendC { namespace Impl { namespace Detail { - -constexpr int32_t DOUBLE_SPACE = 2; -constexpr int32_t TWO_TIMES = 2; -constexpr int32_t EIGHT_TIMES = 8; -constexpr int32_t SHIFT_16_BIT = 16; -constexpr int32_t SHIFT_32_BIT = 32; -constexpr int32_t SHIFT_48_BIT = 48; -constexpr uint32_t MAX_REPEAT_STRIDE = 255; -constexpr int32_t PATTERN_SIZE = 8; -constexpr int32_t PATTERN_OFFSET = 2; /* - CopyCubeOutWrapper is considered entirely experimental. + CopyCubeOut is considered entirely experimental. We retain the freedom to make incompatible changes, but do not guarantee the stability. - CopyCubeOutWrapper is only for internal usage, does not support extension or customized specialization! + CopyCubeOut is only for internal usage, does not support extension or customized specialization! 
*/ -template -class CopyCubeOutWrapper +template +class CopyCubeOut::IsNeedUB())>> { using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; + using L0cT = typename GetMmDstType::Type; + MATMUL_USE_MODULE(MatmulQuantProcessor); MATMUL_USE_MODULE(MatmulShapeInfo); MATMUL_USE_MODULE(MatmulShapeTiling); MATMUL_USE_MODULE(LocalWorkspace); + MATMUL_USE_MODULE(CopyCubeOutUtils); public: - __aicore__ inline CopyCubeOutWrapper() = default; - __aicore__ inline ~CopyCubeOutWrapper() = default; + template + __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co1Local, int32_t curRow, + int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) + { + CopyOutImpl(gm, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } - // get blockCount by DstT - __aicore__ inline constexpr int32_t GetBlockCount() + template + __aicore__ inline void Copy(const LocalTensor& co2Local, const LocalTensor& co1Local, int32_t curRow, + int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) { - return sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + CopyOutImpl(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); } - // if target is not aligned, must copy the unalign data to trans UB - __aicore__ inline bool IsNeedPadUnalignedToTrans(const int32_t baseWidth, const uint32_t dimN, - const bool isComputeLineByLine, const bool isTargetAligned) + template + __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co2Local, + const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, + int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, + const ScheduleContext& context = 0) { - if constexpr (IsSameType::value) { - bool isOdd = false; - if constexpr (IsSameType::value || IsSameType::value) { - if (baseWidth % TWO_TIMES > 0) { - isOdd = true; - } - } - bool isSingleCore = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() <= MATMUL_MODULE(MatmulShapeInfo)-> - GetSingleCoreM()) && (dimN <= MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN()); - bool isMutiCoreNeedPad = !isSingleCore && !isComputeLineByLine; - return (!isTargetAligned && (isSingleCore || isMutiCoreNeedPad) && !isOdd); + CopyOutImpl(gm, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } + +private: + template + __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co1Local, int32_t curRow, int32_t curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) + { + if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + CopyOutNZ2ND(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } else if constexpr (C_TYPE::format == CubeFormat::NZ) { + CopyOutNZ2NZ(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); } else { - return (!isTargetAligned); + ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type."); }); } } - // copy the unalign data to trans UB - template - __aicore__ inline void PadUnalignedToTrans(const LocalTensor& trans, const GlobalTensor& gm, - int32_t dstOffset, bool isComputeLineByLine, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + 
template + __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co2Local, + const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, + int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) { - int32_t alignedSize; - if constexpr (IsSameType::value || IsSameType::value) { - alignedSize = GetC0Size(); + if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + CopyOutNZ2ND(dst, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); + } else if constexpr (C_TYPE::format == CubeFormat::NZ) { + CopyOutNZ2NZ(dst, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); } else { - alignedSize = BLOCK_CUBE; + ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type."); }); } - int32_t baseUseN = CeilAlign(baseWidth, alignedSize); - int32_t gmTailOffset = dstOffset + baseUseN - GetBlockCount(); - int32_t transTailOffset = baseUseN - GetBlockCount(); + } - auto enQueEvtID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); - SetFlag(enQueEvtID); - WaitFlag(enQueEvtID); + template + __aicore__ inline void CopyOutNZ2NZ(const LocalTensor& co2Local, const LocalTensor& co1Local, + int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) + { + ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() >= MATMUL_MODULE(MatmulShapeTiling)-> + GetTiling().GetBaseM()), { KERNEL_LOG(KERNEL_ERROR, "M_ is %d , which should be not less than baseM %d", + MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(), MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()); + }); - if (isComputeLineByLine) { - if constexpr (enSequentialWrite) { - PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, baseUseN, baseHeight); - } else { - PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, - MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(), baseHeight); - } + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = baseBlockWidth; + dataCopyInfo.blockLen = baseBlockHeight; + dataCopyInfo.srcStride = 0; + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; + if constexpr (enSequentialWrite) { + dataCopyInfo.dstStride = 0; + CopyCo12Co2WithQuant(co2Local, co1Local, curCol, baseBlockHeight, baseBlockWidth, dataCopyInfo, + enhancedParams); } else { - PadUnalignedToTransWithStride(trans[transTailOffset], gm[gmTailOffset], baseHeight, baseWidth, baseBlockWidth); + dataCopyInfo.dstStride = (CeilAlign(MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM(), BLOCK_CUBE) - + baseBlockHeight * BLOCK_CUBE) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; + int32_t dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * BLOCK_CUBE + + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * + MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(); + CopyCo12Co2WithQuant(co2Local[dstOffset], co1Local, curCol, baseBlockHeight, baseBlockWidth, + dataCopyInfo, enhancedParams); } - - // if copy gm to ub, must add the set/wait flag to wait the UB has be writed; - event_t eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); - SetFlag(eventIDMte2ToV); - WaitFlag(eventIDMte2ToV); } - // trans nz buffer to nd buffer - __aicore__ inline auto TransNZ2NDByVec(const LocalTensor& trans, const LocalTensor& localBuf, - int32_t blockHigh, int32_t blockWidth, int32_t baseHeight, 
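// --- Illustrative standalone C++ sketch (tiling values are assumed examples): the NZ destination
// offset and stride arithmetic used by CopyOutNZ2NZ above when writing a tile into the UB output. ---
#include <cstdint>

constexpr int32_t CeilAlignSketch(int32_t x, int32_t a) { return (x + a - 1) / a * a; }

// Assumed example: singleCoreM = 40, orgM = 40, baseM = 32, baseN = 64, half output (2 bytes),
// baseBlockHeight = 2 (blocks of 16 rows), tile position (curRow, curCol) = (1, 1).
constexpr int32_t kCube = 16;
constexpr int32_t kDstStride =
    (CeilAlignSketch(40, kCube) - 2 * kCube) * kCube * 2 / 32;   // (48 - 32) * 16 * 2 / 32 = 16 bursts
constexpr int32_t kDstOffset = 1 * 32 * kCube + 1 * 64 * 40;     // curRow*baseM*16 + curCol*baseN*orgM

static_assert(kDstStride == 16, "one 16 x 16 fractal gap between this tile's column blocks");
static_assert(kDstOffset == 3072, "element offset of the tile origin in the NZ output");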
int32_t baseWidth, int32_t baseBlockWidth) + template + __aicore__ inline void CopyOutNZ2NZ(const GlobalTensor& gm, const LocalTensor& co1Local, int32_t curRow, + int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) { CopyOutEnQue(); - ASCENDC_ASSERT(((blockWidth * GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) <= MAX_REPEAT_TIMES), { - KERNEL_LOG(KERNEL_ERROR, "blockWidth is %d, blockCount is %d, repeat time exceed max time %d", blockWidth, - GetBlockCount(), MAX_REPEAT_TIMES); - }); - if constexpr (IsSameType::value || IsSameType::value) { - TransNZ2NDByVecDstB8(trans, localBuf, blockHigh, blockWidth, baseHeight, baseWidth, baseBlockWidth); - } else { - TransNZ2NDByVecDstNotB8(trans, localBuf, blockHigh, blockWidth, baseHeight, baseWidth, baseBlockWidth); - } + + LocalTensor localBuf = MATMUL_MODULE(LocalWorkspace)->GetTempWorkspace(); + CopyCo12Local(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth); + CopyOutDeQue(); + + CopyLocal2GMNZ2NZ(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); } - // copy trans buffer to gm template - __aicore__ inline void CopyTrans2GM(const GlobalTensor& gm, const LocalTensor& trans, - int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, - int32_t dstOffset, int32_t offset, int32_t dstStride, bool isComputeLineByLine, bool isTargetAligned) + __aicore__ inline void CopyOutNZ2NZ(const GlobalTensor& gm, const LocalTensor& co2Local, + const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) { - int32_t blockLen = baseBlockWidth * (GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE); - if constexpr (IsSameType::value || IsSameType::value) { - blockLen = Ceil(blockLen, TWO_TIMES); - } - if (!isComputeLineByLine) { - DataCopy(gm[dstOffset], trans, { static_cast(baseHeight), static_cast(blockLen), 0, - static_cast(dstStride) }); - return; - } - if constexpr (IsSameType::value) { - if constexpr (!enSequentialWrite) { - dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * - MATMUL_MODULE(MatmulShapeInfo)->GetOrgN() + curCol * - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); - offset = MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); - } - int32_t newBlockCount; - if constexpr (IsSameType::value || IsSameType::value) { - newBlockCount = BLOCK_CUBE; + CopyOutNZ2NZ(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + CopyLocal2GMNZ2NZ(gm, co2Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + } + + template + __aicore__ inline void CopyOutNZ2ND(const LocalTensor& co2Local, const LocalTensor& co1Local, + int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) + { + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() == 1), { KERNEL_LOG(KERNEL_ERROR, + "M_ is %d, which should be equal with 1.", MATMUL_MODULE(MatmulShapeInfo)->GetOrgM()); }); + + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = 1; + dataCopyInfo.blockLen = baseBlockHeight * baseBlockWidth; + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; + + if constexpr (enSequentialWrite) { + DataCopy(co2Local, co1Local, dataCopyInfo, enhancedParams); } else { - newBlockCount = 
ONE_BLK_SIZE / sizeof(DstT); + int32_t dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + DataCopy(co2Local[dstOffset], co1Local, dataCopyInfo, enhancedParams); } - if (isTargetAligned) { - CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, baseHeight, blockLen, newBlockCount, offset); - } else if (blockLen == 1) { - CopyTrans2GMByVecByLineUnalignOneBlock(gm[dstOffset], trans, baseHeight, baseWidth, blockLen, newBlockCount, offset); + } else { + ASCENDC_ASSERT((!IsSameType::value && !IsSameType::value), + { KERNEL_LOG(KERNEL_ERROR, "Data format should be NZ if GetTensorC to UB when output is int8_t."); }); + + LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->GetTempWorkspace(); + + CopyCo12Co2WithoutQuant(trans, co1Local, curCol, baseBlockHeight, baseBlockWidth); + + if constexpr(enSequentialWrite) { + TransNZ2NDForDstUB(co2Local, trans, MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), + baseHeight, baseBlockWidth, baseBlockHeight); } else { - if constexpr (IsSameType::value || IsSameType::value) { - CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, blockLen, newBlockCount, offset); - } else { - CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, blockLen, newBlockCount, offset); - } + uint32_t dimN = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() != 0) ? + MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() : MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); + int32_t dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + TransNZ2NDForDstUB(co2Local[dstOffset], trans, CeilAlign(dimN, + MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()), baseHeight, baseBlockWidth, baseBlockHeight); } - } else { - CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, baseHeight, blockLen, ONE_BLK_SIZE / sizeof(DstT), offset); } } - // if baseWidth is unaligned, then copy the tail data - __aicore__ inline void CopyLocal2GMNZ2NDOnTheFlyTail(const GlobalTensor& gm, const LocalTensor& localBuf, - int32_t baseHeight, int32_t baseWidth, int32_t iterIdx, int32_t calcWidth, const event_t& eventIDMte3ToMte2) + template + __aicore__ inline void CopyOutNZ2ND(const GlobalTensor& gm, const LocalTensor& co1Local, int32_t curRow, + int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) { - LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->GetNZ2NDWorkspace().template ReinterpretCast(); - trans.SetSize(GetBlockCount()); - int32_t srcTailOffset = iterIdx * GetBlockCount() + calcWidth * GetBlockCount() * CeilAlign(baseHeight, GetBlockCount()); - if (baseWidth * sizeof(DstT) > ONE_BLK_SIZE) { - CopyLocal2GMNZ2NDRegMov(gm, localBuf, trans, baseHeight, baseWidth, iterIdx, calcWidth, srcTailOffset); - } else { - if (iterIdx > 0) { - WaitFlag(eventIDMte3ToMte2); - } - if constexpr (IsSameType::value && - IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2)); - SetFlag(eventID); - WaitFlag(eventID); - } - DataCopy(trans, gm[baseWidth], { 1, 1, 0, 0 }); - event_t eventIDMte2ToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3)); - SetFlag(eventIDMte2ToMte3); - WaitFlag(eventIDMte2ToMte3); - DataCopy(gm, localBuf[srcTailOffset], { 1, 1, 0, 0 }); - PipeBarrier(); - DataCopy(gm[baseWidth], trans, { 1, 1, 0, 0 }); - if (iterIdx < baseHeight - 1) { - SetFlag(eventIDMte3ToMte2); + 
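// A standalone sketch of the NZ (fractal) -> ND (row-major) index mapping that the
// copy-out helpers above implement with DataCopy bursts. It assumes the usual layout of
// [BLOCK_CUBE x c0] fractals stored column-of-blocks first, where c0 is the number of
// elements per 32-byte block (16 for 2-byte types, 8 for 4-byte types). Plain C++, not
// the AscendC API.
#include <cstddef>
#include <cstdint>
#include <vector>

constexpr int32_t kBlockCube = 16;

inline int32_t CeilAlign(int32_t x, int32_t align) { return (x + align - 1) / align * align; }

// Offset of element (row, col) inside an NZ tile whose logical height is 'height'.
inline int64_t NzOffset(int32_t row, int32_t col, int32_t height, int32_t c0)
{
    const int32_t alignedH = CeilAlign(height, kBlockCube);
    return static_cast<int64_t>(col / c0) * alignedH * c0 +
           static_cast<int64_t>(row) * c0 + (col % c0);
}

// Scalar reference for the NZ -> ND transform that the kernels above do block-wise.
template <typename T>
std::vector<T> Nz2Nd(const std::vector<T>& nz, int32_t height, int32_t width, int32_t c0)
{
    std::vector<T> nd(static_cast<std::size_t>(height) * width);
    for (int32_t r = 0; r < height; ++r) {
        for (int32_t c = 0; c < width; ++c) {
            nd[static_cast<std::size_t>(r) * width + c] = nz[NzOffset(r, c, height, c0)];
        }
    }
    return nd;
}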
CopyOutEnQue(); + + LocalTensor localBuf = MATMUL_MODULE(LocalWorkspace)->GetTempWorkspace(); + CopyCo12Local(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth); + + CopyOutDeQue(); + + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + CopyLocal2GMNZ2NZ(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } else if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + if constexpr (!ToMatmulConfig(MM_CFG).enVecND2NZ || IsSameType::value && + IsSameType::value) { + CopyLocal2GMNZ2NDOnTheFly(gm, localBuf, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); + } else { + CopyLocal2GMNZ2NDByVec(gm, localBuf, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); } } } -private: - __aicore__ inline void PadUnalignedToTransByLine(const LocalTensor& trans, const GlobalTensor& gm, - int32_t transStride, int32_t gmStride, int32_t baseHeight) + template + __aicore__ inline void CopyOutNZ2ND(const GlobalTensor& gm, const LocalTensor& co2Local, + const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) { - // copy gm to trans one line by one line - int32_t dstOffset = 0; - int32_t srcOffset = 0; - int32_t blockLen = GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; - for (int32_t i = 0; i < baseHeight; ++i) { - DataCopy(trans[dstOffset], gm[srcOffset], { static_cast(1), - static_cast(blockLen), 0, 0 }); - dstOffset += transStride; - srcOffset += gmStride; + CopyOutNZ2ND(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + CopyLocal2GMNZ2NZ(gm, co2Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + } else if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + CopyOutNZ2ND(gm, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); } } - __aicore__ inline void PadUnalignedToTransWithStride(const LocalTensor& trans, const GlobalTensor& gm, int32_t baseHeight, - int32_t baseWidth, int32_t baseBlockWidth) - { - // copy gm to trans with stride - DataCopy(trans, gm, { static_cast(baseHeight), static_cast(1), - static_cast(MATMUL_MODULE(MatmulShapeInfo)->GetOrgN() / GetBlockCount() - 1), - static_cast(baseWidth / GetBlockCount()) }); - } - - __aicore__ inline void TransNZ2NDByVecDstB8(const LocalTensor& trans, const LocalTensor& localBuf, - int32_t blockHigh, int32_t blockWidth, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockWidth) + __aicore__ inline void CopyCo12Co2WithQuant(const LocalTensor& dst, const LocalTensor& src, + int32_t curCol, int32_t baseBlockHeight, int32_t baseBlockWidth, DataCopyParams& dataCopyInfo, DataCopyEnhancedParams& enhancedParams) { - struct UnaryRepeatParams intriParams; - intriParams.dstBlkStride = Ceil(baseWidth, ONE_BLK_SIZE); - intriParams.srcBlkStride = 1; - uint32_t dstRepStride = Ceil(baseWidth * sizeof(DstT), ONE_BLK_SIZE) * EIGHT_TIMES; - intriParams.dstRepStride = dstRepStride; - intriParams.srcRepStride = (GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; - int32_t dstOffset = 0; - int32_t srcOffset = 0; - int32_t highBlocks = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES / MAX_REPEAT_TIMES; - int32_t highTail = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES % MAX_REPEAT_TIMES; - uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; - // mov src to dst width 
aligned - LocalTensor src = localBuf.template ReinterpretCast(); - LocalTensor dst = trans.template ReinterpretCast(); - SetVectorMask(mask[1], mask[0]); - constexpr int64_t srcOffsetStride = BLOCK_CUBE * EIGHT_TIMES; - const int64_t dstOffsetStride = baseBlockWidth * BLOCK_CUBE * EIGHT_TIMES / TWO_TIMES; - for (int32_t i = 0; i < Ceil(blockWidth, TWO_TIMES); ++i) { - if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { - // if the baseWidth is not aligned, set the mask value; - if (i == (Ceil(blockWidth, TWO_TIMES) - 1) && (baseWidth % GetBlockCount() != 0)) { - uint64_t masktail = (1 << (Ceil(baseWidth % GetBlockCount(), TWO_TIMES))) - 1; - mask[0] = - masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << SHIFT_48_BIT); - mask[1] = mask[0]; - SetVectorMask(mask[1], mask[0]); - } - } - int32_t dstMulsOffset = dstOffset; - for (int32_t j = 0; j < highBlocks; ++j) { - Muls( - dst[dstMulsOffset], src[srcOffset], (int16_t)1, mask, MAX_REPEAT_TIMES, intriParams); - srcOffset += MAX_REPEAT_TIMES * BLOCK_CUBE; - dstMulsOffset += blockWidth * GetBlockCount() * MAX_REPEAT_TIMES; - } - if (highTail > 0) { - if (dstRepStride > MAX_REPEAT_STRIDE) { - int32_t tmpSrcOffset = srcOffset; - for (int32_t j = 0; j < highTail; j++) { - Muls(dst[dstMulsOffset], src[tmpSrcOffset], (int16_t)1, mask, 1, intriParams); - dstMulsOffset += dstOffsetStride; - tmpSrcOffset += srcOffsetStride; + if constexpr (IsSameType::value) { + MATMUL_MODULE(MatmulQuantProcessor)->UpdateDataCopyParamForQuant(enhancedParams, curCol); + uint64_t alignedHeight = baseBlockHeight * BLOCK_CUBE; + int32_t blockOffset = BLOCK_CUBE * alignedHeight; + if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8 || + MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::REQ8) { + dataCopyInfo.blockLen = baseBlockHeight; + uint64_t addr = enhancedParams.deqTensorAddr; + int32_t offset = ONE_BLK_SIZE * alignedHeight; + int32_t dstOffset = 0; + constexpr int32_t WIDTH_SIZE = ONE_BLK_SIZE * ONE_BYTE_BIT_SIZE; + constexpr int32_t STORE_SIZE = BLOCK_CUBE * ONE_BYTE_BIT_SIZE; + for (int32_t i = 0; i < Ceil(baseBlockWidth, TWO_TIMES); ++i) { + for (int32_t storeMode = 0; storeMode < TWO_TIMES; ++storeMode) { + if (baseBlockWidth % TWO_TIMES != 0 && + i == Ceil(baseBlockWidth, TWO_TIMES) - 1 && + storeMode == 1) { + continue; + } + if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8) { + enhancedParams.deqTensorAddr = addr + i * WIDTH_SIZE + storeMode * STORE_SIZE; + } + enhancedParams.sidStoreMode = (uint8_t)storeMode; + DataCopy(dst[dstOffset], src[dstOffset + storeMode * blockOffset], + dataCopyInfo, enhancedParams); } - } else { - Muls(dst[dstMulsOffset], src[srcOffset], (int16_t)1, mask, highTail, intriParams); + dstOffset += offset; + } + } else if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VDEQF16) { + dataCopyInfo.blockCount = 1; + dataCopyInfo.blockLen = baseBlockHeight; + dataCopyInfo.dstStride = 0; + uint64_t addr = enhancedParams.deqTensorAddr; + int32_t offset = 0; + constexpr int32_t DEQ_OFFSET = 128; + for (int32_t i = 0; i < baseBlockWidth; ++i) { + enhancedParams.deqTensorAddr = addr + i * DEQ_OFFSET; + DataCopy(dst[offset], src[offset], dataCopyInfo, enhancedParams); + offset += blockOffset; } - srcOffset += highTail * BLOCK_CUBE * EIGHT_TIMES; + } else { + DataCopy(dst, src, dataCopyInfo, enhancedParams); } - dstOffset += BLOCK_CUBE; + } else { + DataCopy(dst, src, dataCopyInfo, enhancedParams); } 
} - __aicore__ inline void TransNZ2NDByVecDstNotB8(const LocalTensor& trans, const LocalTensor& localBuf, - int32_t blockHigh, int32_t blockWidth, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockWidth) + __aicore__ inline void CopyCo12Co2WithoutQuant(const LocalTensor& dst, const LocalTensor& src, int32_t curCol, + int32_t baseBlockHeight, int32_t baseBlockWidth) { - struct UnaryRepeatParams intriParams; - intriParams.srcBlkStride = 1; - int32_t dstOffset = 0; - int32_t srcOffset = 0; - int32_t highBlocks = 0; - int32_t highTail = 0; - int32_t srcStride = MAX_REPEAT_TIMES * GetBlockCount(); - int32_t dstStride = blockWidth * GetBlockCount() * MAX_REPEAT_TIMES; - bool isBeyondMaxStride = false; - uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; - - if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { - intriParams.dstBlkStride = 1; - intriParams.dstRepStride = blockWidth * GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; - intriParams.srcRepStride = GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; - highBlocks = (blockHigh * GetBlockCount()) / MAX_REPEAT_TIMES; - highTail = (blockHigh * GetBlockCount()) % MAX_REPEAT_TIMES; - mask[0] = static_cast((1<< GetBlockCount()) - 1); - mask[1] = 0; + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = baseBlockWidth; + dataCopyInfo.blockLen = baseBlockHeight; + dataCopyInfo.srcStride = 0; + dataCopyInfo.dstStride = 0; + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; + if constexpr (IsSameType::value) { + MATMUL_MODULE(MatmulQuantProcessor)->UpdateDataCopyParamForQuant(enhancedParams, curCol); + } + DataCopy(dst, src, dataCopyInfo, enhancedParams); + } + + __aicore__ inline void CopyCo12Local(const LocalTensor& localBuf, const LocalTensor& co1Local, int32_t curCol, int32_t baseBlockHeight, int32_t baseBlockWidth) + { + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = 1; + dataCopyInfo.blockLen = baseBlockHeight * baseBlockWidth; + DataCopyEnhancedParams enhancedParams; + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; } else { - intriParams.dstBlkStride = blockWidth; - uint32_t dstRepStride = (blockWidth * GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; - intriParams.dstRepStride = dstRepStride; - if (dstRepStride > MAX_REPEAT_STRIDE) { - isBeyondMaxStride = true; - } - intriParams.srcRepStride = (GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; - highBlocks = (blockHigh * GetBlockCount()) / EIGHT_TIMES / MAX_REPEAT_TIMES; - highTail = (blockHigh * GetBlockCount()) / EIGHT_TIMES % MAX_REPEAT_TIMES; - srcStride *= EIGHT_TIMES; - dstStride *= EIGHT_TIMES; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; + ASCENDC_ASSERT((localBuf.GetSize() >= dataCopyInfo.blockLen * CUBE_MAX_SIZE), { + KERNEL_LOG(KERNEL_ERROR, "copy len is %d, which should be less than dst size %d", + dataCopyInfo.blockLen * CUBE_MAX_SIZE, localBuf.GetSize()); + }); } - SetVectorMask(mask[1], mask[0]); - const int64_t srcOffsetStride = GetBlockCount() * EIGHT_TIMES; - const int64_t dstOffsetStride = baseBlockWidth * BLOCK_CUBE * EIGHT_TIMES; - for (int32_t i = 0; i < blockWidth; ++i) { - if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { - // if the baseWidth is not aligned, set the mask value; - if (i == (blockWidth - 1) && (baseWidth % GetBlockCount() != 0)) { - uint64_t masktail = (1 << (baseWidth % GetBlockCount())) - 1; - mask[0] = masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << 
SHIFT_48_BIT); - mask[1] = mask[0]; - SetVectorMask(mask[1], mask[0]); - } + CopyCo12Co2WithQuant(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth, dataCopyInfo, enhancedParams); + } + + __aicore__ inline void CopyLocal2GMNZ2NZNotSeq(const GlobalTensor& gm, const LocalTensor& localBuf, + int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) + { + int64_t alignM; + int alignBaseUseM; + if constexpr (C_TYPE::format == CubeFormat::NZ) { // nz2nz + alignM = Ceil(MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(), BLOCK_CUBE) * BLOCK_CUBE; + alignBaseUseM = Ceil(baseHeight, BLOCK_CUBE) * BLOCK_CUBE; + } else { // nz2nd A is vector + alignM = MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(); + alignBaseUseM = baseHeight; + } + + int64_t dstOffset; + int64_t dstStride; + int blockLen; + int blockCount; + if constexpr (IsSameType::value || IsSameType::value) { + dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * alignM + + curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * ONE_BLK_SIZE; + dstStride = (alignM - alignBaseUseM) * sizeof(DstT); + blockLen = baseBlockHeight * BLOCK_CUBE * sizeof(DstT); + blockCount = Ceil(baseBlockWidth, TWO_TIMES); + } else { + dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * alignM + + curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * BLOCK_CUBE; + dstStride = (alignM - alignBaseUseM) * sizeof(DstT) * BLOCK_CUBE / ONE_BLK_SIZE; + blockLen = baseBlockHeight * BLOCK_CUBE * sizeof(DstT) * + BLOCK_CUBE / ONE_BLK_SIZE; + blockCount = baseBlockWidth; + } + + if (dstStride >= UINT16_MAX) { + int32_t srcOffset = 0; + int32_t srcStride; + if constexpr (IsSameType::value || IsSameType::value) { + dstStride = alignM * ONE_BLK_SIZE; + srcStride = baseHeight * ONE_BLK_SIZE; + } else { + dstStride = alignM * BLOCK_CUBE; + srcStride = baseHeight * BLOCK_CUBE; } - int32_t dstMulsOffset = dstOffset; - for (int32_t j = 0; j < highBlocks; ++j) { - Muls(trans[dstMulsOffset], localBuf[srcOffset], (DstT)1.0, mask, MAX_REPEAT_TIMES, intriParams); + for (int32_t i = 0; i < blockCount; ++i) { + DataCopy(gm[dstOffset], localBuf[srcOffset], { 1, static_cast(blockLen), 0, 0 }); + dstOffset += dstStride; srcOffset += srcStride; - dstMulsOffset += dstStride; } - if (highTail) { - if (isBeyondMaxStride) { - for (int32_t j = 0; j < highTail; j++) { - Muls(trans[dstMulsOffset + j * dstOffsetStride], - localBuf[srcOffset + j * srcOffsetStride], (DstT)1.0, mask, 1, intriParams); - } - } else { - Muls(trans[dstMulsOffset], localBuf[srcOffset], (DstT)1.0, mask, highTail, intriParams); - } - if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { - srcOffset += highTail * GetBlockCount(); - } else { - srcOffset += highTail * srcOffsetStride; - } - } - dstOffset += GetBlockCount(); + } else { + DataCopy(gm[dstOffset], localBuf, { static_cast(blockCount), static_cast(blockLen), 0, + static_cast(dstStride) }); } } - __aicore__ inline void CopyTrans2GMByVecByLineAlign(const GlobalTensor& gm, const LocalTensor& trans, int32_t baseHeight, - int32_t blockLen, int32_t blockCount, int32_t offset) + __aicore__ inline void CopyLocal2GMNZ2NZSeq(const GlobalTensor& gm, const LocalTensor& localBuf, int32_t baseHeight, int32_t baseBlockWidth) { - int32_t dstOffset = 0; - int32_t srcOffset = 0; - int32_t blockOffset = blockLen * blockCount; - for (int32_t i = 0; i < baseHeight; ++i) { - DataCopy(gm[dstOffset], trans[srcOffset], - { 1, static_cast(blockLen), 0, 0 }); - 
PipeBarrier(); - dstOffset += offset; - srcOffset += blockOffset; - } + int32_t blockLen = baseHeight * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; + DataCopy(gm, localBuf, { static_cast(baseBlockWidth), + static_cast(blockLen), 0, 0 }); } - __aicore__ inline void CopyTrans2GMByVecByLineUnalignOneBlock(const GlobalTensor& gm, const LocalTensor& trans, - int32_t baseHeight, int32_t baseWidth, int32_t blockLen, int32_t blockCount, int32_t offset) + template + __aicore__ inline void CopyLocal2GMNZ2NZ(const GlobalTensor& gm, const LocalTensor& localBuf, + int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) { - CopyTrans2GMEnQue(); - int32_t padLen = (ONE_BLK_SIZE - baseWidth * sizeof(DstT)) / sizeof(DstT); - SetAtomicAdd(); - int32_t dstOffset = 0; - for (int32_t i = 0; i < baseHeight; ++i) { - LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template - GetWorkspaceWithOffset(0) - .template ReinterpretCast(); - int32_t transIndex = i * blockLen * blockCount; - for (int32_t j = 0; j < baseWidth; ++j) { - transAligin.SetValue(j, trans.GetValue(transIndex + j)); - } - for (int32_t j = baseWidth; j < blockCount; ++j) { - transAligin.SetValue(j, 0); - } - DataCopy(gm[dstOffset], transAligin, { 1, 1, 0, 0 }); - dstOffset += offset; - CopyLocal2GMNZ2NDDeQue(); + if constexpr (enSequentialWrite) { + CopyLocal2GMNZ2NZSeq(gm, localBuf, baseHeight, baseBlockWidth); + } else { + ASCENDC_ASSERT((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() >= baseHeight), { + KERNEL_LOG(KERNEL_ERROR, "M_ is %d, baseHeight is %d, M_ should be no less than baseHeight", + MATMUL_MODULE(MatmulShapeInfo)->GetOrgM(), baseHeight); + }); + CopyLocal2GMNZ2NZNotSeq(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); } - SetAtomicNone(); } - template - __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const LocalTensor& trans, - int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, int32_t blockLen, - int32_t blockCount, int32_t offset) -> enable_if_t + __aicore__ inline void TransNZ2NDForDstUB(const LocalTensor& co2Local, const LocalTensor& trans, + int32_t dstStride, int32_t baseHeight, int32_t baseBlockWidth, int32_t baseBlockHeight) { - LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template - GetWorkspaceWithOffset(0).template ReinterpretCast(); - int32_t remainLen = (baseWidth % blockCount) / TWO_TIMES; - CopyTrans2GMEnQue(); - LocalTensor src1Pattern; - src1Pattern = MATMUL_MODULE(LocalWorkspace)->template GetWorkspaceWithOffset< - ToMatmulConfig(MM_CFG).enableUBReuse>(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() - / TWO_TIMES).template ReinterpretCast(); - LocalTensor tmpSrc = trans.template ReinterpretCast(); - src1Pattern.SetSize(PATTERN_SIZE); - src1Pattern.SetValue(0, 0xFFFF << remainLen); - src1Pattern.SetValue(1, (1 << remainLen) - 1); - for (int32_t i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { - src1Pattern.SetValue(i, 0); - } - int32_t orinRemain = baseWidth % blockCount; - int32_t gmOffset = blockCount * (blockLen - PATTERN_OFFSET); + DataCopyParams dataCopyInfo { + static_cast(baseBlockWidth), + static_cast(MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE), + static_cast((baseBlockHeight * BLOCK_CUBE * MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() - + MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) * sizeof(DstT) / ONE_BLK_SIZE), + 0 + }; int32_t dstOffset = 0; int32_t srcOffset = 0; - 
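// A standalone sketch of the burst-descriptor arithmetic used by the NZ -> NZ copy-out
// above: lengths and strides are expressed in 32-byte units, and the code falls back to
// a per-column loop once the destination stride no longer fits the 16-bit stride field.
// Constants mirror the ones in this file; the struct is illustrative, not a real DMA API.
#include <cstdint>

constexpr int32_t kOneBlkSize = 32;   // bytes per burst unit
constexpr int32_t kCubeEdge   = 16;   // fractal edge length (BLOCK_CUBE)

struct BurstPlan {
    uint16_t blockCount;     // number of bursts (one per fractal column)
    uint16_t blockLen;       // burst length, in 32-byte units
    uint16_t dstStride;      // gap between bursts on the destination, in 32-byte units
    bool     perColumnLoop;  // true if the stride overflowed and a loop is needed
};

inline BurstPlan PlanNz2NzCopy(int32_t alignM, int32_t alignBaseUseM,
                               int32_t baseBlockHeight, int32_t baseBlockWidth,
                               int32_t dstElemBytes)
{
    const int64_t stride =
        static_cast<int64_t>(alignM - alignBaseUseM) * dstElemBytes * kCubeEdge / kOneBlkSize;
    BurstPlan plan{};
    plan.blockCount    = static_cast<uint16_t>(baseBlockWidth);
    plan.blockLen      = static_cast<uint16_t>(baseBlockHeight * kCubeEdge * dstElemBytes *
                                               kCubeEdge / kOneBlkSize);
    plan.perColumnLoop = stride >= UINT16_MAX;
    plan.dstStride     = plan.perColumnLoop ? 0 : static_cast<uint16_t>(stride);
    return plan;
}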
int32_t blockOffset = blockLen * blockCount; - for (int32_t i = 0; i < baseHeight; ++i) { - DataCopy(gm[dstOffset], trans[srcOffset], { 1, static_cast(blockLen - 1), 0, 0 }); - if (baseWidth % TWO_TIMES == 0) { - CopyOutEnQue(); - GatherMaskParams gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); - uint64_t rsvdCnt = 0; - GatherMask(transAligin, tmpSrc[((i + 1) * blockLen - PATTERN_OFFSET) * BLOCK_CUBE], - src1Pattern, false, 0, gatherMaskParams, rsvdCnt); - LocalTensor tmpTrans = transAligin.template ReinterpretCast(); - DataCopy(gm[dstOffset + gmOffset + remainLen * DOUBLE_SPACE], tmpTrans, { 1, 1, 0, 0 }); - } else { - CopyLocal2GMNZ2NDDeQue(); - LocalTensor tmpTrans = transAligin.template ReinterpretCast(); - for (int32_t j = 0; j < ONE_BLK_SIZE; ++j) { - tmpTrans.SetValue(j, trans[srcOffset + gmOffset + orinRemain].GetValue(j)); - } - CopyLocal2GMNZ2NDEnQue(); - DataCopy(gm[dstOffset + gmOffset + orinRemain], tmpTrans, { 1, 1, 0, 0 }); - } - PipeBarrier(); - dstOffset += offset; - srcOffset += blockOffset; + for (int32_t i = 0; i < baseHeight; i++) { + DataCopy(co2Local[dstOffset], trans[srcOffset], dataCopyInfo); + dstOffset += dstStride; + srcOffset += MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); } } - template - __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const LocalTensor& trans, - int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, int32_t blockLen, - int32_t blockCount, int32_t offset) -> enable_if_t + template + __aicore__ inline void CopyLocal2GMNZ2NDByVec(const GlobalTensor& gm, const LocalTensor& localBuf, + int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) { - LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template - GetWorkspaceWithOffset(0).template ReinterpretCast(); - int32_t remainLen = baseWidth % blockCount; - CopyTrans2GMEnQue(); - LocalTensor src1Pattern; - src1Pattern = MATMUL_MODULE(LocalWorkspace)->template + uint32_t dimN = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() != 0) ? 
+ MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() : MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); + + LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->template GetWorkspaceWithOffset( - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() / TWO_TIMES).template ReinterpretCast(); - src1Pattern.SetSize(PATTERN_SIZE); - src1Pattern.SetValue(0, 0xFFFF << remainLen); - src1Pattern.SetValue(1, (1 << remainLen) - 1); - for (int32_t i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { - src1Pattern.SetValue(i, 0); + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()) + .template ReinterpretCast(); + int32_t transSize = localBuf.GetSize(); + if constexpr (IsSameType::value || IsSameType::value) { + if (baseBlockWidth % TWO_TIMES != 0) { + transSize += baseBlockHeight * CUBE_MAX_SIZE; + } } - int32_t gmOffset = blockCount * (blockLen - PATTERN_OFFSET); - int32_t dstOffset = 0; - int32_t srcOffset = 0; - int32_t blockOffset = blockLen * blockCount; - for (int32_t i = 0; i < baseHeight; ++i) { - DataCopy(gm[dstOffset], trans[srcOffset], { 1, static_cast(blockLen - 1), 0, 0 }); - GatherMaskParams gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); - uint64_t rsvdCnt = 0; - CopyOutEnQue(); - GatherMask(transAligin, trans[srcOffset + gmOffset], - src1Pattern, false, 0, gatherMaskParams, rsvdCnt); - DataCopy(gm[dstOffset + gmOffset + remainLen], transAligin, { 1, 1, 0, 0 }); - PipeBarrier(); - dstOffset += offset; - srcOffset += blockOffset; + trans.SetSize(transSize); + + int32_t dstOffset; + int32_t dstStride; + int32_t offset; + bool isGmAligned; + if constexpr (enSequentialWrite) { + dstOffset = 0; + dstStride = 0; + offset = baseWidth; + isGmAligned = ((baseWidth % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0); + } else { + int32_t width = baseBlockWidth * MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); + if constexpr (IsSameType::value || IsSameType::value) { + width = width / TWO_TIMES; + } + ASCENDC_ASSERT((dimN >= width), + { KERNEL_LOG(KERNEL_ERROR, "dimN is %d, width is %d, dimN should be no less than width", dimN, width); }); + if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { + isGmAligned = 1; + } else { + isGmAligned = ((dimN % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0 && + (MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN() % + MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0); + } + + dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + dstStride = (dimN - width) * sizeof(DstT) / ONE_BLK_SIZE; + offset = dimN; + } + bool isTargetAligned = (baseWidth % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) == 0; + const bool isComputeLineByLine = (!isGmAligned || dstStride >= UINT16_MAX); + + // 1 if target is not aligned, must copy the unalign data to trans UB + if (MATMUL_MODULE(CopyCubeOutUtils) + ->IsNeedPadUnalignedToTrans(baseWidth, dimN, isComputeLineByLine, isTargetAligned)) { + MATMUL_MODULE(CopyCubeOutUtils) + ->template PadUnalignedToTrans( + trans, gm, dstOffset, isComputeLineByLine, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); } + + // 2. trans nz buffer to nd buffer + MATMUL_MODULE(CopyCubeOutUtils) + ->TransNZ2NDByVec(trans, localBuf, baseBlockHeight, baseBlockWidth, baseHeight, baseWidth, baseBlockWidth); + + // 3. 
copy trans buffer to gm + MATMUL_MODULE(CopyCubeOutUtils)->template CopyTrans2GM(gm, trans, curRow, curCol, baseHeight, + baseWidth, baseBlockHeight, baseBlockWidth, dstOffset, offset, dstStride, isComputeLineByLine, isTargetAligned); } - __aicore__ inline void CopyLocal2GMNZ2NDRegMov(const GlobalTensor& gm, const LocalTensor& localBuf, - LocalTensor& trans, int32_t baseHeight, int32_t baseWidth, int32_t iterIdx, int32_t calcWidth, - int32_t srcTailOffset) + template + __aicore__ inline void CopyLocal2GMNZ2NDOnTheFly(const GlobalTensor& gm, const LocalTensor& localBuf, + int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) { - int32_t dstTailOffset = calcWidth * GetBlockCount(); - int32_t basicOffset = 0; - if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { - DataCopy(gm[dstTailOffset], localBuf[srcTailOffset], { 1, 1, 0, 0 }); - basicOffset = ONE_BLK_SIZE / sizeof(DstT); - } + uint32_t dimN = (MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() != 0) ? + MATMUL_MODULE(MatmulShapeInfo)->GetOrgKc() : MATMUL_MODULE(MatmulShapeInfo)->GetOrgN(); + int32_t calcWidth = baseWidth / MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); + int32_t dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + int32_t blockLen = MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() * sizeof(DstT) / ONE_BLK_SIZE; + int32_t srcRepeatGap = (baseBlockHeight * BLOCK_CUBE * MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount() - + MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()) * sizeof(DstT) / ONE_BLK_SIZE; + int32_t tail = baseWidth % MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); - // reg_mov - srcTailOffset = srcTailOffset + basicOffset - - GetBlockCount() * CeilAlign(baseHeight, GetBlockCount()) + baseWidth % GetBlockCount(); - dstTailOffset = dstTailOffset + basicOffset + baseWidth % GetBlockCount() - GetBlockCount(); - if constexpr (IsSameType::value && - IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_S)); - SetFlag(eventID); - WaitFlag(eventID); + int32_t offset = dimN; + if constexpr (enSequentialWrite) { + dstOffset = 0; + offset = baseWidth; } - int32_t j = 0; - for (int32_t k = 0; k < GetBlockCount() - baseWidth % GetBlockCount(); j++, k++) { - DstT scalar = localBuf.GetValue(srcTailOffset + k); - trans.SetValue(j, scalar); - } - srcTailOffset = iterIdx * GetBlockCount() + calcWidth * GetBlockCount() * CeilAlign(baseHeight, GetBlockCount()); - for (int32_t k = 0; k < baseWidth % GetBlockCount(); j++, k++) { - DstT scalar = localBuf.GetValue(srcTailOffset + k); - trans.SetValue(j, scalar); + + if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { + offset = CeilAlign(offset, MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount()); + calcWidth = baseBlockWidth; + tail = 0; } - CopyLocal2GMNZ2NDEnQue(); - // copy the tail from ub to gm - DataCopy(gm[dstTailOffset], trans, { 1, 1, 0, 0 }); - if constexpr (IsSameType::value && - IsSameType::value) { - CopyLocal2GMNZ2NDDeQue(); + // Allocate MTE2_MTE3 eventId: eventIDMte3ToMte2 + event_t eventIDMte3ToMte2 = static_cast(GetTPipePtr()->AllocEventID()); + int32_t srcOffset = 0; + for (int32_t i = 0; i < baseHeight; i++) { + if (calcWidth > 0) { + DataCopy(gm[dstOffset], localBuf[srcOffset], { static_cast(calcWidth), + static_cast(blockLen), static_cast(srcRepeatGap), 0 }); + if constexpr (IsSameType::value && + IsSameType::value) { + PipeBarrier(); + } + } + + if (tail != 
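// A standalone sketch of the alignment decision made in CopyLocal2GMNZ2NDByVec above:
// the GM destination can be written with strided bursts only if every output row starts
// 32-byte aligned and the row stride fits the descriptor; otherwise the copy degenerates
// to a line-by-line path. Names are illustrative, not the module API.
#include <cstdint>

struct Nd2GmDecision {
    bool targetAligned;   // tile width is a whole number of 32-byte blocks
    bool lineByLine;      // fall back to one copy per output row
};

inline Nd2GmDecision DecideNdCopyPath(int32_t baseWidth, int32_t dimN, int32_t singleCoreN,
                                      int32_t elemsPerBlock, int64_t dstStrideUnits,
                                      bool dstIsAlignedFormat)
{
    const bool gmAligned = dstIsAlignedFormat ||
                           (dimN % elemsPerBlock == 0 && singleCoreN % elemsPerBlock == 0);
    Nd2GmDecision d{};
    d.targetAligned = (baseWidth % elemsPerBlock) == 0;
    d.lineByLine    = !gmAligned || dstStrideUnits >= UINT16_MAX;
    return d;
}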
0) { + MATMUL_MODULE(CopyCubeOutUtils)->CopyLocal2GMNZ2NDOnTheFlyTail( + gm[dstOffset], localBuf, baseHeight, baseWidth, i, calcWidth, eventIDMte3ToMte2); + } + dstOffset += offset; + srcOffset += MATMUL_MODULE(CopyCubeOutUtils)->GetBlockCount(); } + event_t eventID = static_cast(GetTPipePtr()->FetchEventID()); + SetFlag(eventID); + WaitFlag(eventID); + // Release MTE2_MTE3 eventId: eventIDMte3ToMte2 + GetTPipePtr()->ReleaseEventID(eventIDMte3ToMte2); } }; } // namespace Detail } // namespace Impl } // namespace AscendC -#endif // IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_WRAPPER_H \ No newline at end of file +#endif // IMPL_MATMUL_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H \ No newline at end of file diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h b/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h index 9bee0e96..93cf61fb 100644 --- a/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h +++ b/impl/matmul/stage/copy_cube_out/copy_cube_out_fixpipe.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -31,11 +31,11 @@ namespace Detail { We retain the freedom to make incompatible changes, but do not guarantee the stability. CopyCubeOut is only for internal usage, does not support extension or customized specialization! */ -template -class CopyCubeOut::IsNeedUB())>> +template +class CopyCubeOut::IsNeedUB())>> { using DstT = typename C_TYPE::T; - using SrcT = typename GetDstType::Type; + using SrcT = typename GetMmDstType::Type; using FixpipeAdaptor = FixpipeParamsUtil::GetFixpipeParamsType()>; MATMUL_USE_MODULE(Context); @@ -44,6 +44,7 @@ class CopyCubeOut& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) { + if constexpr ((FIXPIPE_MODE == McgShfMode::DUAL_DST_SPLIT_M || (A_TYPE::ibShare && B_TYPE::ibShare)) && + PhyPosIsUB(C_TYPE::pos) && MatmulFeatureTrait::IsSupportL0CToUB() && + FIXPIPE_MODE != McgShfMode::DUAL_DST_SPLIT_N) { + baseHeight = Align(baseHeight, DOUBLE_SIZE); + } if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { CopyOutNZ2ND(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); } else if constexpr (C_TYPE::format == CubeFormat::NZ) { + static_assert(!(FIXPIPE_MODE == McgShfMode::DUAL_DST_SPLIT_N && (A_TYPE::ibShare && B_TYPE::ibShare) && + PhyPosIsUB(C_TYPE::pos) && MatmulFeatureTrait::IsSupportL0CToUB()), + "if split N when copy cube out, NZ is not supported"); CopyOutNZ2NZ(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); +#if defined(__DAV_C310__) + } else if constexpr (C_TYPE::format == CubeFormat::COLUMN_MAJOR) { + CopyOutNZ2DN(dst, co1Local, curRow, curCol, baseHeight, + baseWidth, baseBlockHeight, baseBlockWidth); +#endif } else { ASCENDC_ASSERT(false, {KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type.");}); } @@ -107,19 +121,79 @@ private: int64_t dstOffset = 0; if constexpr (!enSequentialWrite) { stride = GetOrgWidth(); - dstOffset = static_cast(static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) * stride) + - static_cast(curCol * 
MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + if constexpr (!IsBasic(MM_CFG)) { + dstOffset = GetDstOffset(curRow, curCol, baseHeight, stride); + } + } + if constexpr (FIXPIPE_MODE == McgShfMode::DUAL_DST_SPLIT_N && PhyPosIsUB(C_TYPE::pos)) { + if (IsBasic(MM_CFG) && !NoTailN(MM_CFG)) { + stride = baseWidth >> 1; + } else { + stride = stride >> 1; + } } - FixpipeAdaptor fixpipe(baseWidth, - baseHeight, - baseBlockWidth, - baseBlockHeight, - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), - stride); + FixpipeAdaptor fixpipe(baseWidth, baseHeight, baseBlockWidth, baseBlockHeight, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), stride); + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + fixpipe.SetNdParams(MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum(), baseHeight, baseWidth, baseBlockWidth, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + } + SetFixpipeParams(fixpipe); + AlignHeightWithTrans(fixpipe, baseHeight); CopyTensor(dst[dstOffset], co1Local, fixpipe, curCol, baseWidth); } + __aicore__ inline void AlignHeightWithTrans(FixpipeAdaptor& fixpipe, int32_t mSize) + { +#if defined(__DAV_C310__) + if constexpr(IsMxTransEnableWithND) { + constexpr int32_t c0Size = AuxGetC0Size(); + if (MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA()) { + fixpipe.params_.srcStride = Align(mSize, c0Size); + } + } +#endif + } + + __aicore__ inline int64_t GetDstOffset(int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t stride) + { + int64_t dstOffset = 0; + if constexpr (MatmulFeatureTrait::IsSupportL0CToUB() && PhyPosIsUB(C_TYPE::pos) && + (A_TYPE::ibShare && B_TYPE::ibShare)) { + if constexpr (FIXPIPE_MODE != McgShfMode::DUAL_DST_SPLIT_N) { + dstOffset = (static_cast (static_cast( + curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * stride)) >> 1) + + static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + } else { + dstOffset = + static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * stride) + + static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * baseHeight); + dstOffset = dstOffset >> 1; + } + } else { + dstOffset = + static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * stride)+ + static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + } + return dstOffset; + } + + __aicore__ inline void SetFixpipeParams(FixpipeAdaptor &fixpipe) { + if constexpr (PhyPosIsUB(C_TYPE::pos) && MatmulFeatureTrait::IsSupportL0CToUB()) { + fixpipe.SetSubBlockId(MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx()); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare && + FIXPIPE_MODE == McgShfMode::SINGLE_DST_MODE) { + fixpipe.SetMcgShfMode(McgShfMode::DUAL_DST_SPLIT_M); + } else { + fixpipe.SetMcgShfMode(FIXPIPE_MODE); + } + } + } +#if defined(__DAV_C310__) + constexpr static uint32_t B4_CHANNEL_MERGE_FACTOR = 4; + constexpr static uint32_t B8_CHANNEL_MERGE_FACTOR = 2; template __aicore__ inline void CopyOutNZ2NZ(const T& dst, const LocalTensor& co1Local, int32_t curRow, int32_t curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, @@ -127,12 +201,86 @@ private: { int64_t dstOffset = 0; uint32_t stride = 0; + constexpr uint32_t CHANNEL_MERGE_FACTOR = IsSuppportB4() ? B4_CHANNEL_MERGE_FACTOR : + ((IsTypeOneOfV || IsSuppportB8()) ? 
B8_CHANNEL_MERGE_FACTOR : 1); + if constexpr (!enSequentialWrite) { + stride = static_cast(GetOrgM() * CHANNEL_MERGE_FACTOR * BLOCK_CUBE); + if constexpr (!IsBasic(MM_CFG)) { + dstOffset = static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()) * GetOrgM() + + static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) * CHANNEL_MERGE_FACTOR * BLOCK_CUBE; + } + if constexpr (PhyPosIsUB(C_TYPE::pos) && ((A_TYPE::ibShare && B_TYPE::ibShare) || + FIXPIPE_MODE == McgShfMode::DUAL_DST_SPLIT_M || FIXPIPE_MODE == McgShfMode::DUAL_DST_SPLIT_N)) { + dstOffset = dstOffset >> 1; + } + } else { + stride = static_cast((baseBlockHeight * BLOCK_CUBE) * CHANNEL_MERGE_FACTOR * BLOCK_CUBE); + } + FixpipeAdaptor fixpipe(baseWidth, baseHeight, baseBlockWidth, baseBlockHeight, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), stride); + SetFixpipeParams(fixpipe); + CopyTensor(dst[dstOffset], co1Local, fixpipe, curCol, baseWidth); + } + + template + __aicore__ inline void CopyTensor(const T& dst, const LocalTensor& co1Local, + FixpipeAdaptor& fixpipe, const int32_t curN = 0, const int32_t baseUseN = 0) + { + if (MATMUL_MODULE(MatmulQuantProcessor)->IsQuantSenario()) { + fixpipe.SetQuantMode(MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode()); + if (MATMUL_MODULE(MatmulQuantProcessor)->IsPerChannelSenario()) { + LocalTensor quantTensor; + MATMUL_MODULE(MatmulQuantProcessor)->CopyQuantTensor(quantTensor, curN, baseUseN); + fixpipe.template FixpipeOut(dst, co1Local, quantTensor); + MATMUL_MODULE(MatmulQuantProcessor)->FreeQuantTensor(quantTensor); + } else { + fixpipe.SetQuantScalar(MATMUL_MODULE(MatmulQuantProcessor)->GetQuantScalarValue()); + fixpipe.template FixpipeOut(dst, co1Local); + } + } else { + fixpipe.SetCastMode(); + fixpipe.template FixpipeOut(dst, co1Local); + } + } + + template + __aicore__ inline uint32_t GetOrgHeight() + { + return GetOrgM(); + } + + template + __aicore__ inline void CopyOutNZ2DN(const T& dst, const LocalTensor& co1Local, int32_t curRow, int32_t curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + { + static_assert(PhyPosIsGM(C_TYPE::pos), "Unsupported TPosition for output matrix."); + auto stride = baseHeight; + int64_t dstOffset = 0; + if constexpr (!enSequentialWrite) { + stride = GetOrgHeight(); + if constexpr (!IsBasic(MM_CFG)) { + dstOffset = static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * + stride)+ static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()); + } + } + FixpipeAdaptor fixpipe(baseWidth, baseHeight, baseBlockWidth, baseBlockHeight, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), stride); + SetFixpipeParams(fixpipe); + CopyTensor(dst[dstOffset], co1Local, fixpipe, curCol, baseWidth); + } +#else + template + __aicore__ inline void CopyOutNZ2NZ(const T& dst, const LocalTensor& co1Local, int32_t curRow, int32_t curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) + { + int64_t dstOffset = 0; + uint32_t stride; if constexpr (!enSequentialWrite) { if constexpr (!ToMatmulConfig(MM_CFG).isEnableChannelSplit) { dstOffset = static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()) * GetOrgM() + static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) * BLOCK_CUBE; - stride = static_cast((GetOrgM() - baseHeight) * - BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); + stride = static_cast(GetOrgM() * BLOCK_CUBE * 
sizeof(DstT) / ONE_BLK_SIZE); } else { dstOffset = static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()) * Ceil(GetOrgM(), BLOCK_CUBE) * BLOCK_CUBE + static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) * B32_C0SIZE; @@ -141,7 +289,7 @@ private: } } else { if constexpr (!ToMatmulConfig(MM_CFG).isEnableChannelSplit) { - stride = static_cast((baseBlockHeight * BLOCK_CUBE - baseHeight) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); + stride = static_cast((baseBlockHeight * BLOCK_CUBE) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); } else { stride = static_cast((baseBlockHeight * BLOCK_CUBE - baseHeight) * B32_C0SIZE * sizeof(DstT) / ONE_BLK_SIZE); } @@ -154,12 +302,12 @@ private: baseHeight = CeilAlign(baseHeight, BLOCK_CUBE); } } - FixpipeAdaptor fixpipe(baseWidth, - baseHeight, - baseBlockWidth, - baseBlockHeight, - MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), - stride); + if constexpr (ToMatmulConfig(MM_CFG).isNBatchOut) { + baseWidth *= MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum(); + baseBlockWidth *= MATMUL_MODULE(BatchLoop)->GetBatchOutCacheNum(); + } + FixpipeAdaptor fixpipe(baseWidth, baseHeight, baseBlockWidth, baseBlockHeight, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), stride); CopyTensor(dst[dstOffset], co1Local, fixpipe, curCol, baseWidth); } @@ -168,7 +316,7 @@ private: FixpipeAdaptor& fixpipe, const int32_t curN = 0, const int32_t baseUseN = 0) { fixpipe.SetCastMode(); - if constexpr (IsQuantSenario()) { + if constexpr (IsQuantSenario()) { fixpipe.SetQuantMode(MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode()); LocalTensor quantTensor; if (MATMUL_MODULE(MatmulQuantProcessor)->IsPerChannelSenario()) { @@ -183,7 +331,7 @@ private: fixpipe.template FixpipeOut(dst, co1Local); } } - +#endif template __aicore__ inline uint32_t GetOrgWidth() { @@ -283,8 +431,7 @@ private: dstStrideIn = baseWidth; nSize = static_cast(baseWidth); } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - dstStrideIn = static_cast((baseBlockHeight * BLOCK_CUBE - - baseHeight) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); + dstStrideIn = static_cast(baseBlockHeight * BLOCK_CUBE * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); nSize = 0; } } else { @@ -295,7 +442,7 @@ private: nSize = static_cast(baseWidth); dstOffset = static_cast(static_cast(curRow * baseM) * dstStrideIn) + static_cast(curCol * baseN); } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - dstStrideIn = static_cast((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() - baseHeight) * + dstStrideIn = static_cast((MATMUL_MODULE(MatmulShapeInfo)->GetOrgM()) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); nSize = 0; dstOffset = curCol * baseN * MATMUL_MODULE(MatmulShapeInfo)->GetOrgM() + curRow * baseM * BLOCK_CUBE; diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out_intf.h b/impl/matmul/stage/copy_cube_out/copy_cube_out_intf.h index b952c5b0..c78e2540 100644 --- a/impl/matmul/stage/copy_cube_out/copy_cube_out_intf.h +++ b/impl/matmul/stage/copy_cube_out/copy_cube_out_intf.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -26,11 +26,12 @@ namespace Detail { We retain the freedom to make incompatible changes, but do not guarantee the stability. 
CopyCubeOut is only for internal usage, does not support extension or customized specialization! */ -template +template class...> class CopyCubeOut { using DstT = typename C_TYPE::T; - using SrcT = typename GetDstType::Type; + using SrcT = typename GetMmDstType::Type; public: /** diff --git a/impl/matmul/stage/copy_cube_out/copy_cube_out_utils.h b/impl/matmul/stage/copy_cube_out/copy_cube_out_utils.h index 8daca58e..99079116 100644 --- a/impl/matmul/stage/copy_cube_out/copy_cube_out_utils.h +++ b/impl/matmul/stage/copy_cube_out/copy_cube_out_utils.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -24,57 +24,39 @@ namespace Detail { const static uint8_t FIX_PIPE_UNIT_FLAG = 3; template -struct FixpipeParamsUtil { - using DstT = typename C_TYPE::T; - using SrcT = typename GetDstType::Type; - using TYPE = FixpipeParamsV220; - -public: - __aicore__ inline ~FixpipeParamsUtil() = default; - - __aicore__ inline FixpipeParamsUtil(int32_t nSize, int32_t mSize, - int32_t nSizeBlock, int32_t mSizeBlock, int32_t baseHeight, int32_t dstStride) - {} - - __aicore__ inline void SetQuantMode(QuantMode_t quantMode) {} - - __aicore__ inline void SetQuantScalar(uint64_t scalar) {} - - template - __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal, - const LocalTensor& quantTensor) {} - - template - __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal) {} - -public: - TYPE params_; -}; - - -template -struct FixpipeParamsUtil +struct FixpipeParamsUtil { using DstT = typename C_TYPE::T; - using SrcT = typename GetDstType::Type; + using SrcT = typename GetMmDstType::Type; +#if defined(__DAV_C310__) + using TYPE = typename AscendC::Conditional, + typename AscendC::Conditional, FixpipeParamsC310>::type>::type; +#else using TYPE = FixpipeParamsV220; - +#endif public: __aicore__ inline ~FixpipeParamsUtil() = default; __aicore__ inline FixpipeParamsUtil(int32_t nSize, int32_t mSize, int32_t nSizeBlock, int32_t mSizeBlock, int32_t baseHeight, int32_t dstStride) { - if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::COLUMN_MAJOR) { params_.nSize = static_cast(nSize); + } else if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { + constexpr uint32_t blockCount = ONE_BLK_SIZE / sizeof(DstT); + params_.nSize = static_cast(Ceil(nSize, blockCount) * blockCount); + if constexpr (MatmulFeatureTrait::IsSupportL0CToUB()) { + dstStride = static_cast(Ceil(dstStride, blockCount) * blockCount); + } } else if constexpr (C_TYPE::format == CubeFormat::NZ) { if constexpr (!ToMatmulConfig(MM_CFG).isEnableChannelSplit) { params_.nSize = static_cast(nSizeBlock * BLOCK_CUBE); - dstStride = dstStride + static_cast(mSize * BLOCK_CUBE * sizeof(SrcT) / ONE_BLK_SIZE) * - sizeof(DstT) / sizeof(SrcT); } else { params_.nSize = static_cast(nSize); - params_.isChannelSplit = true; + if constexpr (!MatmulFeatureTrait::IsSupportL0CToUB()) { + params_.isChannelSplit = true; + } } } params_.mSize = static_cast(mSize); @@ -83,6 +65,11 @@ public: if constexpr(EnUnitFlag(MM_CFG)) { params_.unitFlag = FIX_PIPE_UNIT_FLAG; } +#if defined(__DAV_C310__) + if 
constexpr(C_TYPE::format == CubeFormat::COLUMN_MAJOR) { + params_.params = {1, 0, 0, 1}; + } +#endif } __aicore__ inline void SetQuantMode(QuantMode_t quantMode) @@ -95,13 +82,56 @@ public: params_.deqScalar = scalar; } +#if defined(__DAV_C310__) + __aicore__ inline void SetMcgShfMode(McgShfMode mode) { + params_.dualDstCtl = static_cast(mode); + } + + __aicore__ inline void SetSubBlockId(uint8_t id) { + params_.subBlockId = id; + } +#endif + + __aicore__ inline void SetNdParams(int32_t ndNum, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockWidth, + int32_t baseM, int32_t baseN) + { +#if defined(__DAV_C310__) + params_.params.ndNum = static_cast(ndNum); + params_.params.srcNdStride = static_cast(baseM * baseBlockWidth); + if constexpr ((C_TYPE::layout == LayoutMode::BSNGD) || (C_TYPE::layout == LayoutMode::SBNGD)) { + params_.params.dstNdStride = static_cast(baseWidth); + } else { + params_.params.dstNdStride = static_cast(baseHeight * baseWidth); + } +#else + constexpr static int32_t fractalSize = 1024; + params_.ndNum = static_cast(ndNum); + params_.srcNdStride = static_cast(baseM * baseN * sizeof(SrcT) / fractalSize); + if constexpr ((C_TYPE::layout == LayoutMode::BSNGD) || (C_TYPE::layout == LayoutMode::SBNGD)) { + params_.dstNdStride = static_cast(baseWidth); + } else { + params_.dstNdStride = static_cast(baseHeight * baseWidth); + } +#endif + } + template __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal, const LocalTensor& quantTensor) { if constexpr (C_TYPE::format == CubeFormat::NZ) { - Fixpipe(dst, colLocal, quantTensor, params_); + if constexpr (MatmulFeatureTrait::IsSupportL0CToUB() && PhyPosIsUB(C_TYPE::pos)) { + Fixpipe(dst, colLocal, quantTensor, params_); + } else { + Fixpipe(dst, colLocal, quantTensor, params_); + } + } else if constexpr (C_TYPE::format == CubeFormat::COLUMN_MAJOR) { + Fixpipe(dst, colLocal, quantTensor, params_); } else { - Fixpipe(dst, colLocal, quantTensor, params_); + if constexpr (MatmulFeatureTrait::IsSupportL0CToUB() && PhyPosIsUB(C_TYPE::pos)) { + Fixpipe(dst, colLocal, quantTensor, params_); + } else { + Fixpipe(dst, colLocal, quantTensor, params_); + } } } @@ -109,9 +139,19 @@ public: __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal) { if constexpr (C_TYPE::format == CubeFormat::NZ) { - Fixpipe(dst, colLocal, params_); + if constexpr (MatmulFeatureTrait::IsSupportL0CToUB() && PhyPosIsUB(C_TYPE::pos)) { + Fixpipe(dst, colLocal, params_); + } else { + Fixpipe(dst, colLocal, params_); + } + } else if constexpr (C_TYPE::format == CubeFormat::COLUMN_MAJOR) { + Fixpipe(dst, colLocal, params_); } else { - Fixpipe(dst, colLocal, params_); + if constexpr (MatmulFeatureTrait::IsSupportL0CToUB() && PhyPosIsUB(C_TYPE::pos)) { + Fixpipe(dst, colLocal, params_); + } else { + Fixpipe(dst, colLocal, params_); + } } } -- Gitee From cb1ca1260238ed25dc264208fe3a48671fde2d9f Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:38:43 +0000 Subject: [PATCH 31/56] add Signed-off-by: jiangchengcheng-on --- .../quant/quant_processor_datacopy.h | 2 +- .../quant/quant_processor_fixpipe.h | 101 +++++++++++++----- .../quant/quant_processor_intf.h | 11 +- .../quant/quant_processor_utils.h | 25 +++-- 4 files changed, 101 insertions(+), 38 deletions(-) diff --git a/impl/matmul/stage/copy_cube_out/quant/quant_processor_datacopy.h b/impl/matmul/stage/copy_cube_out/quant/quant_processor_datacopy.h index d8f40733..5543f0d9 100644 --- a/impl/matmul/stage/copy_cube_out/quant/quant_processor_datacopy.h 
+++ b/impl/matmul/stage/copy_cube_out/quant/quant_processor_datacopy.h @@ -26,7 +26,7 @@ namespace Impl { namespace Detail { template -class MatmulQuantProcessor::Type, typename C_TYPE::T>() && +class MatmulQuantProcessor() && MatmulFeatureTrait::IsNeedUB())>> { using SrcT = typename A_TYPE::T; diff --git a/impl/matmul/stage/copy_cube_out/quant/quant_processor_fixpipe.h b/impl/matmul/stage/copy_cube_out/quant/quant_processor_fixpipe.h index 56a21584..7eb0c574 100644 --- a/impl/matmul/stage/copy_cube_out/quant/quant_processor_fixpipe.h +++ b/impl/matmul/stage/copy_cube_out/quant/quant_processor_fixpipe.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -26,12 +26,15 @@ namespace Impl { namespace Detail { template -class MatmulQuantProcessor::Type, typename C_TYPE::T>() - && !MatmulFeatureTrait::IsNeedUB())>> +class MatmulQuantProcessor() + && !MatmulFeatureTrait::IsNeedUB() + && !HasScalePosition::value)>> { + MATMUL_USE_MODULE(KLoop); + using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; public: __aicore__ inline MatmulQuantProcessor() {} @@ -50,42 +53,76 @@ public: return quantMode_; } +#if defined(__DAV_C310__) + __aicore__ inline bool IsQuantSenario() + { + return isPerChannel_ || isPerTensor_; + } +#endif + __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) { - if constexpr (IsSameType::value && IsSameType::value) { + isPerChannel_ = true; + quantTensor_ = quantTensor; + + if constexpr (IsSameTypeV && IsSameTypeV) { quantMode_ = QuantMode_t::VDEQF16; - isPerChannel_ = true; - quantTensor_ = quantTensor; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { + } else if constexpr (IsSameTypeV && IsTypeOneOfV) { quantMode_ = QuantMode_t::VREQ8; - isPerChannel_ = true; - quantTensor_ = quantTensor; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { + } else if constexpr (IsSameTypeV && IsTypeOneOfV) { quantMode_ = QuantMode_t::VQF322B8_PRE; - isPerChannel_ = true; - quantTensor_ = quantTensor; } +#if defined(__DAV_C310__) + else if constexpr (IsSameTypeV && IsSameTypeV) { + quantMode_ = QuantMode_t::VQS322BF16_PRE; + } else if constexpr (IsSameTypeV && IsTypeOneOfV) { + quantMode_ = QuantMode_t::VQF322FP8_PRE; + } else if constexpr (IsSameTypeV && IsSameTypeV) { + quantMode_ = QuantMode_t::VQF322HIF8_PRE; + } else if constexpr (IsTypeOneOfV && + IsSameTypeV) { + quantMode_ = QuantMode_t::VQF322F16_PRE; + } else if constexpr (IsTypeOneOfV && + IsSameTypeV) { + quantMode_ = QuantMode_t::VQF322BF16_PRE; + } else if (IsTypeOneOfV && + IsSameTypeV) { + quantMode_ = QuantMode_t::VQF322F32_PRE; + } +#endif } __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) { - if constexpr (IsSameType::value && IsSameType::value) { - quantMode_ = QuantMode_t::DEQF16; - isPerTensor_ = true; - quantScalar_ = quantScalar; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { + isPerTensor_ = true; + quantScalar_ = quantScalar; + + if constexpr (IsSameTypeV && IsSameTypeV) { + quantMode_ = QuantMode_t::DEQF16; + } else if constexpr (IsSameTypeV && 
IsTypeOneOfV) { quantMode_ = QuantMode_t::REQ8; - isPerTensor_ = true; - quantScalar_ = quantScalar; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { + } else if constexpr (IsSameTypeV && IsTypeOneOfV) { quantMode_ = QuantMode_t::QF322B8_PRE; - isPerTensor_ = true; - quantScalar_ = quantScalar; } + +#if defined(__DAV_C310__) + else if constexpr (IsSameTypeV && IsSameTypeV) { + quantMode_ = QuantMode_t::QS322BF16_PRE; + } else if constexpr (IsSameTypeV && IsTypeOneOfV) { + quantMode_ = QuantMode_t::QF322FP8_PRE; + } else if constexpr (IsSameTypeV && IsSameTypeV) { + quantMode_ = QuantMode_t::QF322HIF8_PRE; + } else if constexpr (IsTypeOneOfV && + IsSameTypeV) { + quantMode_ = QuantMode_t::QF322F16_PRE; + } else if constexpr (IsTypeOneOfV && + IsSameTypeV) { + quantMode_ = QuantMode_t::QF322BF16_PRE; + } else if (IsTypeOneOfV && + IsSameTypeV) { + quantMode_ = QuantMode_t::QF322F32_PRE; + } +#endif } __aicore__ inline void CopyQuantTensor(LocalTensor& quantTensor, @@ -93,10 +130,16 @@ public: { if (isPerChannel_) { quantTensor = qidFixPipe_.template AllocTensor(); + int64_t quantTensorGMOffset; + if constexpr (MatmulFeatureTrait::IsSupportL0CToUB() && ToMatmulConfig(MM_CFG).isPartialOutput) { + quantTensorGMOffset = (curN * MATMUL_MODULE(KLoop)->GetInnerIter() + MATMUL_MODULE(KLoop)->GetInnerIdx()) * baseN_; + } else { + quantTensorGMOffset = curN * baseN_; + } if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - CopyDeqTensorToL1(quantTensor, quantTensor_[curN * baseN_], baseUseN); + CopyDeqTensorToL1(quantTensor, quantTensor_[quantTensorGMOffset], baseUseN); } else { - CopyDeqTensorToL1(quantTensor, quantTensor_[curN * baseN_], DivCeil(baseUseN, BLOCK_CUBE) * BLOCK_CUBE); + CopyDeqTensorToL1(quantTensor, quantTensor_[quantTensorGMOffset], DivCeil(baseUseN, BLOCK_CUBE) * BLOCK_CUBE); } qidFixPipe_.EnQue(quantTensor); qidFixPipe_.DeQue(); diff --git a/impl/matmul/stage/copy_cube_out/quant/quant_processor_intf.h b/impl/matmul/stage/copy_cube_out/quant/quant_processor_intf.h index 6000741c..1f045fc2 100644 --- a/impl/matmul/stage/copy_cube_out/quant/quant_processor_intf.h +++ b/impl/matmul/stage/copy_cube_out/quant/quant_processor_intf.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. @@ -50,6 +50,15 @@ public: */ __aicore__ inline void SetQuantVector(const GlobalTensor& tensor) {} + /** + * @description: if is quant senario + * @return: quant senario or not + */ + __aicore__ inline bool IsQuantSenario() + { + return false; + } + /** * @description: Get the quant mode * @return: quant mode diff --git a/impl/matmul/stage/copy_cube_out/quant/quant_processor_utils.h b/impl/matmul/stage/copy_cube_out/quant/quant_processor_utils.h index 918896b3..3039d11b 100644 --- a/impl/matmul/stage/copy_cube_out/quant/quant_processor_utils.h +++ b/impl/matmul/stage/copy_cube_out/quant/quant_processor_utils.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. 
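// A standalone sketch of the compile-time quant-mode selection pattern used by
// SetQuantVector / SetQuantScalar above. The template arguments of the type checks are
// not visible in this dump, so the (accumulator, output) pairs below are assumptions
// inferred from the mode names (DEQF16: int32 -> half, REQ8: int32 -> int8/uint8,
// QF322B8_PRE: float -> int8/uint8); std type traits stand in for IsSameTypeV /
// IsTypeOneOfV, and Half is a placeholder for the device half type.
#include <cstdint>
#include <type_traits>

enum class QuantMode { None, DeqF16, VdeqF16, Req8, Vreq8, Qf322B8Pre, Vqf322B8Pre };

template <typename T, typename... Ts>
inline constexpr bool IsOneOfV = (std::is_same_v<T, Ts> || ...);

struct Half {};  // stand-in for the device half type

// PerChannel == true corresponds to SetQuantVector, false to SetQuantScalar.
template <typename AccT, typename DstT, bool PerChannel>
constexpr QuantMode SelectQuantMode()
{
    if constexpr (std::is_same_v<AccT, int32_t> && std::is_same_v<DstT, Half>) {
        return PerChannel ? QuantMode::VdeqF16 : QuantMode::DeqF16;
    } else if constexpr (std::is_same_v<AccT, int32_t> && IsOneOfV<DstT, int8_t, uint8_t>) {
        return PerChannel ? QuantMode::Vreq8 : QuantMode::Req8;
    } else if constexpr (std::is_same_v<AccT, float> && IsOneOfV<DstT, int8_t, uint8_t>) {
        return PerChannel ? QuantMode::Vqf322B8Pre : QuantMode::Qf322B8Pre;
    } else {
        return QuantMode::None;
    }
}

static_assert(SelectQuantMode<int32_t, int8_t, true>() == QuantMode::Vreq8);
static_assert(SelectQuantMode<float, uint8_t, false>() == QuantMode::Qf322B8Pre);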
You may not use this file except in compliance with the License. @@ -20,16 +20,27 @@ namespace AscendC { namespace Impl { namespace Detail { -template +template __aicore__ inline constexpr static bool IsQuantSenario() { - if constexpr (IsSameType::value && IsSameType::value) { + using L0cT = typename GetMmDstType::Type; +#if defined(__DAV_C310__) + if constexpr (IsTypeOneOfV && + !IsTypeOneOfV) { + return false; + } + if constexpr (IsTypeOneOfV && + IsTypeOneOfV) { + return true; + } + if constexpr (IsSameTypeV && IsSameTypeV) { return true; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { + } +#endif + + if constexpr (IsSameTypeV && IsTypeOneOfV) { return true; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { + } else if constexpr (IsSameTypeV && IsTypeOneOfV) { return true; } return false; -- Gitee From 10122b11d6d00b4bcd447b6755c906ffe1ddba14 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:41:10 +0000 Subject: [PATCH 32/56] add Signed-off-by: jiangchengcheng-on --- .../stage/split/load_to_l0a/load_to_l0a.h | 1 + .../split/load_to_l0a/load_to_l0a_basic.h | 4 ++- .../split/load_to_l0a/load_to_l0a_gemv.h | 35 ++++++++++++++----- .../split/load_to_l0a/load_to_l0a_intf.h | 9 ++++- .../split/load_to_l0a/load_to_l0a_load2d.h | 12 ++++--- .../split/load_to_l0a/load_to_l0a_loadInstr.h | 15 ++++---- 6 files changed, 55 insertions(+), 21 deletions(-) diff --git a/impl/matmul/stage/split/load_to_l0a/load_to_l0a.h b/impl/matmul/stage/split/load_to_l0a/load_to_l0a.h index 0b4f7ab7..2f350f22 100644 --- a/impl/matmul/stage/split/load_to_l0a/load_to_l0a.h +++ b/impl/matmul/stage/split/load_to_l0a/load_to_l0a.h @@ -19,5 +19,6 @@ #include "load_to_l0a_basic.h" #include "load_to_l0a_gemv.h" #include "load_to_l0a_load2d.h" +#include "load_to_l0a_load2dV2.h" #endif // IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0A_H diff --git a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_basic.h b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_basic.h index 2d5688d4..56e32eca 100644 --- a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_basic.h +++ b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_basic.h @@ -29,6 +29,7 @@ class LoadToL0A::IsSupportLoad2dV2()>> { using A_T = typename A_TYPE::T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0A() {}; __aicore__ inline ~LoadToL0A() {}; @@ -47,7 +48,8 @@ public: __aicore__ inline void Load(const LocalTensor &l0A, const LocalTensor &l1A, uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, - bool isATranspose) const + bool isATranspose, const LocalTensor &l1AAuxMatrix = {}, uint16_t aAuxL1K = 0, + uint16_t aAuxL1KOffset = 0) const { LoadData3DParamsV2Pro loadData3DV2; if (isATranspose) { diff --git a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_gemv.h b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_gemv.h index bc024cb0..b5452717 100644 --- a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_gemv.h +++ b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_gemv.h @@ -26,6 +26,7 @@ class LoadToL0A() == GemvMode::SCALAR && !MatmulFeatureTrait::IsNeedUB()>> { using A_T = typename A_TYPE::T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0A() {}; __aicore__ inline ~LoadToL0A() {}; @@ -45,7 +46,8 @@ public: __aicore__ inline void Load(const LocalTensor &l0A, const LocalTensor &l1A, uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t 
aL1KOffset, - bool isATranspose) const + bool isATranspose, const LocalTensor &l1AAuxMatrix = {}, uint16_t aAuxL1K = 0, + uint16_t aAuxL1KOffset = 0) const { ASSERT(madM == 1); InitConstValueParams initConstValueParams {1, (uint16_t)ConstCeil(madK, BLOCK_CUBE * c0Size_), @@ -63,6 +65,7 @@ class LoadToL0A() == GemvMode::VECTOR>> { using A_T = typename A_TYPE::T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0A() {}; __aicore__ inline ~LoadToL0A() {}; @@ -73,16 +76,32 @@ enable_if_t() == GemvMode::VECTOR>> __aicore__ inline void Load(LocalTensor &l0A, const LocalTensor &l1A, uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, - bool isATranspose) + bool isATranspose, const LocalTensor &l1AAuxMatrix = {}, uint16_t aAuxL1K = 0, + uint16_t aAuxL1KOffset = 0) const { - int FracSize = BYTE_PER_FRACTAL / sizeof(A_T); - int repeat = Ceil(madK, FracSize); - LoadData2dParams loadDataParams; - loadDataParams.repeatTimes = repeat; - loadDataParams.srcStride = 1; - LoadData(l0A[0], l1A[aL1KOffset], loadDataParams); + if constexpr (MatmulFeatureTrait::IsSupportLoad2dV2()) { + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = 0; + loadDataParams.kStartPosition = 0; + loadDataParams.mStep = CeilDiv(madM, HW_M0); + loadDataParams.kStep = CeilDiv(madK, c0Size_); + loadDataParams.srcStride = CeilDiv(aL1M, ALIGN_NUM); + loadDataParams.dstStride = CeilDiv(madM, ALIGN_NUM); + loadDataParams.ifTranspose = false; + LoadData(l0A, l1A[aL1KOffset], loadDataParams); + } else { + int FracSize = BYTE_PER_FRACTAL / sizeof(A_T); + int repeat = Ceil(madK, FracSize); + LoadData2dParams loadDataParams; + loadDataParams.repeatTimes = repeat; + loadDataParams.srcStride = 1; + LoadData(l0A[0], l1A[aL1KOffset], loadDataParams); + } return; } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); }; } // namespace Detail diff --git a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_intf.h b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_intf.h index b505f5b8..dea85b98 100644 --- a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_intf.h +++ b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_intf.h @@ -18,6 +18,8 @@ #include "../load_to_l0_utils.h" +#include "../load_to_l0_utils.h" + namespace AscendC { namespace Impl { namespace Detail { @@ -30,6 +32,7 @@ template class LoadToL0A { using A_T = typename A_TYPE::T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0A() = default; __aicore__ inline ~LoadToL0A() = default; @@ -61,11 +64,15 @@ public: * @param: aL1MOffset: Offset of the basic block relative to the original aMatrix in the m direction * @param: aL1KOffset: Offset of the basic block relative to the original aMatrix in the k direction * @param: isATranspose: A matrix transpose status + * @param: l1AAuxMatrix: A auxiliary matrix in L1 + * @param: aAuxL1K: the length of K_axis for original auxiliary matrix in L1 + * @param: aAuxL1KOffset: Offset of the basic block relative to the original auxiliary matrix in the k direction * @return: void */ __aicore__ inline void Load(const LocalTensor &l0A, const LocalTensor &l1A, uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, - bool isATranspose) const {}; + bool isATranspose, const LocalTensor &l1AAuxMatrix = {}, uint16_t aAuxL1K = 0, + uint16_t aAuxL1KOffset = 0) const {}; }; } // namespace Detail diff --git a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2d.h 
b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2d.h index 8ade670a..b16f227d 100644 --- a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2d.h +++ b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2d.h @@ -28,6 +28,7 @@ class LoadToL0A() == LoadInstrType::LOAD2D)>> { using A_T = typename A_TYPE::T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0A() {}; __aicore__ inline ~LoadToL0A() {}; @@ -38,7 +39,8 @@ public: __aicore__ inline void Load(const LocalTensor &dst, const LocalTensor &aMatrix, uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, - bool isATranspose) const + bool isATranspose, const LocalTensor &l1AAuxMatrix = {}, uint16_t aAuxL1K = 0, + uint16_t aAuxL1KOffset = 0) const { uint16_t blockUseM = Ceil(madM, BLOCK_CUBE); uint16_t blockUseK = Ceil(madK, c0Size_); @@ -51,16 +53,16 @@ public: } } if (isATranspose) { - TransposeLoad(dst, aMatrix, aL1K, blockUseM, blockUseK, srcL1Offset); + TransLoadDataToL0(dst, aMatrix, aL1K, blockUseM, blockUseK, srcL1Offset); } else { - NoneTransposeLoad(dst, aMatrix, aL1M, isATranspose, blockUseM, blockUseK, srcL1Offset); + LoadDataToL0(dst, aMatrix, aL1M, isATranspose, blockUseM, blockUseK, srcL1Offset); } } private: constexpr static int32_t factor_ = AuxGetFactor(); constexpr static int32_t c0Size_ = AuxGetC0Size(); - __aicore__ inline void TransposeLoad(const LocalTensor &dst, const LocalTensor &aMatrix, + __aicore__ inline void TransLoadDataToL0(const LocalTensor &dst, const LocalTensor &aMatrix, uint16_t aL1K, uint16_t blockUseM, uint16_t blockUseK, int srcL1Offset) const { // startIndex, repeatTimes, srcStride, sid, dstGap, ifTranspose, addrmode @@ -81,7 +83,7 @@ private: } } - __aicore__ inline void NoneTransposeLoad(const LocalTensor &dst, const LocalTensor &aMatrix, + __aicore__ inline void LoadDataToL0(const LocalTensor &dst, const LocalTensor &aMatrix, uint16_t aL1M, bool isATranspose, uint16_t blockUseM, uint16_t blockUseK, int srcL1Offset) const { LoadData2dParams loadDataParams; diff --git a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_loadInstr.h b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_loadInstr.h index 9c0c9eac..9d8474dd 100644 --- a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_loadInstr.h +++ b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_loadInstr.h @@ -25,9 +25,11 @@ template class LoadToL0A() == GemvMode::MATRIX && !(DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) && - MatmulFeatureTrait::IsSupportLoad3dV2()>> + (MatmulFeatureTrait::IsSupportLoad3dV2()) && + !(MatmulFeatureTrait::IsSupportLoad2dV2())>> { using A_T = typename A_TYPE::T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0A() {}; __aicore__ inline ~LoadToL0A() {}; @@ -43,15 +45,16 @@ public: __aicore__ inline void Load(const LocalTensor &l0A, const LocalTensor &l1A, uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, - bool isATranspose) const + bool isATranspose, const LocalTensor &l1AAuxMatrix = {}, uint16_t aAuxL1K = 0, + uint16_t aAuxL1KOffset = 0) const { if constexpr (isFmatrixUpdate_) { SetFmatrix(isATranspose, aL1K, aL1M); } if (isATranspose) { - TransposeLoad(l0A, l1A, aL1M, aL1K, madM, madK, aL1MOffset, aL1KOffset); + TransLoadDataToL0(l0A, l1A, aL1M, aL1K, madM, madK, aL1MOffset, aL1KOffset); } else { - NoneTransposeLoad(l0A, l1A, aL1K, madM, madK, aL1MOffset, aL1KOffset); + LoadDataToL0(l0A, l1A, aL1K, madM, madK, aL1MOffset, aL1KOffset); } } 
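The Load() interface in the hunks above gains three defaulted trailing parameters (l1AAuxMatrix, aAuxL1K, aAuxL1KOffset) so the new load2dV2/MX paths can receive per-block scale data while existing call sites keep compiling unchanged. A minimal, self-contained sketch of that compatibility pattern in plain standard C++ (stand-in types, not AscendC code):

#include <cstdint>

struct Tensor {};  // stand-in for LocalTensor<...>, for illustration only

// Trailing parameters default to "empty", mirroring the widened Load() signature.
void Load(const Tensor &dst, const Tensor &src, uint16_t madK, uint16_t kOffset,
          bool transpose, const Tensor &aux = {}, uint16_t auxK = 0, uint16_t auxKOffset = 0)
{
    (void)dst; (void)src; (void)madK; (void)kOffset; (void)transpose;
    (void)aux; (void)auxK; (void)auxKOffset;
}

int main()
{
    Tensor l0, l1, scale;
    Load(l0, l1, 64, 0, false);              // pre-existing call shape: unchanged
    Load(l0, l1, 64, 0, false, scale, 2, 0); // MX-style call: auxiliary (scale) matrix travels along
    return 0;
}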
@@ -68,7 +71,7 @@ private: Load3DSetFMatrixCal(1, wAlign, padList); } - __aicore__ inline void TransposeLoad(const LocalTensor &l0A, const LocalTensor &l1A, + __aicore__ inline void TransLoadDataToL0(const LocalTensor &l0A, const LocalTensor &l1A, uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset) const { // K_axis is m direction, and M_axis is k direction in load3d intrin @@ -103,7 +106,7 @@ private: } } - __aicore__ inline void NoneTransposeLoad(const LocalTensor &l0A, const LocalTensor &l1A, + __aicore__ inline void LoadDataToL0(const LocalTensor &l0A, const LocalTensor &l1A, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset) const { // format(M, K), K_axis is k direction, and M_axis is m direction in load3d intrin -- Gitee From 1c3b83813e7e498e063ae63244d225beaf2e39f9 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:41:39 +0000 Subject: [PATCH 33/56] add Signed-off-by: jiangchengcheng-on --- .../split/load_to_l0a/load_to_l0a_load2dV2.h | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2dV2.h diff --git a/impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2dV2.h b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2dV2.h new file mode 100644 index 00000000..db1700e9 --- /dev/null +++ b/impl/matmul/stage/split/load_to_l0a/load_to_l0a_load2dV2.h @@ -0,0 +1,225 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file load_to_l0a_load2dV2.h + * \brief + */ + +#ifndef IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0A_LOAD2DV2_H +#define IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0A_LOAD2DV2_H + +#include "load_to_l0a_intf.h" +#include "../load_to_l0_utils.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +template +class LoadToL0A() == GemvMode::MATRIX) && + (GetLoadInstrType() == LoadInstrType::LOAD2DV2)>> +{ + using A_T = typename A_TYPE::T; + using L0A_T = typename Conditional::value, typename GetL0DataType::Type, typename GetL0DataType::Type>::type; + using AuxDtype = decltype(GetAuxDataType()); +public: + __aicore__ inline LoadToL0A() {}; + __aicore__ inline ~LoadToL0A() {}; + + __aicore__ inline void Prepare(bool isATranspose, uint16_t aL1K, uint16_t aL1M) const {}; + __aicore__ inline void SetScalar(A_T scalar) {}; + + __aicore__ inline void Load(const LocalTensor &dst, const LocalTensor &aMatrix, + uint16_t aL1M, uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, + bool isATranspose, const LocalTensor &l1AAuxMatrix = {}, uint16_t aAuxL1K = 0, + uint16_t aAuxL1KOffset = 0) const + { + if (isATranspose) { + // Mx should run for MXLoad + if constexpr (!HasScalePosition::value) { + TransLoadDataToL0(dst, aMatrix, aL1K, madM, madK, aL1MOffset, aL1KOffset); + } else { + MxTransLoadDataToL0(dst, aMatrix, aL1K, madM, madK, aL1MOffset, aL1KOffset, l1AAuxMatrix, aAuxL1K, aAuxL1KOffset); + } + } else { + // Mx should run for MXLoad + if constexpr (!HasScalePosition::value) { + LoadDataToL0(dst, aMatrix, aL1M, madM, madK, aL1MOffset, aL1KOffset); + } else { + MxLoadDataToL0(dst, aMatrix, aL1M, madM, madK, aL1MOffset, aL1KOffset, l1AAuxMatrix, aAuxL1K, aAuxL1KOffset); + } + } + } +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); + + __aicore__ inline void TransLoadDataToL0(const LocalTensor &dst, const LocalTensor &aMatrix, + uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset) const + { + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = CeilDiv(aL1KOffset, BLOCK_CUBE); + loadDataParams.kStartPosition = CeilDiv(aL1MOffset, c0Size_); + loadDataParams.kStep = CeilDiv(madM, c0Size_); + if constexpr (IsSameType::value) { + // K step must be multiples of 2 when transpose is enabled ane .type = .b32 + loadDataParams.kStep = CeilAlign(loadDataParams.kStep, K_STEP_MIN_VAL_B32); + } + loadDataParams.srcStride = CeilDiv(aL1K, ALIGN_NUM); + loadDataParams.dstStride = CeilDiv(madM, ALIGN_NUM); + loadDataParams.ifTranspose = true; + loadDataParams.mStep = CeilDiv(madK, HW_M0); + if constexpr (IsSuppportB4()) { + // M step must be multiples of 4 when transpose is enabled and .type = .b4 + loadDataParams.mStep = CeilAlign(loadDataParams.mStep, M_STEP_MIN_VAL_B4); + } + + if constexpr (IsSuppportB8()) { + // M step must be multiples of 2 when transpose is enabled and .type = .b8 + uint16_t l0ALoop = CeilAlign(loadDataParams.mStep, M_STEP_MIN_VAL_B8) / M_STEP_MIN_VAL_B8; + uint64_t dstOffset = 0; + uint64_t dstAddrStride = CeilAlign(madM, ALIGN_NUM) * ONE_BLK_SIZE; + loadDataParams.mStep = M_STEP_MIN_VAL_B8; + uint16_t oriMstartPos = loadDataParams.mStartPosition; + // K aixs is m direction, and M aixs is k direction in load2dv2 intrin + for (uint16_t idx = 0; idx < l0ALoop; ++idx) { + loadDataParams.mStartPosition = oriMstartPos + M_STEP_MIN_VAL_B8 * idx; + LoadData(dst[dstOffset], aMatrix, loadDataParams); + dstOffset += dstAddrStride; + } + } else if constexpr (IsSameType::value) { + // in case of mdl && basek=8, the unit 
of mStartPosition is 16, so don't use it + loadDataParams.mStartPosition = 0; + loadDataParams.kStartPosition = 0; + uint64_t matrixOffset = aL1MOffset * aL1K + aL1KOffset * B32_C0SIZE; + LoadData(dst, aMatrix[matrixOffset], loadDataParams); + } else { + LoadData(dst, aMatrix, loadDataParams); + } + } + + __aicore__ inline void LoadDataToL0(const LocalTensor &dst, const LocalTensor &aMatrix, + uint16_t aL1M, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset) const + { + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = CeilDiv(aL1MOffset, BLOCK_CUBE); + loadDataParams.kStartPosition = CeilDiv(aL1KOffset, c0Size_); + loadDataParams.mStep = CeilDiv(madM, HW_M0); + loadDataParams.kStep = CeilDiv(madK, c0Size_); + loadDataParams.srcStride = CeilDiv(aL1M, ALIGN_NUM); + loadDataParams.dstStride = CeilDiv(madM, ALIGN_NUM); + loadDataParams.ifTranspose = false; + LoadData(dst, aMatrix, loadDataParams); + } + + __aicore__ inline void MxTransLoadDataToL0(const LocalTensor &dst, const LocalTensor &aMatrix, + uint16_t aL1K, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, + const LocalTensor &l1AAuxMatrix, uint16_t aAuxL1K, uint16_t aAuxL1KOffset) const + { +#if defined(__DAV_C310__) + uint16_t mStartPos = CeilDiv(aL1MOffset, ALIGN_NUM); + uint16_t mStep = CeilDiv(madM, HW_M0); + uint16_t kStep = CeilDiv(madK, c0Size_); + uint16_t srcStride = CeilDiv(aL1K, HW_M0); + uint16_t dstStride = CeilDiv(madM, HW_M0); + + uint16_t dataMStartPos = CeilDiv(aL1KOffset, ALIGN_NUM); + uint16_t dataKStartPos = CeilDiv(aL1MOffset, c0Size_); + uint16_t dataMStep = CeilDiv(madK, HW_M0); + uint16_t dataKStep = CeilDiv(madM, c0Size_); + uint16_t dataSrcStride = CeilDiv(aL1K, HW_M0); + uint16_t dataDstStride = CeilDiv(madM, HW_M0); + + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = dataMStartPos; + loadDataParams.kStartPosition = dataKStartPos; + loadDataParams.mStep = dataMStep; + loadDataParams.kStep = dataKStep; + loadDataParams.srcStride = dataSrcStride; + loadDataParams.dstStride = dataDstStride; + loadDataParams.ifTranspose = true; + + LoadData2DMxParams loadDataMxParams; + loadDataMxParams.xStartPosition = mStartPos; + loadDataMxParams.xStep = mStep; + if constexpr (SupportType()) { + uint16_t scaleKStartPos = CeilDiv(aAuxL1KOffset, FP4_TWO); + uint16_t dstScaleStride = CeilDiv(madK, c0Size_); + uint16_t srcScaleStride = CeilDiv(aAuxL1K, FP4_TWO); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = kStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = dstScaleStride; + } else if constexpr (SupportType()) { + // for FP8 ,two K0 on the k axis correspond to a small z fractal. 
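// Worked example of the FP8 scale stepping below, under assumed constant values
// (FP8_TWO == 2 and c0Size_ == 32 are assumptions for illustration only):
//   madK = 128  ->  data kStep = CeilDiv(128, 32) = 4,
//                   scaleKStep = CeilDiv(128, 32 * 2) = 2,
// i.e. one scale step spans two K0 blocks of data, consistent with the comment above.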
+ uint16_t scaleKStartPos = CeilDiv(aAuxL1KOffset, FP8_TWO); + uint16_t scaleKStep = CeilDiv(madK, c0Size_* FP8_TWO); + uint16_t srcScaleStride = CeilDiv(aAuxL1K, FP8_TWO); + uint16_t dstScaleStride = CeilDiv(madK, c0Size_* FP8_TWO); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = scaleKStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = dstScaleStride; + } + LoadData(dst, aMatrix, l1AAuxMatrix, loadDataParams, loadDataMxParams); +#endif + } + + __aicore__ inline void MxLoadDataToL0(const LocalTensor &dst, const LocalTensor &aMatrix, + uint16_t aL1M, uint16_t madM, uint16_t madK, uint16_t aL1MOffset, uint16_t aL1KOffset, + const LocalTensor &l1AAuxMatrix, uint16_t aAuxL1K, uint16_t aAuxL1KOffset) const + { +#if defined(__DAV_C310__) + uint16_t mStartPos = CeilDiv(aL1MOffset, BLOCK_CUBE); + uint16_t kStartPos = CeilDiv(aL1KOffset, c0Size_); + uint16_t mStep = CeilDiv(madM, HW_M0); + uint16_t kStep = CeilDiv(madK, c0Size_); + uint16_t srcStride = CeilDiv(aL1M, HW_M0); + uint16_t dstStride = CeilDiv(madM, HW_M0); + + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = mStartPos; + loadDataParams.kStartPosition = kStartPos; + loadDataParams.mStep = mStep; + loadDataParams.kStep = kStep; + loadDataParams.srcStride = srcStride; + loadDataParams.dstStride = dstStride; + + LoadData2DMxParams loadDataMxParams; + loadDataMxParams.xStartPosition = mStartPos; + loadDataMxParams.xStep = mStep; + if constexpr (SupportType()) { + uint16_t scaleKStartPos = CeilDiv(aAuxL1KOffset, FP4_TWO); + uint16_t srcScaleStride = CeilDiv(aAuxL1K, FP4_TWO); + uint16_t dstScaleStride = CeilDiv(madK, c0Size_); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = kStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = dstScaleStride; + } else if constexpr (SupportType()) { + // for FP8 ,two K0 on the k axis correspond to a small z fractal. 
+ uint16_t scaleKStartPos = CeilDiv(aAuxL1KOffset, FP8_TWO); + uint16_t scaleKStep = CeilDiv(madK, c0Size_ * FP8_TWO); + uint16_t srcScaleStride = CeilDiv(aAuxL1K, FP8_TWO); + uint16_t dstScaleStride = CeilDiv(madK, c0Size_ * FP8_TWO); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = scaleKStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = dstScaleStride; + } + LoadData(dst, aMatrix, l1AAuxMatrix, loadDataParams, loadDataMxParams); +#endif + } +}; + +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0A_LOAD2DV2_H \ No newline at end of file -- Gitee From 5897cffe4f16e221a2a3526d881005e2ed9b1c1c Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:42:41 +0000 Subject: [PATCH 34/56] add Signed-off-by: jiangchengcheng-on --- .../stage/split/load_to_l0b/load_to_l0b.h | 1 + .../split/load_to_l0b/load_to_l0b_basic.h | 4 +++- .../split/load_to_l0b/load_to_l0b_intf.h | 9 ++++++++- .../split/load_to_l0b/load_to_l0b_load2d.h | 4 +++- .../split/load_to_l0b/load_to_l0b_loadInstr.h | 19 +++++++++++-------- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/impl/matmul/stage/split/load_to_l0b/load_to_l0b.h b/impl/matmul/stage/split/load_to_l0b/load_to_l0b.h index b24c7228..f8095c4e 100644 --- a/impl/matmul/stage/split/load_to_l0b/load_to_l0b.h +++ b/impl/matmul/stage/split/load_to_l0b/load_to_l0b.h @@ -18,5 +18,6 @@ #include "load_to_l0b_loadInstr.h" #include "load_to_l0b_basic.h" #include "load_to_l0b_load2d.h" +#include "load_to_l0b_load2dV2.h" #endif // IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0B_H \ No newline at end of file diff --git a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_basic.h b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_basic.h index 920466c1..dc8ba060 100644 --- a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_basic.h +++ b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_basic.h @@ -28,6 +28,7 @@ class LoadToL0B::IsSupportLoad2dV2()>> { using TransT = typename INPUT_TYPE::TRANS_T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0B() {}; __aicore__ inline ~LoadToL0B() {}; @@ -41,7 +42,8 @@ public: __aicore__ inline void Load(const LocalTensor &l0B, const LocalTensor &l1B, uint16_t bL1N, uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, - bool isBTranspose, const LocalTensor &l1BIndexMatrix = NULL_TENSOR) const + bool isBTranspose, const LocalTensor &l1BAuxMatrix = NULL_TENSOR, uint16_t bAuxL1K = 0, + uint16_t bAuxL1KOffset = 0) const { constexpr uint16_t typeSize = sizeof(TransT); if (isBTranspose) { diff --git a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_intf.h b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_intf.h index 9ee8db54..c7397a5f 100644 --- a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_intf.h +++ b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_intf.h @@ -18,6 +18,8 @@ #include "../load_to_l0_utils.h" +#include "../load_to_l0_utils.h" + namespace AscendC { namespace Impl { namespace Detail { @@ -30,6 +32,7 @@ template class LoadToL0B { using TransT = typename INPUT_TYPE::TRANS_T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0B() = default; @@ -54,11 +57,15 @@ public: * @param: bL1NOffset: Offset of the basic block relative to the original bMatrix in the n direction * @param: bL1KOffset: Offset of the basic block relative to the original bMatrix in the k direction * @param: isBTranspose: B matrix 
transpose status + * @param: l1BAuxMatrix: B auxiliary matrix in L1 + * @param: bAuxL1K: the length of K_axis for original auxiliary matrix in L1 + * @param: bAuxL1KOffset: Offset of the basic block relative to the original auxiliary matrix in the k direction * @return: void */ __aicore__ inline void Load(const LocalTensor &l0B, const LocalTensor &l1B, uint16_t bL1N, uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, - bool isBTranspose, const LocalTensor &l1BIndexMatrix = {}) const {}; + bool isBTranspose, const LocalTensor &l1BAuxMatrix = {}, uint16_t bAuxL1K = 0, + uint16_t bAuxL1KOffset = 0) const {}; }; } // namespace Detail diff --git a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2d.h b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2d.h index fbe6b17e..976170c0 100644 --- a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2d.h +++ b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2d.h @@ -27,6 +27,7 @@ class LoadToL0B() == LoadInstrType::LOAD2D)>> { using B_T = typename INPUT_TYPE::T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0B() {}; __aicore__ inline ~LoadToL0B() {}; @@ -35,7 +36,8 @@ public: __aicore__ inline void Load(const LocalTensor &dst, const LocalTensor &bMatrix, uint16_t bL1N, uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, - bool isBTranspose, const LocalTensor &l1BIndexMatrix = {}) const + bool isBTranspose, const LocalTensor &l1BAuxMatrix = {}, uint16_t bAuxL1K = 0, + uint16_t bAuxL1KOffset = 0) const { uint16_t blockUseN = Ceil(madN, BLOCK_CUBE); uint16_t blockUseK = Ceil(madK, c0Size_); diff --git a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_loadInstr.h b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_loadInstr.h index cfc3d057..37dd8b5f 100644 --- a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_loadInstr.h +++ b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_loadInstr.h @@ -24,10 +24,12 @@ namespace Detail { template class LoadToL0B::IsSupportLoad3dV2()>> + (MatmulFeatureTrait::IsSupportLoad3dV2()) && + !(MatmulFeatureTrait::IsSupportLoad2dV2())>> { using B_T = typename INPUT_TYPE::T; using TransT = typename INPUT_TYPE::TRANS_T; + using AuxDtype = decltype(GetAuxDataType()); public: __aicore__ inline LoadToL0B() {}; __aicore__ inline ~LoadToL0B() {}; @@ -41,16 +43,17 @@ public: __aicore__ inline void Load(const LocalTensor &l0B, const LocalTensor &l1B, uint16_t bL1N, uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, - bool isBTranspose, const LocalTensor &l1BIndexMatrix = {}) const + bool isBTranspose, const LocalTensor &l1BAuxMatrix = {}, uint16_t bAuxL1K = 0, + uint16_t bAuxL1KOffset = 0) const { if constexpr (MatmulFeatureTrait::IsNeedUB() && IsSameType::value && IsSameType::value) { isBTranspose = true; } if (isBTranspose) { - TransposeLoad(l0B, l1B, bL1N, madN, madK, bL1NOffset, bL1KOffset, l1BIndexMatrix); + TransLoadDataToL0(l0B, l1B, bL1N, madN, madK, bL1NOffset, bL1KOffset, l1BAuxMatrix); } else { - NoneTransposeLoad(l0B, l1B, bL1N, bL1K, madN, madK, bL1NOffset, bL1KOffset, isBTranspose); + LoadDataToL0(l0B, l1B, bL1N, bL1K, madN, madK, bL1NOffset, bL1KOffset, isBTranspose); } } @@ -74,8 +77,8 @@ private: } } - __aicore__ inline void TransposeLoad(const LocalTensor &l0B, const LocalTensor &l1B, - uint16_t bL1N, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, const LocalTensor &l1BIndexMatrix = {}) const + __aicore__ inline void TransLoadDataToL0(const 
LocalTensor &l0B, const LocalTensor &l1B, + uint16_t bL1N, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, const LocalTensor &l1BAuxMatrix = {}) const { // SET LOAD2D parameters , loop axis: K or M, or 1 if constexpr (HasSparseIndex()) { @@ -120,7 +123,7 @@ private: uint64_t l0bOffset = 0; for (uint64_t i = 0; i < l0bLoop; i++) { if constexpr (HasSparseIndex()) { - LoadDataWithSparse(l0B[l0bOffset], l1B[l1bOffset], l1BIndexMatrix[l1bOffset >> INDEX_SHIFT], loadDataParams); + LoadDataWithSparse(l0B[l0bOffset], l1B[l1bOffset], l1BAuxMatrix[l1bOffset >> INDEX_SHIFT], loadDataParams); } else { LoadData(l0B[l0bOffset], l1B[l1bOffset], loadDataParams); } @@ -129,7 +132,7 @@ private: } } - __aicore__ inline void NoneTransposeLoad(const LocalTensor &l0B, const LocalTensor &l1B, + __aicore__ inline void LoadDataToL0(const LocalTensor &l0B, const LocalTensor &l1B, uint16_t bL1N, uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, bool isBTranspose) const { -- Gitee From ff4deb7e9b612e788be020635838f66203df6af5 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:43:10 +0000 Subject: [PATCH 35/56] add Signed-off-by: jiangchengcheng-on --- .../split/load_to_l0b/load_to_l0b_load2dV2.h | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2dV2.h diff --git a/impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2dV2.h b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2dV2.h new file mode 100644 index 00000000..13ba6799 --- /dev/null +++ b/impl/matmul/stage/split/load_to_l0b/load_to_l0b_load2dV2.h @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file load_to_l0b_load2dV2.h + * \brief + */ + +#ifndef IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0B_LOAD2DV2_H +#define IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0B_LOAD2DV2_H + +#include "load_to_l0b_intf.h" +#include "../load_to_l0_utils.h" + +namespace AscendC { +namespace Impl { +namespace Detail { +template +class LoadToL0B() == GemvMode::MATRIX) && + (GetLoadInstrType() == LoadInstrType::LOAD2DV2)>> +{ + using B_T = typename INPUT_TYPE::T; + using L0B_T = typename Conditional::value, typename GetL0DataType::Type, typename GetL0DataType::Type>::type; + using AuxDtype = decltype(GetAuxDataType()); +public: + __aicore__ inline LoadToL0B() {}; + __aicore__ inline ~LoadToL0B() {}; + + __aicore__ inline void Load(const LocalTensor &dst, const LocalTensor &bMatrix, + uint16_t bL1N, uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, + bool isBTranspose, const LocalTensor &l1BAuxMatrix = {}, uint16_t bAuxL1K = 0, + uint16_t bAuxL1KOffset = 0) const + { + if (isBTranspose) { + // Mx should run for MXLoad + if constexpr (!HasScalePosition::value) { + TransLoadDataToL0(dst, bMatrix, bL1N, madN, madK, bL1NOffset, bL1KOffset); + } else { + MxTransLoadDataToL0(dst, bMatrix, bL1N, madN, madK, bL1NOffset, bL1KOffset, l1BAuxMatrix, bAuxL1K, bAuxL1KOffset); + } + } else { + // Mx should run for MXLoad + if constexpr (!HasScalePosition::value) { + LoadDataToL0(dst, bMatrix, bL1K, madN, madK, bL1NOffset, bL1KOffset); + } else { + MxLoadDataToL0(dst, bMatrix, bL1K, madN, madK, bL1NOffset, bL1KOffset, l1BAuxMatrix, bAuxL1K, bAuxL1KOffset); + } + } + } + + __aicore__ inline void Prepare(bool isBTranspose, uint16_t bL1K) const {}; +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); + + __aicore__ inline void TransLoadDataToL0(const LocalTensor &dst, const LocalTensor &bMatrix, + uint16_t bL1N, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset) const + { + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = CeilDiv(bL1NOffset, BLOCK_CUBE); + loadDataParams.kStartPosition = CeilDiv(bL1KOffset, c0Size_); + loadDataParams.mStep = CeilDiv(madN, HW_M0); + loadDataParams.kStep = CeilDiv(madK, c0Size_); + loadDataParams.srcStride = CeilDiv(bL1N, ALIGN_NUM); + loadDataParams.dstStride = CeilDiv(madN, ALIGN_NUM); + loadDataParams.ifTranspose = false; + LoadData(dst, bMatrix, loadDataParams); + } + + __aicore__ inline void LoadDataToL0(const LocalTensor &dst, const LocalTensor &bMatrix, + uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset) const + { + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = CeilDiv(bL1KOffset, BLOCK_CUBE); + loadDataParams.kStartPosition = CeilDiv(bL1NOffset, c0Size_); + loadDataParams.kStep = CeilDiv(madN, c0Size_); + if constexpr (IsSameType::value) { + // K step must be multiples of 2 when transpose is enabled ane .type = .b32 + loadDataParams.kStep = CeilAlign(loadDataParams.kStep, K_STEP_MIN_VAL_B32); + } + loadDataParams.srcStride = CeilDiv(bL1K, ALIGN_NUM); + loadDataParams.dstStride = CeilDiv(madN, ALIGN_NUM); + loadDataParams.ifTranspose = true; + loadDataParams.mStep = CeilDiv(madK, HW_M0); + + if constexpr (IsSuppportB4()) { + // M step must be multiples of 4 when transpose is enabled and .type = .b4 + loadDataParams.mStep = CeilAlign(loadDataParams.mStep, M_STEP_MIN_VAL_B4); + } + + if constexpr (IsSuppportB8()) { + // M step must be multiples of 2 when transpose is enabled and .type = .b8 + uint16_t l0BLoop = CeilAlign(loadDataParams.mStep, 
M_STEP_MIN_VAL_B8) / M_STEP_MIN_VAL_B8; + uint64_t dstOffset = 0; + uint64_t dstAddrStride = CeilAlign(madN, ALIGN_NUM) * ONE_BLK_SIZE; + loadDataParams.mStep = M_STEP_MIN_VAL_B8; + uint16_t oriMstartPos = loadDataParams.mStartPosition; + // K aixs is m direction, and M aixs is k direction in load2dv2 intrin + for (uint16_t idx = 0; idx < l0BLoop; ++idx) { + loadDataParams.mStartPosition = oriMstartPos + M_STEP_MIN_VAL_B8 * idx; + LoadData(dst[dstOffset], bMatrix, loadDataParams); + dstOffset += dstAddrStride; + } + } else if constexpr (IsSameType::value) { + // in case of mdl && basek=8, the unit of mStartPosition is 16, so don't use it + loadDataParams.mStartPosition = 0; + loadDataParams.kStartPosition = 0; + uint64_t matrixOffset = bL1NOffset * bL1K + bL1KOffset * B32_C0SIZE; + LoadData(dst, bMatrix[matrixOffset], loadDataParams); + } else { + LoadData(dst, bMatrix, loadDataParams); + } + } + + __aicore__ inline void MxTransLoadDataToL0(const LocalTensor &dst, const LocalTensor &bMatrix, + uint16_t bL1N, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, + const LocalTensor &l1BAuxMatrix, uint16_t bAuxL1K, uint16_t bAuxL1KOffset) const + { +#if defined(__DAV_C310__) + uint16_t mStartPos = CeilDiv(bL1NOffset, BLOCK_CUBE); + uint16_t kStartPos = CeilDiv(bL1KOffset, c0Size_); + uint8_t mStep = CeilDiv(madN, HW_M0); + uint8_t kStep = CeilDiv(madK, c0Size_); + uint16_t srcStride = CeilDiv(bL1N, HW_M0); + uint16_t dstStride = CeilDiv(madN, HW_M0); + + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = mStartPos; + loadDataParams.kStartPosition = kStartPos; + loadDataParams.mStep = mStep; + loadDataParams.kStep = kStep; + loadDataParams.srcStride = srcStride; + loadDataParams.dstStride = dstStride; + + LoadData2DMxParams loadDataMxParams; + loadDataMxParams.xStartPosition = mStartPos; + loadDataMxParams.xStep = mStep; + if constexpr (SupportType()) { + uint16_t scaleKStartPos = CeilDiv(bAuxL1KOffset, FP4_TWO); + uint16_t srcScaleStride = CeilDiv(bAuxL1K, FP4_TWO); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = kStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = kStep; + } else if constexpr (SupportType()) { + // for FP8 ,two K0 on the k axis correspond to a small z fractal. 
+ uint16_t scaleKStartPos = CeilDiv(bAuxL1KOffset, FP8_TWO); + uint8_t scaleKStep = CeilDiv(madK, c0Size_* FP8_TWO); + uint16_t srcScaleStride = CeilDiv(bAuxL1K, FP8_TWO); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = scaleKStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = scaleKStep; + } + LoadData(dst, bMatrix, l1BAuxMatrix, loadDataParams, loadDataMxParams); +#endif + } + + __aicore__ inline void MxLoadDataToL0(const LocalTensor &dst, const LocalTensor &bMatrix, + uint16_t bL1K, uint16_t madN, uint16_t madK, uint16_t bL1NOffset, uint16_t bL1KOffset, + const LocalTensor &l1BAuxMatrix, uint16_t bAuxL1K, uint16_t bAuxL1KOffset) const + { +#if defined(__DAV_C310__) + uint16_t mStartPos = CeilDiv(bL1NOffset, BLOCK_CUBE); + uint8_t mStep = CeilDiv(madN, HW_M0); + uint8_t kStep = CeilDiv(madK, c0Size_); + uint16_t srcStride = CeilDiv(bL1K, HW_M0); + uint16_t dstStride = CeilDiv(madN, HW_M0); + + uint16_t dataMStartPos = CeilDiv(bL1KOffset, BLOCK_CUBE); + uint16_t dataKStartPos = CeilDiv(bL1NOffset, c0Size_); + uint8_t dataMStep = CeilDiv(madK, HW_M0); + uint8_t dataKStep = CeilDiv(madN, c0Size_); + uint16_t dataSrcStride = CeilDiv(bL1K, HW_M0); + uint16_t dataDstStride = CeilDiv(madN, HW_M0); + + LoadData2DParamsV2 loadDataParams; + loadDataParams.mStartPosition = dataMStartPos; + loadDataParams.kStartPosition = dataKStartPos; + loadDataParams.mStep = dataMStep; + loadDataParams.kStep = dataKStep; + loadDataParams.srcStride = dataSrcStride; + loadDataParams.dstStride = dataDstStride; + + loadDataParams.ifTranspose = true; + + LoadData2DMxParams loadDataMxParams; + loadDataMxParams.xStartPosition = mStartPos; + loadDataMxParams.xStep = mStep; + if constexpr (SupportType()) { + uint16_t scaleKStartPos = CeilDiv(bAuxL1KOffset, FP4_TWO); + uint16_t srcScaleStride = CeilDiv(bAuxL1K, FP4_TWO); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = kStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = kStep; + } else if constexpr (SupportType()) { + // for FP8 ,two K0 on the k axis correspond to a small z fractal. 
+ uint16_t scaleKStartPos = CeilDiv(bAuxL1KOffset, FP8_TWO); + uint8_t scaleKStep = CeilDiv(madK, c0Size_* FP8_TWO); + uint16_t srcScaleStride = CeilDiv(bAuxL1K, FP8_TWO); + loadDataMxParams.yStartPosition = scaleKStartPos; + loadDataMxParams.yStep = scaleKStep; + loadDataMxParams.srcStride = srcScaleStride; + loadDataMxParams.dstStride = scaleKStep; + } + LoadData(dst, bMatrix, l1BAuxMatrix, loadDataParams, loadDataMxParams); +#endif + } +}; + +} // namespace Detail +} // namespace Impl +} // namespace AscendC +#endif // IMPL_MATMUL_STAGE_SPLIT_LOAD_TO_L0B_LOAD2DV2_H \ No newline at end of file -- Gitee From d980b3ab2ec877fce03c485ff0c1abb08e1943f8 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:45:21 +0000 Subject: [PATCH 36/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/stage/split/load_to_l0_utils.h | 18 +++ .../tiling/matmul_constant_tiling_impl.h | 134 +++++++++++++++--- .../tiling/matmul_constant_tiling_struct.h | 22 +++ .../tiling/matmul_constant_tiling_utils.h | 81 ++++++++--- 4 files changed, 216 insertions(+), 39 deletions(-) diff --git a/impl/matmul/stage/split/load_to_l0_utils.h b/impl/matmul/stage/split/load_to_l0_utils.h index 3402ff0c..e2ebb702 100644 --- a/impl/matmul/stage/split/load_to_l0_utils.h +++ b/impl/matmul/stage/split/load_to_l0_utils.h @@ -28,6 +28,9 @@ constexpr uint64_t M_POS_BIT = 48; constexpr uint64_t K_POS_BIT = 32; constexpr uint64_t M_STEP_BIT = 16; constexpr uint8_t INDEX_SHIFT = 2; +constexpr uint8_t M_STEP_MIN_VAL_B8 = 2; +constexpr uint8_t M_STEP_MIN_VAL_B4 = 4; +constexpr uint8_t K_STEP_MIN_VAL_B32 = 2; constexpr uint8_t padList[4] = {0, 0, 0, 0}; enum class LoadInstrType { @@ -94,6 +97,21 @@ __aicore__ inline constexpr LoadL0bInstrType GetLoadL0bInstrType() return LoadL0bInstrType::LOAD2D; } +template +__aicore__ inline constexpr auto GetAuxDataType() +{ + // Mx auxData is fp8, sparse auxData is uint8_t + if constexpr (HasSparseIndex()) { + uint8_t auxData = 0; + return auxData; + } else if constexpr (HasScalePosition::value) { + float8_e8m0_t mxType = 0; + return mxType; + } else { + uint8_t defaultData = 0; + return defaultData; + } +} } // namespace Detail } // namespace Impl } // namespace AscendC diff --git a/impl/matmul/tiling/matmul_constant_tiling_impl.h b/impl/matmul/tiling/matmul_constant_tiling_impl.h index 3195ffc7..e3ff4fdc 100644 --- a/impl/matmul/tiling/matmul_constant_tiling_impl.h +++ b/impl/matmul/tiling/matmul_constant_tiling_impl.h @@ -33,7 +33,8 @@ __aicore__ constexpr L1Status GetL1StatusBothFullLoad(const MatmulConfig &mmCFG, int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); if (GetL1Size(l1Status, mmCFG) <= l1Size) { - int32_t loadSize = (PhyPosIsL1(A_TYPE::pos) ? 0 : m) + (PhyPosIsL1(B_TYPE::pos) ? 0 : n); + int32_t loadSize = (PhyPosIsL1(A_TYPE::pos) ? 0 : m) + + (PhyPosIsL1(B_TYPE::pos) ? 
0 : n); return {kAL1, kBL1, GetMaxMAL1(mmCFG), GetMaxNBL1(mmCFG), 1, 1, loadSize}; } return {0, 0, 0, 0, 0, 0, INT32_MAX}; @@ -58,23 +59,26 @@ __aicore__ constexpr L1Status GetL1StatusAL1FullLoad(const MatmulConfig &mmCFG, } int32_t kaAlignValue = GetKAAlignValue(); int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); - int32_t aL1Size = MaxValue(Align(k, kaAlignValue), Align(kAL1, kaAlignValue)) * - MaxValue(maxMAL1 * mmCFG.basicM, m * Impl::HW_C0) * Impl::C0_BYTE_SIZE; + // be consistent with initbuffer + int32_t aL1Size = MaxValue(maxMAL1 * mmCFG.basicM, m * Impl::HW_C0) * + CeilNoLog(k, kL0) * Align(mmCFG.basicK, static_cast(kaAlignValue * reduceC0Size)) * + GetBitSize() / ONE_BYTE_BIT_SIZE; int32_t bL1Size = PhyPosIsL1(A_TYPE::pos) ? l1Size : l1Size - aL1Size; l1Status.dbBL1 = Impl::DB_ON; - l1Status.dbBL1 = GetL1Size(l1Status, mmCFG) > l1Size ? - Impl::DB_OFF : l1Status.dbBL1; + if (GetL1Size(l1Status, mmCFG) > l1Size) { + l1Status.dbBL1 = Impl::DB_OFF; + } int32_t biasSize = GetBiasL1Size(l1Status, mmCFG); int32_t dequantSize = GetDeQuantSize(l1Status, mmCFG); int32_t kbAlignValue = GetKBAlignValue(); - l1Status.kBL1 = MinValue(CalcL1MaxLen((bL1Size - biasSize - dequantSize), + l1Status.kBL1 = MinValue(CalcL1MaxLen((bL1Size - biasSize - dequantSize), l1Status, mmCFG, kbAlignValue, L1TilingType::KBL1_16), k); int32_t bL1Times = MinValue(l1Status.kBL1 / kL0, GetMaxKBL1(mmCFG)); int32_t aL1Times = CeilNoLog(k, kL0); bL1Times = GetNearestFactor(aL1Times, bL1Times); l1Status.kBL1 = bL1Times * kL0; if (l1Status.kBL1 == k) { - l1Status.nBL1 = MinValue(CalcL1MaxLen(bL1Size, l1Status, mmCFG, + l1Status.nBL1 = MinValue(CalcL1MaxLen(bL1Size, l1Status, mmCFG, kbAlignValue, L1TilingType::N_BL1), GetMaxNBL1(mmCFG)); int32_t nRepeat = CeilNoLog(mmCFG.singleCoreN, mmCFG.basicN); l1Status.nBL1 = GetNearestFactor(nRepeat, l1Status.nBL1); @@ -109,21 +113,25 @@ __aicore__ constexpr L1Status GetL1StatusBL1FullLoad(const MatmulConfig &mmCFG, } int32_t kbAlignValue = GetKBAlignValue(); int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); - int32_t bL1Size = MaxValue(Align(k, kbAlignValue), Align(kBL1, kbAlignValue)) * - MaxValue(maxNBL1 * mmCFG.basicN, n * Impl::HW_C0) * Impl::C0_BYTE_SIZE; + // be consistent with initbuffer + int32_t bL1Size = MaxValue(maxNBL1 * mmCFG.basicN, n * Impl::HW_C0) * + CeilNoLog(k, kL0) * Align(mmCFG.basicK, static_cast(kbAlignValue * reduceC0Size)) * + GetBitSize() / ONE_BYTE_BIT_SIZE; int32_t aL1Size = PhyPosIsL1(B_TYPE::pos) ? l1Size : l1Size - bL1Size; l1Status.dbAL1 = Impl::DB_ON; - l1Status.dbAL1 = GetL1Size(l1Status, mmCFG) > l1Size ? 
- Impl::DB_OFF : l1Status.dbAL1; + if (GetL1Size(l1Status, mmCFG) > l1Size) { + l1Status.dbAL1 = Impl::DB_OFF; + } int32_t biasSize = GetBiasL1Size(l1Status, mmCFG); int32_t dequantSize = GetDeQuantSize(l1Status, mmCFG); int32_t kaAlignValue = GetKAAlignValue(); - l1Status.kAL1 = MinValue(CalcL1MaxLen((aL1Size - biasSize - dequantSize), + l1Status.kAL1 = MinValue(CalcL1MaxLen((aL1Size - biasSize - dequantSize), l1Status, mmCFG, kaAlignValue, L1TilingType::KAL1_16), k); int32_t aL1Times = MinValue(l1Status.kAL1 / kL0, GetMaxKAL1(mmCFG)); + aL1Times = GetNearestFactor(CeilNoLog(k, kL0), aL1Times); l1Status.kAL1 = aL1Times * kL0; if (l1Status.kAL1 == k) { - l1Status.mAL1 = MinValue(CalcL1MaxLen(aL1Size - biasSize, l1Status, mmCFG, + l1Status.mAL1 = MinValue(CalcL1MaxLen(aL1Size - biasSize, l1Status, mmCFG, kaAlignValue, L1TilingType::M_AL1), GetMaxMAL1(mmCFG)); int32_t mRepeat = CeilNoLog(mmCFG.singleCoreM, mmCFG.basicM); l1Status.mAL1 = GetNearestFactor(mRepeat, l1Status.mAL1); @@ -151,12 +159,12 @@ __aicore__ constexpr L1Status GetL1StatusMFirst(const L1Status &l1Status, const int32_t kbAlignValue = GetKBAlignValue(); int32_t biasSize = GetBiasL1Size(l1MFirst, mmCFG); int32_t dequantSize = GetDeQuantSize(l1MFirst, mmCFG); - l1MFirst.mAL1 = MaxValue(MinValue(CalcL1MaxLen(aL1Size - biasSize - dequantSize, + l1MFirst.mAL1 = MaxValue(MinValue(CalcL1MaxLen(aL1Size - biasSize - dequantSize, l1MFirst, mmCFG, kaAlignValue, L1TilingType::M_AL1), GetMaxMAL1(mmCFG), mRepeat), 1); l1MFirst.mAL1 = GetNearestFactor(mRepeat, l1MFirst.mAL1); aL1Size = GetAL1Size(l1MFirst, mmCFG); bL1Size = l1Size - aL1Size; - l1MFirst.nBL1 = MaxValue(MinValue(CalcL1MaxLen(bL1Size - biasSize - dequantSize, + l1MFirst.nBL1 = MaxValue(MinValue(CalcL1MaxLen(bL1Size - biasSize - dequantSize, l1MFirst, mmCFG, kbAlignValue, L1TilingType::N_BL1), GetMaxNBL1(mmCFG), nRepeat), 1); l1MFirst.nBL1 = GetNearestFactor(mRepeat, l1MFirst.nBL1); int32_t mL0 = GetML0(mmCFG); @@ -177,13 +185,13 @@ __aicore__ constexpr L1Status GetL1StatusNFirst(const L1Status &l1Status, const int32_t kbAlignValue = GetKBAlignValue(); int32_t biasSize = GetBiasL1Size(l1NFirst, mmCFG); int32_t dequantSize = GetDeQuantSize(l1NFirst, mmCFG); - l1NFirst.nBL1 = MaxValue(MinValue(CalcL1MaxLen(bL1Size - biasSize - dequantSize, + l1NFirst.nBL1 = MaxValue(MinValue(CalcL1MaxLen(bL1Size - biasSize - dequantSize, l1Status, mmCFG, kbAlignValue, L1TilingType::N_BL1), GetMaxNBL1(mmCFG), nRepeat), 1); l1NFirst.nBL1 = GetNearestFactor(nRepeat, l1NFirst.nBL1); bL1Size = GetBL1Size(l1NFirst, mmCFG); aL1Size = l1Size - bL1Size; int32_t kaAlignValue = GetKAAlignValue(); - l1NFirst.mAL1 = MaxValue(MinValue(CalcL1MaxLen(aL1Size - biasSize - dequantSize, + l1NFirst.mAL1 = MaxValue(MinValue(CalcL1MaxLen(aL1Size - biasSize - dequantSize, l1NFirst, mmCFG, kaAlignValue, L1TilingType::M_AL1), GetMaxMAL1(mmCFG), mRepeat), 1); l1NFirst.mAL1 = GetNearestFactor(mRepeat, l1NFirst.mAL1); l1NFirst.nBL1 = GetNearestFactor(mRepeat, l1NFirst.nBL1); @@ -253,9 +261,10 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoadForNZ(const L1Status &l1Nz, if (GetL1Size(l1Status, mmCFG) <= l1Size) { int32_t bL1Size = GetBL1Size(l1Status, mmCFG); int32_t aL1Size = l1Size - bL1Size; - l1Status.kAL1 = MinValue(CalcL1MaxLen((aL1Size - biasSize - dequantSize), l1Status, + l1Status.kAL1 = MinValue(CalcL1MaxLen((aL1Size - biasSize - dequantSize), l1Status, mmCFG, kaAlignValue, L1TilingType::KAL1_16), k); aL1Times = MaxValue(MinValue(l1Status.kAL1 / kL0, maxMAL1), 1); + aL1Times = GetNearestFactor(CeilNoLog(k, kL0), 
aL1Times); l1Status.kAL1 = aL1Times * kL0; } else { // when NeitherFullLoadMN change the nBL1 and mAL1 @@ -269,7 +278,8 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoadForNZ(const L1Status &l1Nz, GetTypeSize(); int32_t bL1 = l1Status.nBL1 * mmCFG.basicN * bAlignedPerK * l1Status.dbBL1 * GetTypeSize(); - if (IsSameTypeV && (aL1 + bL1 + dequantSize + biasSize) > l1Size) { + if (IsSameTypeV && + (aL1 + bL1 + dequantSize + biasSize) > l1Size) { perK -= 1; } int32_t perTimes = MinValue(perK / kL0, MaxValue(GetMaxKAL1(mmCFG), @@ -307,13 +317,14 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoad(const L1Status &l1Db, l1Status.kAL1 = MinValue((aL1Size - biasSize - dequantSize) / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE), k); aL1Times = MaxValue(l1Status.kAL1 / kL0, 1); + aL1Times = GetNearestFactor(CeilNoLog(k, kL0), aL1Times); l1Status.kAL1 = aL1Times * kL0; aL1Size = l1Status.kAL1 * l1Status.mAL1 * mmCFG.basicM * Impl::C0_BYTE_SIZE * l1Status.dbAL1; bL1Size = l1Size - aL1Size; l1Status.kBL1 = MinValue((bL1Size - dequantSize - biasSize) / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * mmCFG.basicK * kL0 * GetBitSize() / ONE_BYTE_BIT_SIZE), k); int32_t bL1Times = MaxValue(MinValue(l1Status.kBL1 / kL0, GetMaxKBL1(mmCFG)), 1); - bL1Times = GetNearestFactor(aL1Times, bL1Times); + bL1Times = GetNearestFactor(CeilNoLog(k, kL0), bL1Times); l1Status.kBL1 = bL1Times * kL0; } else if (kMaxAxis == 2) { // first get k_bl1, second get k_al1 @@ -330,6 +341,7 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoad(const L1Status &l1Db, l1Status.kAL1 = MinValue((aL1Size - dequantSize - biasSize) / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * mmCFG.basicK * kL0 * GetBitSize() / ONE_BYTE_BIT_SIZE), k); aL1Times = MaxValue(MinValue(l1Status.kAL1 / kL0, GetMaxKAL1(mmCFG)), 1); + aL1Times = GetNearestFactor(CeilNoLog(k, kL0), aL1Times); l1Status.kAL1 = aL1Times * kL0; } return l1Status; @@ -347,19 +359,25 @@ __aicore__ constexpr L1Status GetL1StatusNeitherFullLoad(const MatmulConfig &mmC l1Status = GetKL1NeitherFullLoad(l1Status, mmCFG, l1Size, biasSize, dequantSize); if (l1Status.kAL1 > l1Status.kBL1 && l1Status.kAL1 % l1Status.kBL1 != 0) { - while (l1Status.kAL1 % l1Status.kBL1 != 0 || (l1Status.kAL1 != l1Status.kBL1 && k % l1Status.kAL1 != 0)) { + while (l1Status.kAL1 % l1Status.kBL1 != 0 || + (l1Status.kAL1 != l1Status.kBL1 && k % l1Status.kAL1 != 0)) { l1Status.kAL1 -= 1; } } if (l1Status.kAL1 < l1Status.kBL1 && l1Status.kBL1 % l1Status.kAL1 != 0) { - while (l1Status.kBL1 % l1Status.kAL1 != 0 || (l1Status.kAL1 != l1Status.kBL1 && k % l1Status.kBL1 != 0)) { + while (l1Status.kBL1 % l1Status.kAL1 != 0 || + (l1Status.kAL1 != l1Status.kBL1 && k % l1Status.kBL1 != 0)) { l1Status.kBL1 -= 1; } } auto l1MFirst = GetL1StatusMFirst(l1Status, mmCFG, l1Size); auto l1NFirst = GetL1StatusNFirst(l1Status, mmCFG, l1Size); if (l1Status.kAL1 >= k && l1Status.kBL1 >= k) { - l1Status = l1NFirst.loadSize > l1MFirst.loadSize ? 
l1MFirst : l1NFirst; + if (l1NFirst.loadSize > l1MFirst.loadSize) { + l1Status = l1MFirst; + } else { + l1Status = l1NFirst; + } } if (l1Status.kAL1 >= k && l1Status.kBL1 < k) { l1Status.nBL1 = 1; @@ -459,5 +477,75 @@ __aicore__ constexpr bool CalcAL1FullLoadTiling(int32_t l1Size, MatmulApiStaticT tiling.stepKb = stepKb; return true; } + +template +__aicore__ constexpr MxScaleStatus GetMxScaleFactor(const MatmulApiStaticTiling &tiling, int32_t l1Size) +{ + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using SrcBiasT = typename BIAS_TYPE::T; + MxScaleStatus mxScaleFactor{ 1, 1, 0 }; + + int dataUsedL1Size = tiling.depthA1 * tiling.baseM * tiling.baseK * GetBitSize() / ONE_BYTE_BIT_SIZE + + tiling.depthB1 * tiling.baseN * tiling.baseK * GetBitSize() / ONE_BYTE_BIT_SIZE; + int bias = tiling.isBias ? 1 : 0; + int biasUsedL1Size = bias * tiling.baseN * GetBitSize() / ONE_BYTE_BIT_SIZE; + int remainedL1Size = l1Size - (dataUsedL1Size + biasUsedL1Size); + int kStep = CeilNoLog(tiling.singleCoreK, tiling.baseK); + int maxScaleFactorA = CeilNoLog(kStep, tiling.stepKa); + int maxScaleFactorB = CeilNoLog(kStep, tiling.stepKb); + + mxScaleFactor.scaleFactorA = + remainedL1Size / Impl::MX_L1_BUFFER_NUM / (tiling.stepKa * tiling.baseM * tiling.baseK / Impl::SCALE_K_SIZE); + mxScaleFactor.scaleFactorB = + remainedL1Size / Impl::MX_L1_BUFFER_NUM / (tiling.stepKb * tiling.baseN * tiling.baseK / Impl::SCALE_K_SIZE); + mxScaleFactor.scaleFactorA = + mxScaleFactor.scaleFactorA > maxScaleFactorA ? maxScaleFactorA : mxScaleFactor.scaleFactorA; + mxScaleFactor.scaleFactorB = + mxScaleFactor.scaleFactorB > maxScaleFactorB ? maxScaleFactorB : mxScaleFactor.scaleFactorB; + + // scaleFactor is in range of [1, 127] + mxScaleFactor.scaleFactorA = mxScaleFactor.scaleFactorA >= 1 ? mxScaleFactor.scaleFactorA : 1; + mxScaleFactor.scaleFactorB = mxScaleFactor.scaleFactorB >= 1 ? mxScaleFactor.scaleFactorB : 1; + mxScaleFactor.scaleFactorA = + mxScaleFactor.scaleFactorA <= Impl::SCALE_FACTOR_MAX_VALUE ? mxScaleFactor.scaleFactorA : Impl::SCALE_FACTOR_MAX_VALUE; + mxScaleFactor.scaleFactorB = + mxScaleFactor.scaleFactorB <= Impl::SCALE_FACTOR_MAX_VALUE ? 
mxScaleFactor.scaleFactorB : Impl::SCALE_FACTOR_MAX_VALUE; + + // 8bit: 0~6bit:scaleFactor, 7bit(reserved):double buffer flag + mxScaleFactor.scaleFactorA = mxScaleFactor.scaleFactorA & 0x7f; + mxScaleFactor.scaleFactorB = mxScaleFactor.scaleFactorB & 0x7f; + mxScaleFactor.mxTypePara = static_cast(static_cast(mxScaleFactor.mxTypePara) | mxScaleFactor.scaleFactorA); + mxScaleFactor.mxTypePara = static_cast(static_cast(mxScaleFactor.mxTypePara) | (mxScaleFactor.scaleFactorB << 8U)); + return mxScaleFactor; +} + +template +__aicore__ constexpr void GetMxMatmulApiTiling(MatmulApiStaticTiling &tiling, int32_t l1Size) +{ +#if defined(__DAV_C310__) + if constexpr (HasScalePosition::value || HasScalePosition::value) { + MxScaleStatus mxScaleFactor = GetMxScaleFactor(tiling, l1Size); + tiling.mxTypePara = mxScaleFactor.mxTypePara; + // For MxMatmul : usedL1Size = tensorASize + scaleASize + tensorBSize + scaleBSize + biasSize + int32_t needUsedL1Size = tiling.baseM * tiling.baseK * tiling.depthA1 * GetBitSize() / + ONE_BYTE_BIT_SIZE + tiling.baseM * tiling.baseK / Impl::SCALE_K_SIZE * tiling.depthA1 * mxScaleFactor.scaleFactorA * 1 + + tiling.baseN * tiling.baseK * tiling.depthB1 * GetBitSize() / + ONE_BYTE_BIT_SIZE + tiling.baseN * tiling.baseK / Impl::SCALE_K_SIZE * tiling.depthB1 * mxScaleFactor.scaleFactorB * 1 + + int32_t(tiling.isBias) * tiling.baseN * GetBitSize() / ONE_BYTE_BIT_SIZE; + if (needUsedL1Size > l1Size) { + tiling.stepM = 1; + tiling.stepN = 1; + tiling.stepKa = 1; + tiling.stepKb = 1; + tiling.depthA1 = 1; + tiling.depthB1 = 1; + tiling.dbL0A = 1; + tiling.dbL0B = 1; + tiling.mxTypePara = Impl::MIN_MX_PARAM; // scaleFactorA = 1, scaleFactorB = 1 + } + } +#endif +} } // namespace AscendC #endif // _MATMUL_CONSTANT_TILING_IMPL_ \ No newline at end of file diff --git a/impl/matmul/tiling/matmul_constant_tiling_struct.h b/impl/matmul/tiling/matmul_constant_tiling_struct.h index 287680ee..dc27b081 100644 --- a/impl/matmul/tiling/matmul_constant_tiling_struct.h +++ b/impl/matmul/tiling/matmul_constant_tiling_struct.h @@ -399,6 +399,28 @@ struct MatmulTiling { return tiling_->BatchNum; } } + + __aicore__ inline uint8_t GetScaleFactorA() const + { + constexpr static uint64_t scaleFactorAMask = 0x0000007f; + if constexpr (IsSameTypeV) { + return (TILING.mxTypePara != -1) ? + (TILING.mxTypePara & scaleFactorAMask) : tiling_->mxTypePara & scaleFactorAMask; + } else { + return tiling_->mxTypePara & scaleFactorAMask; + } + } + __aicore__ inline uint8_t GetScaleFactorB() const + { + constexpr static uint64_t scaleFactorBMask = 0x00007f00; + constexpr static uint32_t scaleFactorBOffset = 8; + if constexpr (IsSameTypeV) { + return (TILING.mxTypePara != -1) ? 
((TILING.mxTypePara & scaleFactorBMask) >> scaleFactorBOffset) : + ((tiling_->mxTypePara & scaleFactorBMask) >> scaleFactorBOffset); + } else { + return (tiling_->mxTypePara & scaleFactorBMask) >> scaleFactorBOffset ; + } + } private: const TCubeTiling* __restrict tiling_; }; diff --git a/impl/matmul/tiling/matmul_constant_tiling_utils.h b/impl/matmul/tiling/matmul_constant_tiling_utils.h index 01166a86..a1f4e64f 100644 --- a/impl/matmul/tiling/matmul_constant_tiling_utils.h +++ b/impl/matmul/tiling/matmul_constant_tiling_utils.h @@ -32,11 +32,14 @@ constexpr int32_t BITS_PER_BYTE = 8; constexpr int32_t L1_SIZE = 1024 * 1024; #elif __CCE_AICORE__ == 300 constexpr int32_t L1_SIZE = 1024 * 1024; -#elif __CCE_AICORE__ == 310 -constexpr int32_t L1_SIZE = 512 * 1024; #else constexpr int32_t L1_SIZE = 512 * 1024; #endif + +constexpr int32_t SCALE_K_SIZE = 32; +constexpr int32_t MIN_MX_PARAM = 257;// scaleFactorA = 1, scaleFactorB = 1 +constexpr int32_t MX_L1_BUFFER_NUM = 4;// A/B/scaleA/scaleB buffer +constexpr uint32_t SCALE_FACTOR_MAX_VALUE = 127;// scaleFactorA/scaleFactorB is 7 bit, max vaule is 127 } enum class L1TilingType : uint8_t { @@ -56,6 +59,12 @@ struct L1Status { int32_t loadSize; }; +struct MxScaleStatus { + uint8_t scaleFactorA; + uint8_t scaleFactorB; + int32_t mxTypePara; +}; + template __aicore__ constexpr int32_t GetReduceC0Size() { @@ -126,7 +135,7 @@ __aicore__ constexpr int32_t GetKAAlignValue() using SrcAT = typename A_TYPE::T; if constexpr (sizeof(SrcAT) == sizeof(float)) { // when in FP32 mode, k_a must be an even number if k-alignment is needed. So make ka_align_value as 2. - return A_TYPE::isTrans ? 2 : 1; + return (A_TYPE::isTrans) ? 2 : 1; } return 1; } @@ -137,7 +146,7 @@ __aicore__ constexpr int32_t GetKBAlignValue() using SrcBT = typename B_TYPE::T; if constexpr (sizeof(SrcBT) == sizeof(float)) { // Same as previous one, make kb_align_value as 2 when k-alignment is needed - return (A_TYPE::isTrans || !B_TYPE::isTrans) ? 2 : 1; + return (!B_TYPE::isTrans) ? 
2 : 1; } return 1; } @@ -181,14 +190,15 @@ __aicore__ constexpr int32_t GetDeQuantSize(const L1Status &l1Status, const Matm template __aicore__ constexpr int32_t GetAL1Size(const L1Status &l1Status, const MatmulConfig &mmCFG) { - using SrcAT = typename A_TYPE::T; int32_t curA1Size = 0; int32_t kL0 = GetKL0(mmCFG); if constexpr (PhyPosIsL1(A_TYPE::pos)) { curA1Size = 0; } else { - curA1Size = l1Status.mAL1 * mmCFG.basicM * CeilNoLog(l1Status.kAL1, kL0) * mmCFG.basicK * - l1Status.dbAL1 * GetBitSize() / ONE_BYTE_BIT_SIZE; + // be consistent with initbuffer + curA1Size = l1Status.dbAL1 * l1Status.mAL1 * mmCFG.basicM * CeilNoLog(l1Status.kAL1, kL0) * + Align(mmCFG.basicK, static_cast(GetKAAlignValue() * GetReduceC0Size())) * + GetBitSize() / ONE_BYTE_BIT_SIZE; } return curA1Size; } @@ -202,8 +212,10 @@ __aicore__ constexpr int32_t GetBL1Size(const L1Status &l1Status, const MatmulCo if constexpr (PhyPosIsL1(B_TYPE::pos)) { curB1Size = 0; } else { - curB1Size = l1Status.nBL1 * CeilNoLog(l1Status.kBL1, kL0) * l1Status.dbBL1 * mmCFG.basicN * - mmCFG.basicK * GetBitSize() / ONE_BYTE_BIT_SIZE; + // be consistent with initbuffer + curB1Size = l1Status.dbBL1 * l1Status.nBL1 * mmCFG.basicN * CeilNoLog(l1Status.kBL1, kL0) * + Align(mmCFG.basicK, static_cast(GetKBAlignValue() * GetReduceC0Size()))* + GetBitSize() / ONE_BYTE_BIT_SIZE; } return curB1Size; } @@ -219,19 +231,22 @@ __aicore__ constexpr int32_t GetL1Size(const L1Status &l1Status, const MatmulCon return curA1Size + curB1Size + biasSize + dequantSize; } -template +template __aicore__ constexpr int32_t CalcL1MaxLen(int32_t l1LeftSize, const L1Status &l1Status, const MatmulConfig &mmCFG, int32_t alignValue, L1TilingType type) { + // be consistent with initbuffer int32_t maxLen = 1; switch (type) { case L1TilingType::KAL1_16: - maxLen = l1LeftSize / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE); - maxLen = AlignDown(maxLen, alignValue); + maxLen = l1LeftSize / (l1Status.dbAL1 * l1Status.mAL1 * mmCFG.basicM * + Align(mmCFG.basicK, static_cast(alignValue * GetReduceC0Size())) * + GetBitSize() / ONE_BYTE_BIT_SIZE) * GetKL0(mmCFG); break; case L1TilingType::KBL1_16: - maxLen = l1LeftSize / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * Impl::C0_BYTE_SIZE); - maxLen = AlignDown(maxLen, alignValue); + maxLen = l1LeftSize / (l1Status.dbBL1 * l1Status.nBL1 * mmCFG.basicN * + Align(mmCFG.basicK, static_cast(alignValue * GetReduceC0Size())) * + GetBitSize() / ONE_BYTE_BIT_SIZE) * GetKL0(mmCFG); break; case L1TilingType::M_AL1: maxLen = l1LeftSize / (Align(l1Status.kAL1, alignValue) * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE); @@ -295,14 +310,20 @@ __aicore__ constexpr int32_t GetIterateOrder(const L1Status &l1Status, const Mat } } +template __aicore__ constexpr int32_t GetL0ADb(const MatmulConfig &mmCFG, uint32_t l0ASize) { - return (mmCFG.basicM * Impl::C0_BYTE_SIZE > l0ASize / Impl::DB_ON) ? Impl::DB_OFF : Impl::DB_ON; + using SrcAT = typename A_TYPE::T; + return (mmCFG.basicM * mmCFG.basicK * GetBitSize() / ONE_BYTE_BIT_SIZE > l0ASize / Impl::DB_ON) ? + Impl::DB_OFF : Impl::DB_ON; } +template __aicore__ constexpr int32_t GetL0BDb(const MatmulConfig &mmCFG, uint32_t l0BSize) { - return (mmCFG.basicN * Impl::C0_BYTE_SIZE > l0BSize / Impl::DB_ON) ? Impl::DB_OFF : Impl::DB_ON; + using SrcBT = typename B_TYPE::T; + return (mmCFG.basicN * mmCFG.basicK * GetBitSize() / ONE_BYTE_BIT_SIZE > l0BSize / Impl::DB_ON) ? 
+ Impl::DB_OFF : Impl::DB_ON; } template @@ -335,6 +356,34 @@ __aicore__ constexpr int32_t GetL1UsedSize(const MatmulConfig &mmCFG, const L1St return sharedl1Size; } +template +__aicore__ constexpr int32_t GetL1UsedSize(const MatmulConfig &mmCFG, int32_t depthA1, int32_t depthB1) +{ + int32_t sharedl1Size = 0; + if constexpr (!PhyPosIsL1(A_TYPE::pos)) { + sharedl1Size += depthA1 * mmCFG.basicM * mmCFG.basicK * GetBitSize() / ONE_BYTE_BIT_SIZE; + } + if constexpr (!PhyPosIsL1(B_TYPE::pos)) { + if constexpr (IsSameTypeV) { + sharedl1Size += depthB1 * mmCFG.basicN * mmCFG.basicK * + GetBitSize() / ONE_BYTE_BIT_SIZE; + } else { + // A16W8 w8 use same with A_TYPE + sharedl1Size += depthB1 * mmCFG.basicN * mmCFG.basicK * + GetBitSize() / ONE_BYTE_BIT_SIZE; + } + } + if (mmCFG.enableSetBias) { + if constexpr (!PhyPosIsL1(BIAS_TYPE::pos)) { + sharedl1Size += mmCFG.basicN * GetBitSize() / ONE_BYTE_BIT_SIZE; + } + } + if (mmCFG.enableQuantVector) { + sharedl1Size += depthB1 * mmCFG.basicN * sizeof(uint64_t); + } + return sharedl1Size; +} + template __aicore__ constexpr int32_t GetTransLength(const MatmulConfig &mmCFG, const L1Status &l1Status) { -- Gitee From ecb973c5170eceb23d05d885f27f9b080d81112f Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 03:45:45 +0000 Subject: [PATCH 37/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/tiling/matmul_tiling_algorithm.h | 12 ++++++--- impl/matmul/tiling/matmul_tiling_base.cpp | 27 ++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/impl/matmul/tiling/matmul_tiling_algorithm.h b/impl/matmul/tiling/matmul_tiling_algorithm.h index 3a668fbd..bb5323c2 100644 --- a/impl/matmul/tiling/matmul_tiling_algorithm.h +++ b/impl/matmul/tiling/matmul_tiling_algorithm.h @@ -368,7 +368,13 @@ private: void GetL0Factors(const std::string& opType, const MatmulRunParas& param, const CoreStatusPack& coreStatus, SingleCoreStatus& singleCoreStatus) const; void AdjustSparseL0Factors(SingleCoreStatus& singleCoreStatus) const; - bool IsNeedAlign(bool IsA) const; + void AdjustMxL0Factors(SingleCoreStatus& singleCoreStatus) const; + void AdjustMxL1Factors(SingleCoreStatus& singleCoreStatus, const int32_t k0Size) const; + void GetMxScaleFactor(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, int32_t& mxTypePara) const; + void CheckL0DB(SingleCoreStatus& singleCoreStatus, const int32_t baseK) const; + void GetMxUsedL1Size(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, + int32_t& dataUsedL1Size, int32_t& scaleUsedL1Size, int32_t& biasUsedL1Size) const; + bool IsNeedAlign(bool isA) const; int32_t GetL1Size(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const; int32_t CalL1MaxLen(int32_t resL1Size, L1StatusPack& l1Status, const L0StatusPack& l0Status, const int32_t alignValue, const L1TilingType axisName) const; @@ -450,7 +456,7 @@ private: bool DoMultiCoreSplitMNTiling(const MatmulRunParas& params, CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes); void CalcL1Tiling(const ComputeBaseBlock &baseBlock, int32_t &depthA1, int32_t &depthB1, - int32_t &stepKa, int32_t &stepKb) const; + int32_t &stepKa, int32_t &stepKb); void UpdateStepK(const ComputeBaseBlock &baseBlock, int32_t &stepK) const; bool NeedOutputAlign(int32_t m, int32_t n, int32_t k) const; void UpdateUsedSize() const; @@ -460,7 +466,7 @@ private: int32_t stepKa, int32_t stepKb) const; void UpdateShapeAndLayout() const; void AdjustFloatL1Factor(const SingleCoreStatus& singleCoreStatus) const; - int64_t 
UpdateTiling(const MatmulRunParas& param, const CoreStatusPack &coreStatus, SingleCoreStatus& singleCoreStatus) const; + int64_t UpdateTiling(const MatmulRunParas& param, const CoreStatusPack &coreStatus, SingleCoreStatus& singleCoreStatus); private: MatmulApiTilingBase* tilingIns_ = nullptr; bool singelBlockDim_ = false; diff --git a/impl/matmul/tiling/matmul_tiling_base.cpp b/impl/matmul/tiling/matmul_tiling_base.cpp index c4011e1a..07a79d6d 100644 --- a/impl/matmul/tiling/matmul_tiling_base.cpp +++ b/impl/matmul/tiling/matmul_tiling_base.cpp @@ -283,6 +283,30 @@ int32_t MatmulApiTilingBase::SetBType(TPosition pos, CubeFormat type, DataType d return 0; } +int32_t MatmulApiTilingBase::SetScaleAType(TPosition scalePos, CubeFormat scaleType, bool isScaleTrans) +{ + TILING_LOG_DEBUG(" A scale TPosition: %d", static_cast(scalePos)); + TILING_LOG_DEBUG(" A scale Type: %d", static_cast(scaleType)); + TILING_LOG_DEBUG(" A scale isTrans: %d", static_cast(isScaleTrans)); + aType_.hasSetScaleType = true; + aType_.scalePos = scalePos; + aType_.scaleType = scaleType; + aType_.isScaleTrans = isScaleTrans; + return 0; +} + +int32_t MatmulApiTilingBase::SetScaleBType(TPosition scalePos, CubeFormat scaleType, bool isScaleTrans) +{ + TILING_LOG_DEBUG(" B scale TPosition: %d", static_cast(scalePos)); + TILING_LOG_DEBUG(" B scale Type: %d", static_cast(scaleType)); + TILING_LOG_DEBUG(" B scale isTrans: %d", static_cast(isScaleTrans)); + bType_.hasSetScaleType = true; + bType_.scalePos = scalePos; + bType_.scaleType = scaleType; + bType_.isScaleTrans = isScaleTrans; + return 0; +} + int32_t MatmulApiTilingBase::SetCType(TPosition pos, CubeFormat type, DataType dataType) { TILING_LOG_DEBUG(" C matrix TPosition: %d", static_cast(pos)); @@ -401,6 +425,7 @@ int32_t MatmulApiTilingBase::SetBatchInfoForNormal(int32_t batchA, int32_t batch this->cLayoutInfoN = 1; this->cLayoutInfoG = 1; this->cLayoutInfoS2 = n; + this->isBMNKBmm = true; return 0; } @@ -712,6 +737,7 @@ void MatmulApiTilingBase::SetFinalTiling(optiling::TCubeTiling& tiling) tiling.set_CLayoutInfoG(this->tiling_.get_CLayoutInfoG()); tiling.set_CLayoutInfoS2(this->tiling_.get_CLayoutInfoS2()); tiling.set_BatchNum(this->tiling_.get_BatchNum()); + tiling.set_mxTypePara(this->tiling_.get_mxTypePara()); return; } @@ -800,6 +826,7 @@ void MatmulApiTilingBase::PrintTilingData() std::cout << "tiling.CLayoutInfoG = " << this->tiling_.get_CLayoutInfoG() << std::endl; std::cout << "tiling.CLayoutInfoS2 = " << this->tiling_.get_CLayoutInfoS2() << std::endl; std::cout << "tiling.BatchNum = " << this->tiling_.get_BatchNum() << std::endl; + std::cout << "tiling.mxTypePara = " << this->tiling_.get_mxTypePara() << std::endl; std::cout << "tiling.L1Ratio = " << (this->tiling_.get_shareL1Size() + 0.0) / this->oriBufferPool_.l1Size << std::endl; -- Gitee From 78582f366afbb85848c50b03e366a5d9c0d00f93 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 06:09:14 +0000 Subject: [PATCH 38/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/utils/matmul_config_impl.h | 3 + impl/matmul/utils/matmul_config_utils.h | 16 ++ impl/matmul/utils/matmul_module.h | 4 +- impl/matmul/utils/matmul_param.h | 79 ++++++- impl/matmul/utils/matmul_type_def.h | 36 +++- impl/matmul/utils/matmul_utils.h | 272 ++++++++++++++++++++++-- 6 files changed, 375 insertions(+), 35 deletions(-) diff --git a/impl/matmul/utils/matmul_config_impl.h b/impl/matmul/utils/matmul_config_impl.h index 66493636..ebd2215f 100644 --- a/impl/matmul/utils/matmul_config_impl.h +++ 
b/impl/matmul/utils/matmul_config_impl.h @@ -33,6 +33,7 @@ __aicore__ inline constexpr void GetMMConfigImpl(MatmulConfig& cfg, ArgType arg) cfg.isNBatch = arg.isNBatch; cfg.batchMode = arg.batchMode; cfg.isBiasBatch = arg.isBiasBatch; + cfg.isNBatchOut = arg.isNBatchOut; } else if constexpr (AscendC::IsSameType::value) { cfg.intrinsicsCheck = arg.intrinsicsCheck; cfg.enVecND2NZ = arg.enVecND2NZ; @@ -46,6 +47,8 @@ __aicore__ inline constexpr void GetMMConfigImpl(MatmulConfig& cfg, ArgType arg) cfg.isA2B2Shared = arg.isA2B2Shared; cfg.isEnableChannelSplit = arg.isEnableChannelSplit; cfg.enableKdimReorderLoad = arg.enableKdimReorderLoad; + } else if constexpr (AscendC::IsSameType::value) { + cfg.enableSetBias = arg.enableSetBias; } } diff --git a/impl/matmul/utils/matmul_config_utils.h b/impl/matmul/utils/matmul_config_utils.h index 4b155868..91eb4744 100644 --- a/impl/matmul/utils/matmul_config_utils.h +++ b/impl/matmul/utils/matmul_config_utils.h @@ -20,6 +20,7 @@ namespace AscendC { namespace Impl { #define L0AUF_SIZE 65536 #define L0BUF_SIZE 65536 + constexpr int32_t QUEUE_DEPTH = 1; constexpr int32_t NZ_MASK_VAlUE = 2; constexpr int32_t FLOAT_FACTOR = 2; @@ -51,6 +52,9 @@ constexpr static int UBSize_ = 256 * 1024; #elif __CCE_AICORE__ == 300 constexpr static int L1Size_ = 1024 * 1024; constexpr static int L0CSize_ = 128 * 1024; +#elif __CCE_AICORE__ == 310 +constexpr static int L1Size_ = 512 * 1024; +constexpr static int L0CSize_ = 256 * 1024; #else constexpr static int L1Size_ = 512 * 1024; constexpr static int L0CSize_ = 128 * 1024; @@ -58,6 +62,11 @@ constexpr static int L0CSize_ = 128 * 1024; constexpr static int L0ASize_ = 64 * 1024; constexpr static int L0BSize_ = 64 * 1024; +constexpr int32_t MX_K_FACTOR = 32; +constexpr int32_t MX_BASEK_FACTOR = 64; +constexpr int32_t FP8_TWO = 2; +constexpr int32_t FP4_TWO = 2; + /* the KFC_MESSAGE_LENGTH is 64 the MAX_MSG_COUNT is 64 @@ -72,6 +81,13 @@ constexpr static int L0BSize_ = 64 * 1024; equal: TOTAL_UB_SIZE * MAX_AIV_NUM */ constexpr int64_t GM_OFFSET = 128 * 2 * 64 * 50 + 128 * 8 * 50 + 192 * 1024 * 50; +#if defined(__DAV_C310__) +constexpr FixpipeConfig CFG_ROW_MAJOR_UB = {CO2Layout::ROW_MAJOR, true}; +constexpr FixpipeConfig CFG_NZ_UB = {CO2Layout::NZ, true}; +#else +constexpr FixpipeConfig CFG_ROW_MAJOR_UB = {CO2Layout::ROW_MAJOR}; +constexpr FixpipeConfig CFG_NZ_UB = {CO2Layout::NZ}; +#endif } // namespace Impl __aicore__ constexpr bool DoMatmulNorm(MatmulConfig mmCFG) diff --git a/impl/matmul/utils/matmul_module.h b/impl/matmul/utils/matmul_module.h index d11addea..7b937d24 100644 --- a/impl/matmul/utils/matmul_module.h +++ b/impl/matmul/utils/matmul_module.h @@ -156,7 +156,7 @@ using NAME = typename MATMUL_MODULE_IN_POLICY(template NAME<__VA_ARGS__>) /* Matmul Private Module */ #define MATMUL_PRIVATE_TEMPLATE AscendC::Impl::Detail::MatmulPrivateModules #define MATMUL_MODULE_IN_PRIVATE(...) \ -MATMUL_PRIVATE_TEMPLATE::__VA_ARGS__ +MATMUL_PRIVATE_TEMPLATE::__VA_ARGS__ #define MATMUL_IMPORT_MODULE_PRIVATE(...) 
protected MATMUL_MODULE_IN_PRIVATE(__VA_ARGS__) @@ -175,4 +175,4 @@ using NAME = typename MATMUL_MODULE_IN_PRIVATE(template NAME<__VA_ARGS__>) #define MATMUL_PARAM_VAR MATMUL_CONTEXT() #define MATMUL_CONST_PARAM_VAR MATMUL_CONST_CONTEXT() -#endif // _MATMUL_MODULE_H_ \ No newline at end of file +#endif // _MATMUL_MODULE_H_ diff --git a/impl/matmul/utils/matmul_param.h b/impl/matmul/utils/matmul_param.h index fb5baf3d..2f2bd89c 100644 --- a/impl/matmul/utils/matmul_param.h +++ b/impl/matmul/utils/matmul_param.h @@ -37,7 +37,7 @@ struct MatmulParamsBase { template struct MatmulParamsNorm : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; __aicore__ inline MatmulParamsNorm() {}; using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -98,7 +98,7 @@ template || IsBasicBlockEnable || IsIntrablock>> : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; __aicore__ inline MatmulParamsNorm() {}; using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -117,7 +117,7 @@ struct MatmulParamsNorm struct MatmulParamsMDL : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; __aicore__ inline MatmulParamsMDL() {}; using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -136,7 +136,7 @@ struct MatmulParamsMDL : public MatmulParamsBase struct MatmulParamsMDLSparse : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; __aicore__ inline MatmulParamsMDLSparse() {}; using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -153,6 +153,46 @@ struct MatmulParamsMDLSparse : public MatmulParamsBase +struct MatmulParamsMxNorm : public MatmulParamsBase { + using L0cT = typename GetMmDstType::Type; + __aicore__ inline MatmulParamsMxNorm() {}; + using SrcT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + + DEBUG_CODE(int calCount_ = 0); + + TPipe* tpipe_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + + int baseMN_; +}; + +template +struct MatmulParamsMxMDL : public MatmulParamsBase { + using L0cT = typename GetMmDstType::Type; + __aicore__ inline MatmulParamsMxMDL() {}; + using SrcT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + + DEBUG_CODE(int calCount_ = 0); + + TPipe* tpipe_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + + int baseMN_; +}; +#endif + template struct MatmulParamsBasicBlock : public MatmulParamsNorm { __aicore__ inline MatmulParamsBasicBlock() {}; @@ -160,7 +200,7 @@ struct MatmulParamsBasicBlock : public MatmulParamsNorm struct MatmulParamsIBShareNorm : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; __aicore__ inline MatmulParamsIBShareNorm() {}; using SrcT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -192,15 +232,16 @@ struct MatmulParams { // CFG_NORM template -struct MatmulParams { - __aicore__ inline MatmulParams(){}; +struct MatmulParams::value && !HasScalePosition::value>> { + __aicore__ inline MatmulParams() {}; using PARAMS = MatmulParamsNorm; }; // CFG_MDL template struct MatmulParams()>> { + enable_if_t() && (!HasScalePosition::value && !HasScalePosition::value)>> { __aicore__ inline MatmulParams() {}; using PARAMS = MatmulParamsMDL; }; @@ -208,11 
+249,29 @@ struct MatmulParams struct MatmulParams() && DoMatmulMDL(MM_CFG)>> { + enable_if_t() && DoMatmulMDL(MM_CFG) && (!HasScalePosition::value && !HasScalePosition::value)>> { __aicore__ inline MatmulParams() {}; using PARAMS = MatmulParamsMDLSparse; }; +#if defined(__DAV_C310__) +// MX_CFG_NORM +template +struct MatmulParams::value && HasScalePosition::value>> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsMxNorm; +}; + +// MX_CFG_MDL +template +struct MatmulParams::value && HasScalePosition::value>> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsMxMDL; +}; +#endif + // MM_CFG_BB template struct MatmulParams { @@ -230,4 +289,4 @@ struct MatmulParams + LayoutMode LAYOUT = LayoutMode::NONE, bool IBSHARE = false, TPosition SRCPOS = TPosition::GM> struct MatmulType { constexpr static TPosition pos = POSITION; constexpr static CubeFormat format = FORMAT; @@ -34,6 +36,7 @@ struct MatmulType { constexpr static bool isTrans = ISTRANS; constexpr static LayoutMode layout = LAYOUT; constexpr static bool ibShare = IBSHARE; + constexpr static TPosition srcPos = SRCPOS; }; template @@ -48,6 +51,18 @@ struct MatmulInputBType : INPUT_TYPE { constexpr static InputTypeTag TAG = InputTypeTag::B; }; +template +struct MatmulInputScaleAType : INPUT_TYPE { + using TRANS_T = TRANS_TYPE; + constexpr static InputTypeTag TAG = InputTypeTag::scaleA; +}; + +template +struct MatmulInputScaleBType : INPUT_TYPE { + using TRANS_T = TRANS_TYPE; + constexpr static InputTypeTag TAG = InputTypeTag::scaleB; +}; + template struct MatmulInputCType : INPUT_TYPE { using TRANS_T = TRANS_TYPE; @@ -63,7 +78,6 @@ struct ConstantType constexpr __aicore__ inline operator value_type() const noexcept {return value;} }; - typedef ConstantType falseType; typedef ConstantType trueType; @@ -81,5 +95,23 @@ struct SparseMatmulType: public MatmulType +struct HasScalePosition : falseType {}; + +template +struct HasScalePosition> : trueType {}; + +template +struct MatmulTypeWithScale : public MatmulType { + constexpr static TPosition scalePosition = SCALE_POSITION; + constexpr static CubeFormat scaleFormat = SCALE_FORMT; + constexpr static bool isScaleTrans = SCALE_ISTRANS; + constexpr static TPosition srcScalePos = SCALE_SRCPOS; +}; + +template +constexpr bool isMxMatmul = HasScalePosition::value && HasScalePosition::value; + } // namespace AscendC #endif // _MATMUL_TYPE_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/utils/matmul_utils.h b/impl/matmul/utils/matmul_utils.h index bc0e45ee..c8485eeb 100644 --- a/impl/matmul/utils/matmul_utils.h +++ b/impl/matmul/utils/matmul_utils.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * Copyright (c) 2024-2025 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). * Please refer to the License for details. You may not use this file except in compliance with the License. 
@@ -18,10 +18,35 @@ #include "matmul_config_utils.h" #include "matmul_type_def.h" +#include "../feature_trait/matmul_feature_trait.h" namespace AscendC { + +constexpr uint32_t L0C_BUF_BLOCK_LEN = 32 * 1024; +constexpr uint32_t L0C_BUF_BLOCK_NUM = 8; constexpr int8_t N_BUFFER_33_FACTOR = 3; +constexpr TQueConfig gCO1Config = { + .nd2nz = false, + .nz2nd = false, + .scmBlockGroup = false, + .bufferLen = L0C_BUF_BLOCK_LEN, + .bufferNumber = L0C_BUF_BLOCK_NUM +}; + +using gCO1QueType = TQue; +__BLOCK_LOCAL__ __inline__ void* gCO1Que; + +// kfc<->impl +struct MatrixL1Addr +{ + uint64_t l1aAddr; + uint64_t l1bAddr; + uint64_t l1aScaleAddr; + uint64_t l1bScaleAddr; + uint64_t l1biasAddr; +}; + struct DataCopyOutParams { __aicore__ DataCopyOutParams() { @@ -71,6 +96,12 @@ struct SplitParams int16_t axisL0Len; }; +struct MxSplitParams : public SplitParams +{ + int16_t kAuxMatrixL1Len; + int16_t kAuxMatrixL1Offset; +}; + struct BatchOffsetInfo { int32_t modA; @@ -129,38 +160,81 @@ struct CopyGMParams { bool isComputeLineByLine { false }; }; -template +template const LocalTensor NULL_TENSOR; template const GlobalTensor GLOBAL_NULL_TENSOR; -template struct GetDstType { +template struct GetMmDstType { + using Type = T; +}; + +template +struct GetL0DataType { using Type = T; }; -template <> struct GetDstType { +template <> struct GetMmDstType { using Type = float; }; -template <> struct GetDstType { +template <> struct GetMmDstType { using Type = float; }; -template <> struct GetDstType { +template <> struct GetMmDstType { using Type = int32_t; }; #if __CCE_AICORE__ >= 220 -template <> struct GetDstType { +template <> struct GetMmDstType { using Type = float; }; -template <> struct GetDstType { +template <> struct GetMmDstType { using Type = int32_t; }; #endif +#if defined(__DAV_C310__) +template <> struct GetMmDstType { + using Type = float; +}; + +template <> struct GetMmDstType { + using Type = float; +}; + +template <> struct GetMmDstType { + using Type = float; +}; + +template <> struct GetMmDstType { + using Type = float; +}; + +template <> struct GetMmDstType { + using Type = float; +}; + +template <> struct GetL0DataType { + using Type = AscendC::mx_fp8_e5m2_t; +}; + +template <> struct GetL0DataType { + using Type = float8_e5m2_t; +}; + +template <> struct GetL0DataType { + using Type = AscendC::mx_fp8_e4m3_t; +}; + +template <> struct GetL0DataType { + using Type = float8_e4m3_t; +}; +#endif + template struct IsGlobalTensor : falseType {}; @@ -194,14 +268,52 @@ __aicore__ inline constexpr static int32_t AuxGetC0Size() { if (sizeof(SrcT) == sizeof(float)) { return Impl::B32_C0SIZE; - } else if (IsSameType::value) { + } +#if defined(__DAV_C310__) + else if (IsTypeOneOfV) + { + return Impl::B8_C0SIZE; + } else if (IsTypeOneOfV) { + return Impl::B4_C0SIZE; + } +#else + else if (IsSameType::value) { return Impl::B8_C0SIZE; } else if (IsSameType::value) { return Impl::B4_C0SIZE; } +#endif return Impl::B16_C0SIZE; } +template +__aicore__ inline constexpr bool IsSuppportB8() +{ + if (IsSameTypeV) { + return true; + } +#if defined(__DAV_C310__) + if (IsTypeOneOfV) { + return true; + } +#endif + return false; +} + +template +__aicore__ inline constexpr bool IsSuppportB4() +{ + if (IsSameTypeV) { + return true; + } +#if defined(__DAV_C310__) + if (IsTypeOneOfV) { + return true; + } +#endif + return false; +} + template __aicore__ inline T CeilT(T num1, T num2) { @@ -218,7 +330,7 @@ __aicore__ inline T CeilAlignT(T num1, T num2) return CeilT(num1, num2) * num2; } -#if __CCE_AICORE__ == 220 +#if 
__CCE_AICORE__ == 220 || defined(__DAV_C310__) template __aicore__ inline void InitKfcClient(T &matmulClient, U *tiling, TPipe *tpipe, KfcCommClient *client, int instIdx, GM_ADDR workspace) @@ -228,9 +340,13 @@ __aicore__ inline void InitKfcClient(T &matmulClient, U *tiling, TPipe *tpipe, K matmulClient.client = client; matmulClient.instIdx = instIdx; matmulClient.cubeTiling.SetTiling((TCubeTiling *)tiling); - matmulClient.mmCntAddr_ = reinterpret_cast<__gm__ KfcMsg*>(GetMatmulIncAddr(workspace, GetBlockIdxImpl(), instIdx)); matmulClient.InitStatic(); +#if defined(__DAV_C310__) + matmulClient.devEvtID = instIdx; + matmulClient.waitFixpId = static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP) + instIdx; +#else matmulClient.devEvtID = instIdx * 2 + GetSubBlockIdxImpl(); +#endif } #endif __aicore__ constexpr bool PhyPosIsL1(TPosition pos) @@ -287,6 +403,13 @@ __aicore__ constexpr bool PhyPosIsGM(TPosition pos) return false; } +template +__aicore__ constexpr bool PhyPosIsL1OrUB(TPosition pos) +{ + return (PhyPosIsL1(pos) || (Impl::Detail::MatmulFeatureTrait::IsSupportUBToL1Singleshape() && + PhyPosIsUB(pos))); +} + __aicore__ constexpr bool PhyPosIsL0C(TPosition pos) { #if (__CCE_AICORE__ == 220) @@ -297,6 +420,36 @@ __aicore__ constexpr bool PhyPosIsL0C(TPosition pos) return false; } +#if defined(__DAV_C310__) +constexpr uint8_t INTRA_MODE = 4; +template +__aicore__ constexpr bool PhyMxScalePosIsL1() +{ + if constexpr (HasScalePosition::value) { + return PhyPosIsL1(INPUT_TYPE::scalePosition); + } + return false; +} + +template +__aicore__ constexpr bool PhyMxScalePosIsUB() +{ + if constexpr (HasScalePosition::value) { + return PhyPosIsUB(INPUT_TYPE::scalePosition); + } + return false; +} + +template +__aicore__ constexpr bool PhyMxScalePosIsGM() +{ + if constexpr (HasScalePosition::value) { + return PhyPosIsGM(INPUT_TYPE::scalePosition); + } + return false; +} +#endif + template __aicore__ __inline__ void SyncCubeWithVec() { // Ensure that the Cube starts to process the message after receiving the @@ -329,6 +482,14 @@ __aicore__ constexpr int32_t GetBitSize() if constexpr (IsSameTypeV) { return ONE_BYTE_BIT_SIZE / 2; } +#if defined(__DAV_C310__) + if constexpr (IsTypeOneOfV) { + return ONE_BYTE_BIT_SIZE; + } + if constexpr (IsTypeOneOfV) { + return ONE_BYTE_BIT_SIZE / 2; + } +#endif return ONE_BYTE_BIT_SIZE * 2; } @@ -429,23 +590,26 @@ __aicore__ inline uint16_t CeilAlign(uint16_t num1, uint16_t num2) template __aicore__ inline constexpr bool IsL0ACache() { - return (ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK) && - (ToMatmulConfig(MM_CFG).singleCoreM <= ToMatmulConfig(MM_CFG).basicM); + if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { + return ToMatmulConfig(MM_CFG).basicM * ToMatmulConfig(MM_CFG).basicK * sizeof(T) * Impl::DB_FACTOR <= L0AUF_SIZE; + } else { + return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * Impl::DB_FACTOR; + } } template __aicore__ inline constexpr bool IsL0BCache() { - if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - return ToMatmulConfig(MM_CFG).basicK * ToMatmulConfig(MM_CFG).basicN * sizeof(T) * Impl::DB_FACTOR <= L0BUF_SIZE; - } else { - return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * Impl::DB_FACTOR; - } + return (ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK) && + (ToMatmulConfig(MM_CFG).singleCoreM <= ToMatmulConfig(MM_CFG).basicM); } template __aicore__ inline constexpr bool IsL0Cache() { + if 
constexpr (Impl::Detail::MatmulFeatureTrait::IsSupportLoad2dV2() || HasScalePosition::value) { + return false; + } if constexpr ((!ToMatmulConfig(MM_CFG).doNorm && !ToMatmulConfig(MM_CFG).doMultiDataLoad) || ToMatmulConfig(MM_CFG).intraBlockPartSum || A_TYPE::layout != LayoutMode::NONE || ToMatmulConfig(MM_CFG).isA2B2Shared) { @@ -482,9 +646,11 @@ constexpr bool IsBasicBlockEnable = DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecia template constexpr bool IsIntrablock = DoMatmulNorm(MM_CFG) && ToMatmulConfig(MM_CFG).intraBlockPartSum; +template +constexpr bool IsMxTransEnableWithND = (HasScalePosition::value && T::isTrans && T::format == CubeFormat::ND); -template -constexpr bool IsKdimReorderLoad = ToMatmulConfig(MM_CFG).enableKdimReorderLoad; +template +constexpr bool IsMxTransDisableWithND = (HasScalePosition::value && !T::isTrans && T::format == CubeFormat::ND); enum class PolicyType { MATMUL_DEFAULT = 0, @@ -493,10 +659,74 @@ enum class PolicyType { MATMUL_LOWER_TRIANGULAR = 3, }; +template +constexpr bool IsKdimReorderLoad = ToMatmulConfig(MM_CFG).enableKdimReorderLoad; + template constexpr bool NormInitScene = DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG); template constexpr bool MdlInitScene = DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG); + +template +__aicore__ inline constexpr bool IsL1BNeedTrans() +{ + if constexpr (!Impl::Detail::MatmulFeatureTrait::IsMmadInstrSupportAntiQuant()) { + if constexpr (GetBitSize() == GetBitSize()) { + return false; + } else { + return true; + } + } + return false; +} + +template +__aicore__ inline constexpr auto GetTransBDataType() +{ + if constexpr (HasScalePosition::value) { + B_TYPE mxBType; + return mxBType; + } else if constexpr (IsL1BNeedTrans()) { + A_TYPE AType; + return AType; + } else { + B_TYPE BType; + return BType; + } +} + +enum class McgShfMode { + SINGLE_DST_MODE = 0, + DUAL_DST_SPLIT_M, + DUAL_DST_SPLIT_N, + RESERVED +}; + +template +__aicore__ inline constexpr bool IsScaleTag() { + return INPUT_TYPE::TAG == InputTypeTag::scaleA || INPUT_TYPE::TAG == InputTypeTag::scaleB; +} + +template +__aicore__ inline constexpr bool InputPhyPosIsL1() +{ + if constexpr (IsScaleTag()) { + return PhyPosIsL1(INPUT_TYPE::scalePosition); + } else { + return PhyPosIsL1(INPUT_TYPE::pos); + } +} + +template +__aicore__ inline constexpr bool InputPhyPosIsUB() +{ + if constexpr (IsScaleTag()) { + return PhyPosIsUB(INPUT_TYPE::scalePosition); + } else { + return PhyPosIsUB(INPUT_TYPE::pos); + } +} + } // namespace AscendC -#endif // _MATMUL_UTILS_H_ +#endif // _MATMUL_UTILS_H_ \ No newline at end of file -- Gitee From 92ea42afdd7467d39759e366d9691be3b80e5abd Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 06:11:04 +0000 Subject: [PATCH 39/56] add Signed-off-by: jiangchengcheng-on --- impl/matmul/matmul_impl_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impl/matmul/matmul_impl_base.h b/impl/matmul/matmul_impl_base.h index 1962a375..4d18bc0e 100644 --- a/impl/matmul/matmul_impl_base.h +++ b/impl/matmul/matmul_impl_base.h @@ -665,4 +665,4 @@ __aicore__ inline bool MatmulImplBaseScheduleOnce(enPartialSum); } } // namespace AscendC -#endif // _MATMUL_IMPL_BASE_H_ \ No newline at end of file +#endif // _MATMUL_IMPL_BASE_H_ -- Gitee From 71981455293ec87e0b6e29c2cff5c82bd521c81b Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 06:15:43 +0000 Subject: [PATCH 40/56] add Signed-off-by: jiangchengcheng-on --- lib/matmul/constant_tiling.h | 89 
++++++++++++++++++++++++++++++++++-- lib/matmul/matmul.h | 5 +- 2 files changed, 87 insertions(+), 7 deletions(-) diff --git a/lib/matmul/constant_tiling.h b/lib/matmul/constant_tiling.h index 63eaa5df..ea71c46f 100644 --- a/lib/matmul/constant_tiling.h +++ b/lib/matmul/constant_tiling.h @@ -28,8 +28,8 @@ __aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig tiling.baseM = mmCFG.basicM; tiling.baseN = mmCFG.basicN; tiling.baseK = mmCFG.basicK; - tiling.dbL0A = GetL0ADb(mmCFG, TOTAL_L0A_SIZE); - tiling.dbL0B = GetL0BDb(mmCFG, TOTAL_L0B_SIZE); + tiling.dbL0A = GetL0ADb(mmCFG, TOTAL_L0A_SIZE); + tiling.dbL0B = GetL0BDb(mmCFG, TOTAL_L0B_SIZE); tiling.isBias = mmCFG.enableSetBias; } return tiling; @@ -58,8 +58,9 @@ __aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig tiling.depthB1 = CeilNoLog(l1Factor.kBL1, kL0) * l1Factor.nBL1 * l1Factor.dbBL1; } tiling.iterateOrder = GetIterateOrder(l1Factor, mmCFG); - tiling.dbL0A = GetL0ADb(mmCFG, TOTAL_L0A_SIZE); - tiling.dbL0B = GetL0BDb(mmCFG, TOTAL_L0B_SIZE); + tiling.dbL0A = GetL0ADb(mmCFG, TOTAL_L0A_SIZE); + tiling.dbL0B = GetL0BDb(mmCFG, TOTAL_L0B_SIZE); + GetMxMatmulApiTiling(tiling, l1Size); // keep the same with runtime tiling, fix l0c db tiling.dbL0C = 1; tiling.transLength = GetTransLength(mmCFG, l1Factor); @@ -71,5 +72,83 @@ __aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig tiling.shareUbSize = 0; return tiling; } + +template +__aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig &mmCFG, int32_t l1Size = Impl::L1_SIZE) +{ + constexpr auto singleM = Std::tuple_element<0, SingleShape>::type::value; + constexpr auto singleN = Std::tuple_element<1, SingleShape>::type::value; + constexpr auto singleKa = Std::tuple_element<2, SingleShape>::type::value; + constexpr auto singleKb = []() { + if constexpr (Std::tuple_size_v > 3) { + return Std::tuple_element<3, SingleShape>::type::value; + } else { + return singleKa; + } + }(); + constexpr auto l1M = Std::tuple_element<0, L1Shape>::type::value; + constexpr auto l1N = Std::tuple_element<1, L1Shape>::type::value; + constexpr auto l1Ka = Std::tuple_element<2, L1Shape>::type::value; + constexpr auto l1Kb = []() { + if constexpr (Std::tuple_size_v > 3) { + return Std::tuple_element<3, L1Shape>::type::value; + } else { + return l1Ka; + } + }(); + constexpr auto baseM = Std::tuple_element<0, BaseShape>::type::value; + constexpr auto baseN = Std::tuple_element<1, BaseShape>::type::value; + constexpr auto baseK = Std::tuple_element<2, BaseShape>::type::value; + + if constexpr (l1M == 0 || l1N == 0 || l1Ka == 0 || l1Kb == 0) { + return GetMatmulApiTiling(mmCFG, l1Size); + } + MatmulApiStaticTiling tiling; + tiling.cfg = mmCFG; + tiling.baseM = baseM; + tiling.baseN = baseN; + tiling.baseK = baseK; + tiling.dbL0A = 2 * baseM * baseK * GetBitSize() / ONE_BYTE_BIT_SIZE <= TOTAL_L0A_SIZE ? + Impl::DB_ON : Impl::DB_OFF; + tiling.dbL0B = 2 * baseK * baseN * GetBitSize() / ONE_BYTE_BIT_SIZE <= TOTAL_L0B_SIZE ? 
+ Impl::DB_ON : Impl::DB_OFF; + tiling.isBias = mmCFG.enableSetBias; + tiling.M = singleM; + tiling.N = singleN; + tiling.Ka = singleKa; + tiling.Kb = singleKb; + tiling.singleCoreM = singleM; + tiling.singleCoreN = singleN; + tiling.singleCoreK = singleKa; + tiling.stepM = CeilNoLog(l1M, baseM); + tiling.stepN = CeilNoLog(l1N, baseN); + tiling.stepKa = CeilNoLog(l1Ka, baseK); + tiling.stepKb = CeilNoLog(l1Kb, baseK); + tiling.depthA1 = tiling.stepM * tiling.stepKa * 2; // 2 DoubleBuffer + tiling.depthB1 = tiling.stepN * tiling.stepKb * 2; // 2 DoubleBuffer + tiling.iterateOrder = 0; + // keep the same with runtime tiling, fix l0c db + tiling.dbL0C = 1; + int32_t biasLength = 0; + if (mmCFG.enableSetBias) { + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + biasLength = 0; + } else { + int32_t channelWiseSize = GetChannelWise(mmCFG) * 1 * GetTypeSize(); + biasLength = tiling.stepN * baseN * channelWiseSize; + } + } + // C matrix ND2NZ + int32_t c1Length = 0; + if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::pos == TPosition::GM) { + c1Length = baseM * baseN * GetBitSize() / ONE_BYTE_BIT_SIZE; + } + tiling.transLength = MaxValue(c1Length, biasLength); + tiling.shareMode = 0; + tiling.shareL1Size = GetL1UsedSize(mmCFG, tiling.depthA1, tiling.depthB1); + tiling.shareL0CSize = baseM * baseN * GetBitSize() / ONE_BYTE_BIT_SIZE; + tiling.shareUbSize = 0; + return tiling; +} } // namespace matmul -#endif // LIB_MATMUL_CONSTANT_TILING_H \ No newline at end of file +#endif // LIB_MATMUL_CONSTANT_TILING_H diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index a9647c4e..8b74c1db 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -43,7 +43,7 @@ public: using CType = C_TYPE; using BiasType = BIAS_TYPE; private: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; using SrcBT = typename B_TYPE::T; @@ -60,7 +60,8 @@ public: __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false) {} __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false) {} __aicore__ inline void SetBias(const GlobalTensor& biasGlobal) {} - __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} + template + __aicore__ inline void SetSelfDefineData(const T dataPtr) {} __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} __aicore__ inline void SetSparseIndex(const GlobalTensor& indexGlobal); __aicore__ inline void SetAntiQuantScalar(const SrcT offsetScalar, const SrcT scaleScalar) {} -- Gitee From 0e32c0d43874a651b1bf4ab41d4a12b68086676a Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 19 May 2025 06:15:54 +0000 Subject: [PATCH 41/56] add Signed-off-by: jiangchengcheng-on --- lib/matmul/matmul_config.h | 12 ++++++++++ lib/matmul/matmul_intf.h | 8 +++---- lib/matmul/matmul_tiling_base.h | 29 ++++++++++++++++++++---- lib/matmul/matmul_tilingdata.h | 2 +- lib/matmul/tiling.h | 39 +++++++++++++++++++++++++++------ 5 files changed, 74 insertions(+), 16 deletions(-) diff --git a/lib/matmul/matmul_config.h b/lib/matmul/matmul_config.h index 8b2baed3..ab7711fd 100644 --- a/lib/matmul/matmul_config.h +++ b/lib/matmul/matmul_config.h @@ -25,6 +25,8 @@ #define ITERATE_SIZE 2 +constexpr uint32_t SHARED_CO1_BUFFER_SIZE = 64 * 1024; + enum class CubeFormat { ND = 0, NZ, @@ -34,6 +36,8 @@ enum class CubeFormat { ND_ALIGN, SCALAR, VECTOR, + ROW_MAJOR = ND, // ND + COLUMN_MAJOR = 8, // DN }; enum class LayoutMode { @@ -134,6 +138,9 @@ struct 
MatmulConfig { bool isA2B2Shared = false; bool isEnableChannelSplit = false; bool enableKdimReorderLoad = false; + bool isCO1Shared = false; + uint32_t sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE; + bool isNBatchOut = false; }; enum class MatmulConfigMode { @@ -161,6 +168,7 @@ struct MatmulBatchParams { bool isNBatch; BatchMode batchMode; bool isBiasBatch = true; + bool isNBatchOut = false; }; struct MatmulFuncParams { @@ -179,6 +187,10 @@ struct MatmulFuncParams { bool enableKdimReorderLoad = false; }; +struct MatmulBiasParams { + bool enableSetBias = true; +}; + struct MatrixOffset { int32_t offset; int32_t row, col; diff --git a/lib/matmul/matmul_intf.h b/lib/matmul/matmul_intf.h index cfc23a5a..c3e8520c 100644 --- a/lib/matmul/matmul_intf.h +++ b/lib/matmul/matmul_intf.h @@ -16,7 +16,7 @@ #define LIB_MATMUL_MATMUL_INTF_H #include "../impl/kfc/kfc_register_obj.h" -#if __CCE_AICORE__ == 220 +#if __CCE_AICORE__ == 220 || defined(__DAV_C310__) #include "../impl/kfc/kernel_kfc.h" #else #include "lib/matmul/matmul.h" @@ -26,7 +26,7 @@ namespace AscendC { #define REGIST_MATMUL_OBJ_STATIC REGIST_CUBE_OBJ #define REGIST_MATMUL_OBJ REGIST_CUBE_OBJ #ifdef ASCENDC_CPU_DEBUG -#if __CCE_AICORE__ == 220 +#if __CCE_AICORE__ == 220 || defined(__DAV_C310__) #ifdef ASCENDC_CUBE_ONLY template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> @@ -45,7 +45,7 @@ using Matmul = MatmulImpl, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> @@ -56,7 +56,7 @@ template ; #endif -#elif defined(__DAV_C220_VEC__) +#elif defined(SPLIT_CORE_VEC) template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> using Matmul = MatmulClient; diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index b6708ce5..243981e4 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -21,7 +21,9 @@ #include "tiling/platform/platform_ascendc.h" namespace matmul_tiling { +#ifndef __ASCC_DEVICE__ using half = double; +#endif constexpr int32_t UINT8_BYTES = 1; constexpr int32_t INT8_BYTES = 1; constexpr int32_t FP32_BYTES = 4; @@ -63,23 +65,32 @@ enum class DataType : int32_t { DT_INT2 = 31, // int2 type DT_UINT2 = 32, // uint2 type DT_BFLOAT16 = 33, // bf16 type - DT_MAX = 34 // Mark the boundaries of data types + DT_HIFLOAT8 = 34, // hifp8 type + DT_FLOAT8_E4M3FN = 35, // fp8_e4m3 type + DT_FLOAT8_E5M2 = 36, // fp8_e5m2 type + DT_FLOAT4_E2M1 = 37, // fp4_e2m1 type + DT_FLOAT8_E8M0 = 38, // fp8_e8m0 type + DT_FLOAT4_E1M2 = 39, // fp4_e1m2 type + DT_MAX = 40 // Mark the boundaries of data types }; +#ifndef __ASCC_DEVICE__ const std::map DTYPE_BYTE_TAB = { {DataType::DT_FLOAT, 4}, {DataType::DT_FLOAT16, 2}, {DataType::DT_INT8, 1}, {DataType::DT_INT16, 2}, {DataType::DT_UINT16, 2}, {DataType::DT_UINT8, 1}, {DataType::DT_INT32, 4}, {DataType::DT_INT64, 8}, {DataType::DT_UINT32, 4}, {DataType::DT_UINT64, 8}, {DataType::DT_BF16, 2}, {DataType::DT_BFLOAT16, 2}, - {DataType::DT_INT4, 1} + {DataType::DT_INT4, 1}, {DataType::DT_FLOAT4_E2M1, 1}, {DataType::DT_FLOAT4_E1M2, 1}, {DataType::DT_HIFLOAT8, 1}, + {DataType::DT_FLOAT8_E4M3FN, 1}, {DataType::DT_FLOAT8_E5M2, 1}, {DataType::DT_FLOAT8_E8M0, 1} }; const std::map DTYPE_BIT_TAB = { {DataType::DT_FLOAT, 32}, {DataType::DT_FLOAT16, 16}, {DataType::DT_INT8, 8}, {DataType::DT_INT16, 16}, {DataType::DT_UINT16, 16}, {DataType::DT_UINT8, 8}, {DataType::DT_INT32, 32}, {DataType::DT_INT64, 64}, {DataType::DT_UINT32, 32}, {DataType::DT_UINT64, 64}, {DataType::DT_BF16, 16}, {DataType::DT_BFLOAT16, 16}, - {DataType::DT_INT4, 4} + {DataType::DT_INT4, 4}, {DataType::DT_FLOAT4_E2M1, 
4}, {DataType::DT_FLOAT4_E1M2, 4}, {DataType::DT_HIFLOAT8, 8}, + {DataType::DT_FLOAT8_E4M3FN, 8}, {DataType::DT_FLOAT8_E5M2, 8}, {DataType::DT_FLOAT8_E8M0, 8} }; - +#endif // __ASCC_DEVICE__ enum class TPosition : int32_t { GM, A1, @@ -116,6 +127,8 @@ enum class CubeFormat : int32_t { ND_ALIGN, SCALAR, VECTOR, + ROW_MAJOR = ND, // ND + COLUMN_MAJOR = 8, // DN }; enum class MatrixTraverse : int32_t { @@ -127,6 +140,7 @@ enum class MatrixTraverse : int32_t { enum class MatrixMadType : int32_t { NORMAL = 0, HF32 = 1, // V220 HF32 + MXMODE = 2, // v310 MxMatmulFlag }; enum class DequantType : int32_t { @@ -151,6 +165,10 @@ struct MatTilingType { DataType dataType = DataType::DT_FLOAT; bool isTrans = false; bool isDB = false; + bool hasSetScaleType = false; + TPosition scalePos = TPosition::GM; + CubeFormat scaleType = CubeFormat::ND; + bool isScaleTrans = false; }; struct BufferPool { @@ -202,6 +220,8 @@ public: virtual ~MatmulApiTilingBase(); int32_t SetAType(TPosition pos, CubeFormat type, DataType dataType, bool isTrans = false); int32_t SetBType(TPosition pos, CubeFormat type, DataType dataType, bool isTrans = false); + int32_t SetScaleAType(TPosition scalePos, CubeFormat scaleType, bool isScaleTrans = false); + int32_t SetScaleBType(TPosition scalePos, CubeFormat scaleType, bool isScaleTrans = true); int32_t SetCType(TPosition pos, CubeFormat type, DataType dataType); int32_t SetBiasType(TPosition pos, CubeFormat type, DataType dataType); int32_t SetDequantType(DequantType dequantType) @@ -335,6 +355,7 @@ public: int32_t mmConfigType = 1; // 0: Norm; 1: MDL bool enableL1CacheUB = false; bool enVecND2NZ = false; + bool isBMNKBmm = false; protected: virtual int64_t Compute() = 0; diff --git a/lib/matmul/matmul_tilingdata.h b/lib/matmul/matmul_tilingdata.h index de75687f..d130ccb5 100644 --- a/lib/matmul/matmul_tilingdata.h +++ b/lib/matmul/matmul_tilingdata.h @@ -68,7 +68,7 @@ TILING_DATA_FIELD_DEF(int32_t, CLayoutInfoN); TILING_DATA_FIELD_DEF(int32_t, CLayoutInfoG); TILING_DATA_FIELD_DEF(int32_t, CLayoutInfoS2); TILING_DATA_FIELD_DEF(int32_t, BatchNum); -TILING_DATA_FIELD_DEF(int32_t, reserved); +TILING_DATA_FIELD_DEF(int32_t, mxTypePara); END_TILING_DATA_DEF; } diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h index 619f5266..76ae6ec9 100644 --- a/lib/matmul/tiling.h +++ b/lib/matmul/tiling.h @@ -20,7 +20,7 @@ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = f const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF, const ScheduleType scheduleType = ScheduleType::INNER_PRODUCT, const bool enUnitFlag = true, - const bool enableMixDualMaster = false) + const bool enableMixDualMaster = false, const bool isNBatchOut = false) { return { .doNorm = true, @@ -68,7 +68,11 @@ __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = f .isPartialOutput = false, .enableMixDualMaster = enableMixDualMaster, .isA2B2Shared = false, - .isEnableChannelSplit = false, .enableKdimReorderLoad = false + .isEnableChannelSplit = false, + .enableKdimReorderLoad = false, + .isCO1Shared = false, + .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE, + .isNBatchOut = isNBatchOut }; } @@ -123,7 +127,11 @@ __aicore__ constexpr MatmulConfig GetMDLConfig(const bool intrinsicsLimit = fals .isPartialOutput = false, .enableMixDualMaster = enableMixDualMaster, .isA2B2Shared = false, - .isEnableChannelSplit = false, .enableKdimReorderLoad = enableKdimReorderLoad 
+ .isEnableChannelSplit = false, + .enableKdimReorderLoad = enableKdimReorderLoad, + .isCO1Shared = false, + .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE, + .isNBatchOut = false }; } @@ -177,7 +185,11 @@ __aicore__ constexpr MatmulConfig GetSpecialMDLConfig(const bool intrinsicsLimit .isPartialOutput = false, .enableMixDualMaster = false, .isA2B2Shared = false, - .isEnableChannelSplit = false, .enableKdimReorderLoad = false + .isEnableChannelSplit = false, + .enableKdimReorderLoad = false, + .isCO1Shared = false, + .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE, + .isNBatchOut = false }; } @@ -231,7 +243,11 @@ __aicore__ constexpr MatmulConfig GetBasicConfig(const uint32_t basicM, const ui .isPartialOutput = false, .enableMixDualMaster = false, .isA2B2Shared = false, - .isEnableChannelSplit = false, .enableKdimReorderLoad = false + .isEnableChannelSplit = false, + .enableKdimReorderLoad = false, + .isCO1Shared = false, + .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE, + .isNBatchOut = false }; } @@ -286,7 +302,11 @@ __aicore__ constexpr MatmulConfig GetSpecialBasicConfig(const uint32_t basicM, c .isPartialOutput = false, .enableMixDualMaster = false, .isA2B2Shared = false, - .isEnableChannelSplit = false, .enableKdimReorderLoad = false + .isEnableChannelSplit = false, + .enableKdimReorderLoad = false, + .isCO1Shared = false, + .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE, + .isNBatchOut = false }; } @@ -340,7 +360,11 @@ __aicore__ constexpr MatmulConfig GetIBShareNormConfig(const bool intrinsicsLimi .isPartialOutput = false, .enableMixDualMaster = false, .isA2B2Shared = false, - .isEnableChannelSplit = false, .enableKdimReorderLoad = false + .isEnableChannelSplit = false, + .enableKdimReorderLoad = false, + .isCO1Shared = false, + .sharedCO1BufferSize = SHARED_CO1_BUFFER_SIZE, + .isNBatchOut = false }; } @@ -409,6 +433,7 @@ struct MatmulApiStaticTiling { int32_t CLayoutInfoG = -1; int32_t CLayoutInfoS2 = -1; int32_t BatchNum = -1; + int32_t mxTypePara = -1; MatmulConfig cfg = CFG_NORM; }; -- Gitee From 31e6142347281919230e4f5faf5645eeb23b7f6b Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 02:08:38 +0000 Subject: [PATCH 42/56] add Signed-off-by: jiangchengcheng-on --- tests/matmul/bias/test_bias_c2_buffer.cpp | 2 +- tests/matmul/iterator/test_batch_m_loop.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/matmul/bias/test_bias_c2_buffer.cpp b/tests/matmul/bias/test_bias_c2_buffer.cpp index c3578e3c..7b47de1a 100644 --- a/tests/matmul/bias/test_bias_c2_buffer.cpp +++ b/tests/matmul/bias/test_bias_c2_buffer.cpp @@ -91,7 +91,7 @@ private: using B_TYPE = MatmulType; using C_TYPE = MatmulType; using BIAS_TYPE = MatmulType; - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; MatmulImpl mm1_; MatmulImpl mm2_; diff --git a/tests/matmul/iterator/test_batch_m_loop.cpp b/tests/matmul/iterator/test_batch_m_loop.cpp index 8b9c1098..8288694d 100644 --- a/tests/matmul/iterator/test_batch_m_loop.cpp +++ b/tests/matmul/iterator/test_batch_m_loop.cpp @@ -108,4 +108,4 @@ TEST_F(TestMLoopBatch, batch_m_loop) { EXPECT_EQ(mm.GetOuterIdx(), 3); EXPECT_EQ(mm.GetBaseShape(), 48); EXPECT_EQ(mm.GetBaseBlockShape(), 3); -} \ No newline at end of file +} \ No newline at end of file -- Gitee From d879cdd9113dd0ae812cfdf6c84903830f29869c Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 02:10:01 +0000 Subject: [PATCH 43/56] new Signed-off-by: jiangchengcheng-on --- .../tiling/matmul_tiling_algorithm_new.cpp | 
895 ++++++++++++++++++ 1 file changed, 895 insertions(+) create mode 100644 impl/matmul/tiling/matmul_tiling_algorithm_new.cpp diff --git a/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp b/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp new file mode 100644 index 00000000..aaa75b30 --- /dev/null +++ b/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp @@ -0,0 +1,895 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_tiling_algorithm.cpp + * \brief + */ +#include "matmul_tiling_algorithm.h" + +#include +#include +#include +#include +#include + +#include "securec.h" + +#include "impl/host_log.h" +#include "math_util.h" + +using namespace std; + +namespace matmul_tiling { +constexpr uint32_t IDX_ZERO = 0; +constexpr uint32_t IDX_ONE = 1; +constexpr uint32_t IDX_TWO = 2; +constexpr uint32_t IDX_THREE = 3; +constexpr uint32_t IDX_FOUR = 4; +constexpr uint32_t IDX_FIVE = 5; +constexpr uint32_t IDX_SIX = 6; +constexpr uint32_t IDX_SEVEN = 7; +constexpr int32_t MAX_BIAS_N = 16; +constexpr int32_t MTE1_L0A_BANDWIDTH = 256; +constexpr int32_t MTE1_L0B_BANDWIDTH = 128; +constexpr int32_t INPUTDTYPE_BYTES = 2; +constexpr int32_t MIN_MTE1_LOAD = 32; +constexpr int32_t REDUCE_BLOCK_SIZE = 16; +constexpr int32_t INT8_REDUCE_BLOCK_SIZE = 32; +constexpr int32_t INT4_REDUCE_BLOCK_SIZE = 64; +constexpr int32_t FLOAT32_REDUCE_BLOCK_SIZE = 8; +constexpr int32_t MIN_FRACTAL_SIZE = C0_SIZE * REDUCE_BLOCK_SIZE; +constexpr uint32_t BEST_VALUE_LENGTH = 13; +constexpr int32_t BEST_VALUE_LIST[BEST_VALUE_LENGTH] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; +constexpr uint32_t DIM_FACTOR_LENGTH = 4; +constexpr uint64_t UINT64_TYPES = 8; +constexpr int32_t L0B_ALIGN_SIZE = 2; + +// minimum factor number requirement for the data amount in single-core +constexpr int32_t L0_FACTOR_NUM_LIMIT = 2; +constexpr int32_t L1_FACTOR_NUM_LIMIT = 4; +// the lower bound of the factor number check +constexpr int32_t L0_FACTOR_LIMIT = 64; +constexpr int32_t L1_FACTOR_LIMIT = 128; + +static MatmulTemplateCfg g_tempCfg; + +constexpr int32_t MTE1_FIXPIPE_BANDWIDTH = 128; // 128 Bytes per cycle + +constexpr int32_t NUM_TWO = 2; + +constexpr int32_t ATTACH_FLAG_ZERO = 0; +constexpr int32_t ATTACH_FLAG_ONE = 1; +constexpr int32_t ATTACH_FLAG_TWO = 2; + +constexpr int32_t INT8_ALIGN_SIZE = 32; +constexpr int32_t FP32_ALIGN_SIZE = 16; +constexpr int32_t INT4_ALIGN_SIZE = 64; +constexpr int32_t DATA_COPY_ALIGN_SIZE = 256; // 256B + +constexpr int DT_FLOAT_INVALID_BASEK = 8; + +constexpr int32_t MX_BASEK_FACTOR = 64; +constexpr int32_t MX_L1_BUFFER_NUM = 4; +constexpr uint32_t SCALE_FACTOR_MAX_VALUE = 127; + +MatmulTilingAlgorithm::MatmulTilingAlgorithm(MatmulApiTilingBase* tilingIns) +{ + ASCENDC_HOST_ASSERT(tilingIns != nullptr, {}, "tiling instance is null"); + tilingIns_ = tilingIns; +} + +int32_t MatmulTilingAlgorithm::GetBestValue(int32_t base) const +{ + for (uint32_t i = 0; i < BEST_VALUE_LENGTH; ++i) { + if (i == 0 || BEST_VALUE_LIST[i] 
<= base) { + continue; + } + return BEST_VALUE_LIST[i - 1]; + } + return BEST_VALUE_LIST[BEST_VALUE_LENGTH - 1]; +} + +void MatmulTilingAlgorithm::GetTwoFactors(int32_t (&res)[2], int32_t base, int32_t dim, int32_t maxNum) const +{ + if (dim == 1) { + res[0] = 1; + res[1] = 1; + return; + } + + res[0] = 0; + res[1] = 0; + + int cnt = 0; + for (auto up = base + 1; up <= maxNum && up <= dim; ++up) { + if (dim % up == 0) { + res[cnt++] = up; + break; + } + } + + for (auto down = base; down >= 1; --down) { + if (dim % down == 0) { + res[cnt++] = down; + if (cnt == sizeof(res) / sizeof(res[0])) { + break; + } + } + } +} + +void MatmulTilingAlgorithm::GetABL1KAlignValue(int32_t& kaAlignValue, int32_t& kbAlignValue) const +{ + kaAlignValue = 1; + kbAlignValue = 1; + if (tilingIns_->aType_.dataType == DataType::DT_FLOAT || tilingIns_->bType_.dataType == DataType::DT_FLOAT) { + // when in FP32 mode, k_a must be an even number if k-alignment is needed. So make ka_align_value as 2. + kaAlignValue = tilingIns_->aType_.isTrans ? 2 : 1; + // Same as previous one, make kb_align_value as 2 when k-alignment is needed + kbAlignValue = (tilingIns_->aType_.isTrans || !tilingIns_->bType_.isTrans) ? 2 : 1; + } +} + +void MatmulTilingAlgorithm::GetL0StatusFromParasCombo(L0StatusPack& l0Status, int32_t* parasCombo) const +{ + l0Status.InitLoadStatus(); + size_t kIdx = 0; + l0Status.dbL0A = parasCombo[kIdx++]; + l0Status.dbL0B = parasCombo[kIdx++]; + l0Status.dbL0C = parasCombo[kIdx++]; + l0Status.maxMk = parasCombo[kIdx++]; + l0Status.maxNk = parasCombo[kIdx++]; + l0Status.maxMn = parasCombo[kIdx++]; + l0Status.maxAxisIdx = parasCombo[kIdx++]; + l0Status.maxAxisNum = parasCombo[kIdx++]; + l0Status.maxAxisPnt = parasCombo[kIdx++]; + l0Status.maxN = parasCombo[kIdx++]; + l0Status.maxAxisPnt = min(l0Status.maxAxisPnt, l0Status.maxAxisNum); +} + +void MatmulTilingAlgorithm::SetResFactors(L0Factors& resFactors, const L0StatusPack& l0Status) const +{ + resFactors.finalML0 = l0Status.finalML0; + resFactors.finalKL0 = l0Status.finalKL0; + resFactors.finalNL0 = l0Status.finalNL0; + resFactors.finalLoadSize = l0Status.finalLoadSize; + resFactors.finalL0cUse = l0Status.finalL0cUse; + resFactors.finalMte1Loop = l0Status.finalMte1Loop; + resFactors.finalMul = l0Status.finalMul; + resFactors.finalMte1Cycles = l0Status.finalMte1Cycles; +} + +int32_t MatmulTilingAlgorithm::GetLoadSize(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status) const +{ + const bool al0FullLoad = + (static_cast(coreStatus.m * coreStatus.k) * static_cast(C0_SIZE * C0_BYTE_SIZE)) <= + static_cast(tilingIns_->bufferPool_.l0ASize); + const bool bl0FullLoad = + (static_cast(coreStatus.n * coreStatus.k) * static_cast(C0_SIZE * C0_BYTE_SIZE)) <= + static_cast(tilingIns_->bufferPool_.l0BSize); + const bool kFullLoad = (l0Status.kL0 >= coreStatus.k); + if (al0FullLoad || bl0FullLoad) { + return coreStatus.m + coreStatus.n; + } else if (kFullLoad) { + return min(coreStatus.n + MathUtil::CeilDivision(coreStatus.n, l0Status.nL0) * coreStatus.m, + coreStatus.m + MathUtil::CeilDivision(coreStatus.m, l0Status.mL0) * coreStatus.n); + } else { + return MathUtil::CeilDivision(coreStatus.m, l0Status.mL0) * coreStatus.n + + MathUtil::CeilDivision(coreStatus.n, l0Status.nL0) * coreStatus.m; + } +} + +bool MatmulTilingAlgorithm::CheckBaseMNKL1Size(SingleCoreStatus& singleCoreStatus) const +{ + L0StatusPack& l0Status = singleCoreStatus.l0Status; + int32_t a1Length = static_cast(l0Status.mL0 * l0Status.kL0 * C0_SIZE * C0_BYTE_SIZE); + int32_t b1Length = 
static_cast(l0Status.nL0 * l0Status.kL0 * C0_SIZE * C0_BYTE_SIZE); + int32_t biasLength = (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) + ? l0Status.nL0 * C0_SIZE * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType) + : 0; + int32_t dequantSize = 0; + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = l0Status.nL0 * C0_SIZE * UINT64_TYPES; + } + if (tilingIns_->aType_.pos == TPosition::TSCM) { + a1Length = 0; + } + if (tilingIns_->bType_.pos == TPosition::TSCM) { + b1Length = 0; + } + // Only V220/V300 bias uses L1 space. + if (tilingIns_->biasType_.pos == TPosition::TSCM || !tilingIns_->isBias) { + biasLength = 0; + } + const int32_t totalLength = a1Length + b1Length + biasLength + dequantSize; + return totalLength <= tilingIns_->bufferPool_.l1Size; +} + +bool MatmulTilingAlgorithm::CheckK0Align(int32_t k0) const +{ + if ((tilingIns_->aType_.dataType == DataType::DT_FLOAT && tilingIns_->aType_.type == CubeFormat::NZ && + tilingIns_->aType_.isTrans) || + (tilingIns_->bType_.dataType == DataType::DT_FLOAT && tilingIns_->bType_.type == CubeFormat::NZ && + !tilingIns_->bType_.isTrans)) { + return k0 % NUM_TWO == 0; + } + return true; +} + +void MatmulTilingAlgorithm::GetFinalMkn(SingleCoreStatus& singleCoreStatus, const CoreStatusPack& coreStatus, + const int32_t& k0, const int32_t& majorDimFactor, const int32_t& minorDimFactor) const +{ + if (k0 == 0) { + return; + } + L0StatusPack& l0Status = singleCoreStatus.l0Status; + if (l0Status.maxAxisIdx == 0) { + l0Status.mL0 = majorDimFactor; + l0Status.nL0 = minorDimFactor; + } else { + l0Status.mL0 = minorDimFactor; + l0Status.nL0 = majorDimFactor; + } + l0Status.kL0 = k0; + const float tmpL0cUse = static_cast(l0Status.mL0 * l0Status.nL0 * l0Status.dbL0C * + MIN_FRACTAL_SIZE * FP32_BYTES * 1.0 / tilingIns_->bufferPool_.l0CSize); + // NUM_TWO means L0A and L0B double buffer is default-on. + + const int32_t tmpMte1Cycle = + max(2 * 3, l0Status.mL0 * l0Status.kL0 * C0_SIZE * C0_BYTE_SIZE / MTE1_L0A_BANDWIDTH) + + max(2 * 3, l0Status.kL0 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE / MTE1_L0B_BANDWIDTH); + const int32_t tmpMadCycle = l0Status.mL0 * l0Status.kL0 * l0Status.nL0; // (m<=4 or n<=2:tmpMte1Cycle > tmpMadCycle) + const int32_t tmpLoadSize = GetLoadSize(coreStatus, l0Status); + // calculate load2d loop: A splitK for K loop; B split K for m loop as to V100 + const int32_t tmpMte1Loop = ((l0Status.nL0 != 1) ? l0Status.kL0 : 1) + ((l0Status.kL0 != 1) ? l0Status.mL0 : 1); + + const bool condition1 = l0Status.finalML0 == 0; + const bool condition2 = + (tmpLoadSize < l0Status.finalLoadSize) || (tmpMte1Cycle < tmpMadCycle && !l0Status.updateUsingMte1); + const bool condition3 = (tmpLoadSize == l0Status.finalLoadSize && tmpMadCycle > l0Status.finalMul && + tmpMadCycle * tmpL0cUse >= l0Status.finalMul * l0Status.finalL0cUse); + const bool condition4 = tmpMadCycle == l0Status.finalMul && tmpLoadSize == l0Status.finalLoadSize && + tmpMte1Loop < l0Status.finalMte1Loop; + // Considering pipeline parallelism between MTE1 and MAD + const bool condition5 = ((tmpMte1Cycle < tmpMadCycle && l0Status.updateUsingMte1) || !l0Status.updateUsingMte1); + const bool condition6 = CheckBaseMNKL1Size(singleCoreStatus); + int32_t lastReduceDim = (tilingIns_->aType_.dataType == DataType::DT_FLOAT || + tilingIns_->bType_.dataType == DataType::DT_FLOAT) ? 
FLOAT32_REDUCE_BLOCK_SIZE : REDUCE_BLOCK_SIZE; + + const bool condition7 = (tilingIns_->baseN != -1) || (!(coreStatus.n >= lastReduceDim && l0Status.nL0 < + lastReduceDim)); + + const bool condition8 = CheckK0Align(l0Status.kL0); + + const bool validL0 = (condition1 || condition2 || condition3 || condition4) && condition5 && + condition6 && condition7 && condition8; + if (validL0) { + l0Status.finalML0 = l0Status.mL0; + l0Status.finalKL0 = l0Status.kL0; + l0Status.finalNL0 = l0Status.nL0; + l0Status.finalLoadSize = tmpLoadSize; + l0Status.finalL0cUse = tmpL0cUse; + l0Status.finalMul = tmpMadCycle; + l0Status.finalMte1Cycles = tmpMte1Cycle; + l0Status.finalMte1Loop = tmpMte1Loop; + l0Status.updateUsingMte1 = l0Status.updateUsingMte1 || (tmpMte1Cycle < tmpMadCycle); + } +} + +void MatmulTilingAlgorithm::GetL0bAlign(std::vector& factors) const +{ + constexpr int32_t alignSize = 2; + if (DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) || + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT4)) { + for (auto& num : factors) { + num = MathUtil::Align(num, alignSize); + } + } + return; +} + +void MatmulTilingAlgorithm::GetL0FactorsCand(L0Factors& resFactors, const CoreStatusPack& coreStatus, + SingleCoreStatus& singleCoreStatus, int32_t* parasCombo, const MatmulRunParas& param) const +{ + (void)(param); + L0StatusPack& l0Status = singleCoreStatus.l0Status; + GetL0StatusFromParasCombo(l0Status, parasCombo); + int32_t l0bAlignSize = 1; + if (DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) || + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT4)) { + l0bAlignSize = L0B_ALIGN_SIZE; + } + int32_t majorDim = coreStatus.m; + int32_t minorDim = MathUtil::Align(coreStatus.n, l0bAlignSize); + int32_t majorDimK = l0Status.maxMk; + int32_t minorDimK = l0Status.maxNk; + int32_t maxN = l0Status.maxN; + int32_t dimFactors[2] = {0}; + if (l0Status.maxAxisIdx != 0) { + majorDim = MathUtil::Align(coreStatus.n, l0bAlignSize); + minorDim = coreStatus.m; + majorDimK = l0Status.maxNk; + minorDimK = l0Status.maxMk; + } + + std::vector majorDimFactors(DIM_FACTOR_LENGTH, 0); + if (tilingIns_->baseN != -1 && l0Status.maxAxisIdx != 0) { + majorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseN, C0_SIZE); + } else if (tilingIns_->baseM != -1 && l0Status.maxAxisIdx == 0) { + majorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseM, C0_SIZE); + } else { + // n dim condition + if (l0Status.maxAxisIdx != 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + GetTwoFactors(dimFactors, min(l0Status.maxAxisPnt, maxN), majorDim, min(l0Status.maxAxisNum, maxN)); + } else { + GetTwoFactors(dimFactors, l0Status.maxAxisPnt, majorDim, l0Status.maxAxisNum); + } + majorDimFactors[0] = dimFactors[0]; + majorDimFactors[1] = dimFactors[1]; + const int32_t majorAmend = GetBestValue(majorDim); + if (l0Status.maxAxisIdx != 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + GetTwoFactors(dimFactors, min(l0Status.maxAxisPnt, maxN), majorAmend, min(l0Status.maxAxisNum, maxN)); + } else { + GetTwoFactors(dimFactors, l0Status.maxAxisPnt, majorAmend, l0Status.maxAxisNum); + } + majorDimFactors[IDX_TWO] = dimFactors[0]; + majorDimFactors[IDX_THREE] = dimFactors[1]; + if (l0Status.maxAxisIdx != 0) { + GetL0bAlign(majorDimFactors); + } + } + sort(majorDimFactors.rbegin(), majorDimFactors.rend()); + majorDimFactors.erase(unique(majorDimFactors.begin(), majorDimFactors.end()), 
majorDimFactors.end()); + for (auto& majorDimFactor : majorDimFactors) { + if (majorDimFactor == 0 || majorDimFactor > l0Status.maxMn || majorDimFactor > majorDimK || + majorDimFactor > majorDim) { + continue; + } + const int32_t minorFactorMax = min(l0Status.maxMn / majorDimFactor, minorDimK); + std::vector minorDimFactors(DIM_FACTOR_LENGTH, 0); + if (tilingIns_->baseN != -1 && l0Status.maxAxisIdx == 0) { + minorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseN, C0_SIZE); + } else if (tilingIns_->baseM != -1 && l0Status.maxAxisIdx != 0) { + minorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseM, C0_SIZE); + } else { + if (l0Status.maxAxisIdx == 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + GetTwoFactors(dimFactors, min(minorFactorMax, maxN), minorDim, min(minorFactorMax, maxN)); + } else { + GetTwoFactors(dimFactors, minorFactorMax, minorDim, minorFactorMax); + } + minorDimFactors[0] = dimFactors[0]; + minorDimFactors[1] = dimFactors[1]; + const int32_t minorAmend = GetBestValue(minorDim); + if (l0Status.maxAxisIdx == 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + GetTwoFactors(dimFactors, min(minorFactorMax, maxN), minorAmend, min(minorFactorMax, maxN)); + } else { + GetTwoFactors(dimFactors, minorFactorMax, minorAmend, minorFactorMax); + } + minorDimFactors[IDX_TWO] = dimFactors[0]; + minorDimFactors[IDX_THREE] = dimFactors[1]; + if (l0Status.maxAxisIdx == 0) { + GetL0bAlign(minorDimFactors); + } + } + sort(minorDimFactors.rbegin(), minorDimFactors.rend()); + minorDimFactors.erase(unique(minorDimFactors.begin(), minorDimFactors.end()), minorDimFactors.end()); + for (auto& minorDimFactor : minorDimFactors) { + if (minorDimFactor == 0 || minorDimFactor * majorDimFactor > l0Status.maxMn || minorDimFactor > minorDimK || + (minorDimFactor > minorDim) || (minorDimFactor > majorDimK)) { + continue; + } + // consider bias table buffer + constexpr int32_t maxN0 = 64; + // in V220/V300, consider bias table buffer limit + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { + maxN = tilingIns_->bufferPool_.btSize / C0_SIZE / FP32_BYTES / l0Status.dbL0C; + } + if (l0Status.maxAxisIdx != 0) { + // Major is n0 axis + if ((majorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + continue; + } + } else { + // Major is m0 axis + if ((minorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + continue; + } + } + + const int32_t k0Max = min(majorDimK / majorDimFactor, minorDimK / minorDimFactor); + std::vector k0Factors(DIM_FACTOR_LENGTH, 0); + GetTwoFactors(dimFactors, k0Max, coreStatus.k, k0Max); + k0Factors[0] = dimFactors[0]; + k0Factors[1] = dimFactors[1]; + const int32_t kAmend = GetBestValue(coreStatus.k); + GetTwoFactors(dimFactors, k0Max, kAmend, l0Status.maxAxisNum); + k0Factors[IDX_TWO] = dimFactors[0]; + k0Factors[IDX_THREE] = dimFactors[1]; + sort(k0Factors.rbegin(), k0Factors.rend()); + k0Factors.erase(unique(k0Factors.begin(), k0Factors.end()), k0Factors.end()); + for (auto& k0 : k0Factors) { + if (k0 == 0 || minorDimFactor * k0 > minorDimK || majorDimFactor * k0 > majorDimK) { + continue; + } + + // Check if the buffer size allocated exceed the hardware buffer size in Float Mode + if (tilingIns_->aType_.dataType == DataType::DT_FLOAT) { + int32_t mL0 = majorDimFactor; + int32_t nL0 = minorDimFactor; + if (l0Status.maxAxisIdx != 0) { + nL0 = majorDimFactor; + mL0 = minorDimFactor; + } + + const int32_t 
l0aBufferSize = tilingIns_->aType_.isTrans ? + MathUtil::Align(k0, 2) * C0_BYTE_SIZE * mL0 * C0_SIZE * DB_ON : + k0 * C0_BYTE_SIZE * mL0 * C0_SIZE * DB_ON; + const int32_t l0bBufferSize = (tilingIns_->aType_.isTrans || !tilingIns_->bType_.isTrans) ? + MathUtil::Align(k0, 2) * C0_BYTE_SIZE * nL0 * C0_SIZE * DB_ON : + k0 * C0_BYTE_SIZE * nL0 * C0_SIZE * DB_ON; + if (l0aBufferSize > tilingIns_->bufferPool_.l0ASize || l0bBufferSize > tilingIns_->bufferPool_.l0BSize) { + continue; + } + } else if (DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) || + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT4)) { + int32_t mL0 = majorDimFactor; + int32_t nL0 = minorDimFactor; + if (l0Status.maxAxisIdx != 0) { + nL0 = majorDimFactor; + mL0 = minorDimFactor; + } + + const int32_t l0aBufferSize = tilingIns_->aType_.isTrans ? + k0 * C0_BYTE_SIZE * MathUtil::Align(mL0, 2) * C0_SIZE * DB_ON : + k0 * C0_BYTE_SIZE * mL0 * C0_SIZE * DB_ON; + int32_t l0bBufferSize = (tilingIns_->bType_.isTrans) ? + k0 * C0_BYTE_SIZE * nL0 * C0_SIZE * DB_ON : + k0 * C0_BYTE_SIZE * MathUtil::Align(nL0, 2) * C0_SIZE * DB_ON; + if (l0aBufferSize > tilingIns_->bufferPool_.l0ASize || + l0bBufferSize > tilingIns_->bufferPool_.l0BSize) { + continue; + } + } + GetFinalMkn(singleCoreStatus, coreStatus, k0, majorDimFactor, minorDimFactor); + } + } + } + if (l0Status.finalML0 != 0 && l0Status.finalKL0 != 0 && l0Status.finalNL0 != 0) { + SetResFactors(resFactors, l0Status); + } +} + +MKNParasCombo MatmulTilingAlgorithm::GetParasCombo(const int32_t& index, const MatmulRunParas& param) const +{ + (void)(param); + std::map parasComboMap; + const int32_t mnMax = tilingIns_->bufferPool_.l0CSize / (C0_SIZE * C0_SIZE) / FP32_BYTES; + int32_t maxN = 64; + // in V220/V300, consider bias table buffer limit + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { + maxN = tilingIns_->bufferPool_.btSize / C0_SIZE / FP32_BYTES; + } + const bool biasBt = tilingIns_->isSupportL0c2Out && (tilingIns_->isBias); + const int32_t leftSize = min(tilingIns_->bufferPool_.l1Size, tilingIns_->bufferPool_.l0ASize / dbL0A_); + const int32_t rightSize = min(tilingIns_->bufferPool_.l1Size, tilingIns_->bufferPool_.l0BSize / dbL0B_); + const int32_t maxMk = tilingIns_->aType_.pos == TPosition::TSCM ? 64 : (leftSize / C0_SIZE / C0_BYTE_SIZE); + const int32_t maxNK = tilingIns_->bType_.pos == TPosition::TSCM ? 64 : (rightSize / C0_SIZE / C0_BYTE_SIZE); + // dbL0A, dbL0B, dbL0C, maxMk, maxNk, maxMn, maxAxisIdx, maxAxisNum, maxAxisPnt, maxN + MKNParasCombo comboZero = { 2, 2, 2, maxMk, maxNK, mnMax / DB_ON, 0, 64, 8, biasBt ? maxN / DB_ON : 64 }; + MKNParasCombo comboOne = { dbL0A_, dbL0B_, 1, maxMk, maxNK, mnMax, 0, 64, 11, biasBt ? 
maxN : 64 }; + parasComboMap = { { 0, comboZero }, { 1, comboOne } }; + + return parasComboMap[index]; +} + +void MatmulTilingAlgorithm::GetL0cDB(const L0Factors (&resFactors)[L0PARAS_COMBO_LEN], const CoreStatusPack& coreStatus, + L0StatusPack& l0Status) const +{ + const int32_t dbAOnBOnCOnIdx = 0; + const int32_t dbAOnBOnCOffIdx = 1; + // check both L0C utilization and loadsize to control LOC LOA LOB DB + const int32_t m0L0cDbOn = resFactors[dbAOnBOnCOnIdx].finalML0; + const int32_t k0L0cDbOn = resFactors[dbAOnBOnCOnIdx].finalKL0; + const int32_t n0L0cDbOn = resFactors[dbAOnBOnCOnIdx].finalNL0; + const int32_t loadSizeL0cDbOn = resFactors[dbAOnBOnCOnIdx].finalLoadSize; + const int32_t mte1CyclesL0cDbOn = resFactors[dbAOnBOnCOnIdx].finalMte1Cycles; + + const int32_t m0L0cDbOff = resFactors[dbAOnBOnCOffIdx].finalML0; + const int32_t k0L0cDbOff = resFactors[dbAOnBOnCOffIdx].finalKL0; + const int32_t n0L0cDbOff = resFactors[dbAOnBOnCOffIdx].finalNL0; + const int32_t loadSizeL0cDbOff = resFactors[dbAOnBOnCOffIdx].finalLoadSize; + const int32_t mte1CyclesL0cDbOff = resFactors[dbAOnBOnCOffIdx].finalMte1Cycles; + + const int32_t mte3CostDbOn = + m0L0cDbOn * n0L0cDbOn * MIN_FRACTAL_SIZE * FP16_BYTES * 1 / MTE1_FIXPIPE_BANDWIDTH; + const int32_t mte3CostDbOff = + m0L0cDbOff * n0L0cDbOff * MIN_FRACTAL_SIZE * FP16_BYTES * 1 / MTE1_FIXPIPE_BANDWIDTH; + + const int32_t madCylesDbOn = max(m0L0cDbOn * k0L0cDbOn * n0L0cDbOn, static_cast(mte1CyclesL0cDbOn * 0.7)); + const int32_t madCylesDbOff = + max(m0L0cDbOff * k0L0cDbOff * n0L0cDbOff, static_cast(mte1CyclesL0cDbOff * 0.7)); + int32_t dbOnPipeTime = MathUtil::CeilDivision(coreStatus.m, m0L0cDbOn) * + MathUtil::CeilDivision(coreStatus.n, n0L0cDbOn) * + ((MathUtil::CeilDivision(coreStatus.k, k0L0cDbOn) - 1) * madCylesDbOn + max(madCylesDbOn, mte3CostDbOn)); + int32_t dbOffPipeTime = MathUtil::CeilDivision(coreStatus.m, m0L0cDbOff) * + MathUtil::CeilDivision(coreStatus.n, n0L0cDbOff) * + (MathUtil::CeilDivision(coreStatus.k, k0L0cDbOff) * madCylesDbOff + mte3CostDbOff); + dbOnPipeTime = dbOnPipeTime == 0 ? INT32_MAX : dbOnPipeTime; + dbOffPipeTime = dbOffPipeTime == 0 ? 
INT32_MAX : dbOffPipeTime; + + if ((dbOffPipeTime < dbOnPipeTime) || (loadSizeL0cDbOff < loadSizeL0cDbOn)) { + l0Status.dbL0C = 1; + l0Status.dbL0A = dbL0A_; + l0Status.dbL0B = dbL0B_; + l0Status.mL0 = m0L0cDbOff; + l0Status.kL0 = k0L0cDbOff; + l0Status.nL0 = n0L0cDbOff; + } else { + l0Status.dbL0C = DB_ON; + l0Status.dbL0A = dbL0A_; + l0Status.dbL0B = dbL0B_; + l0Status.mL0 = m0L0cDbOn; + l0Status.kL0 = k0L0cDbOn; + l0Status.nL0 = n0L0cDbOn; + } +} + +void MatmulTilingAlgorithm::GetL0Factors(const std::string& opType, const MatmulRunParas& param, + const CoreStatusPack& coreStatus, SingleCoreStatus& singleCoreStatus) const +{ + (void)(opType); + // get mL0, nL0, kL0 factor when singlecore m, n, k is know + // mL0, nL0, kL0 is a factor of single core m, n, k + L0StatusPack& l0Status = singleCoreStatus.l0Status; + if (tilingIns_->isBias) { + l0Status.dtypeBias = DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); + } + L0Factors resFactors[L0PARAS_COMBO_LEN]; + for (int32_t i = 0; i < L0PARAS_COMBO_LEN; ++i) { + if (i == 0 && g_tempCfg.l0cDB == DB_OFF) { + continue; + } + MKNParasCombo mknParasCombo = GetParasCombo(i, param); + for (int32_t j = 0; j < L0PARAS_COMBO_LEN; ++j) { + mknParasCombo.parasCombo[IDX_SIX] = j; + GetL0FactorsCand(resFactors[i], coreStatus, singleCoreStatus, mknParasCombo.parasCombo, param); + } + } + if (g_tempCfg.l0cDB == DB_OFF) { + l0Status.dbL0C = DB_OFF; + l0Status.dbL0A = dbL0A_; + l0Status.dbL0B = dbL0B_; + l0Status.mL0 = resFactors[1].finalML0; + l0Status.kL0 = resFactors[1].finalKL0; + l0Status.nL0 = resFactors[1].finalNL0; + } else { + GetL0cDB(resFactors, coreStatus, l0Status); + } +} + +bool MatmulTilingAlgorithm::IsNeedAlign(bool isA) const +{ + if (isA) { + return tilingIns_->aType_.dataType == DataType::DT_FLOAT || (tilingIns_->aType_.dataType == DataType::DT_INT8 && tilingIns_->aType_.isTrans); + } else { + return tilingIns_->bType_.dataType == DataType::DT_FLOAT || (tilingIns_->bType_.dataType == DataType::DT_INT8 && !tilingIns_->bType_.isTrans); + } +} + +int32_t MatmulTilingAlgorithm::GetL1Size(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const +{ + int32_t curAL1Size = 0; + int32_t curBL1Size = 0; + int32_t channelWiseL1Size = 0; + int32_t aL1Const = C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1; + if (IsNeedAlign(true)) { + aL1Const *= NUM_TWO; + } + // 5/8 means 1/2(B Matrix size) + 1/8(Index Matrix size) + int32_t bL1Const = tilingIns_->isSparse_ ? 
C0_SIZE * (C0_BYTE_SIZE / 8) * 5 * l1Status.dbBL1 : + C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; + if (IsNeedAlign(false)) { + bL1Const *= NUM_TWO; + } + const int32_t channelWiseL1Const = l1Status.channelWiseTimes * C0_SIZE * l1Status.dbBL1 * l0Status.dtypeBias; + int32_t dequantSize = 0; + + int32_t kaAlignValue = 1; + int32_t kbAlignValue = 1; + GetABL1KAlignValue(kaAlignValue, kbAlignValue); + if (!MathUtil::CheckMulOverflow(l1Status.mAL1, l0Status.mL0, curAL1Size) || + !MathUtil::CheckMulOverflow(curAL1Size, aL1Const, curAL1Size) || + !MathUtil::CheckMulOverflow(curAL1Size, MathUtil::Align(l1Status.kAL1, kaAlignValue), curAL1Size)) { + return 0; + } + if (!MathUtil::CheckMulOverflow(l1Status.nBL1, l0Status.nL0, curBL1Size) || + !MathUtil::CheckMulOverflow(curBL1Size, bL1Const, curBL1Size) || + !MathUtil::CheckMulOverflow(curBL1Size, MathUtil::Align(l1Status.kBL1, kbAlignValue), curBL1Size)) { + return 0; + } + + if (l1Status.channelWiseTimes > 0) { + if (!MathUtil::CheckMulOverflow(l1Status.nBL1, l0Status.nL0, channelWiseL1Size) || + !MathUtil::CheckMulOverflow(channelWiseL1Size, channelWiseL1Const, channelWiseL1Size)) { + return 0; + } + } + + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; + } + + if (tilingIns_->aType_.pos == TPosition::TSCM) { + curAL1Size = 0; + } + if (tilingIns_->bType_.pos == TPosition::TSCM) { + curBL1Size = 0; + } + if (tilingIns_->biasType_.pos == TPosition::TSCM) { + channelWiseL1Size = 0; + } + + const int64_t totalSize = static_cast(curAL1Size) + static_cast(curBL1Size) + + static_cast(channelWiseL1Size) + static_cast(dequantSize); + return totalSize > INT_MAX ? INT_MAX : static_cast(totalSize); +} + +int32_t MatmulTilingAlgorithm::CalL1MaxLen(int32_t resL1Size, L1StatusPack& l1Status, const L0StatusPack& l0Status, + const int32_t alignValue, const L1TilingType axisName) const +{ + int32_t axisMaxLen = 1; + if (axisName == L1TilingType::KAL1_16) { + axisMaxLen = resL1Size / + (l1Status.mAL1 * l0Status.mL0 * l1Status.dbAL1 * C0_SIZE * C0_BYTE_SIZE); + } + if (axisName == L1TilingType::KBL1_16) { + axisMaxLen = resL1Size / + (l1Status.nBL1 * l0Status.nL0 * l1Status.dbBL1 * C0_SIZE * C0_BYTE_SIZE); + } + axisMaxLen = MathUtil::AlignDown(axisMaxLen, alignValue); + if (axisName == L1TilingType::M_AL1) { + axisMaxLen = resL1Size / (MathUtil::Align(l1Status.kAL1, alignValue) * l0Status.mL0 * l1Status.dbAL1 * C0_SIZE * + C0_BYTE_SIZE); + } + if (axisName == L1TilingType::N_BL1) { + axisMaxLen = resL1Size / (MathUtil::Align(l1Status.kBL1, alignValue) * l0Status.nL0 * l1Status.dbBL1 * C0_SIZE * + C0_BYTE_SIZE + l1Status.channelWiseTimes * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE); + } + return axisMaxLen; +} + +/* + brief: + if factor greater then base, then factor = base + if factor less than base, then get thr max factor of base, i.e. 
base 10, factor 9, then res factor = 5 +*/ +void MatmulTilingAlgorithm::GetNearestFactor(const int32_t& base, int32_t& factor, int32_t capValue) const +{ + if (!g_tempCfg.factorSplit) { + return; + } + if (capValue == INT32_MAX) { + capValue = base; + } + while ((factor > capValue) || (factor > 0 && base % factor != 0)) { + factor--; + } +} + +void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const +{ + // if b martix in L1, then b matrix must full load, goto b matrix full load patch + if (tilingIns_->bType_.pos == TPosition::TSCM) { + return; + } + const int32_t mRepeat = MathUtil::CeilDivision(coreStatus.m, l0Status.mL0); + const int32_t nRepeat = MathUtil::CeilDivision(coreStatus.n, l0Status.nL0); + int32_t kaAlignValue = 1; + int32_t kbAlignValue = 1; + GetABL1KAlignValue(kaAlignValue, kbAlignValue); + l1Status.kAL1 = MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0; + const int32_t curL1Size = GetL1Size(l1Status, l0Status); + const int32_t a1Length = GetAL1UbSize(l1Status, l0Status); + if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size && a1Length < tilingIns_->bufferPool_.ubSize) { + l1Status.aL1FullLoad = true; + l1Status.aL1Size = + max(MathUtil::Align(coreStatus.k, kaAlignValue), MathUtil::Align(l1Status.kAL1, kaAlignValue)) * + max(l1Status.mAL1 * l0Status.mL0, coreStatus.m) * C0_SIZE * C0_BYTE_SIZE; + if (tilingIns_->aType_.pos == TPosition::TSCM) { + l1Status.bL1Size = tilingIns_->bufferPool_.l1Size; + } else { + l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; + } + if (g_tempCfg.l1DB == DB_ON) { + l1Status.dbBL1 = DB_ON; + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { + l1Status.dbBL1 = DB_OFF; + } + } + const int32_t biasSize = + l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; + int32_t dequantSize = 0; + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; + } + l1Status.kBL1 = min(CalL1MaxLen((l1Status.bL1Size - biasSize - dequantSize), l1Status, l0Status, kbAlignValue, + L1TilingType::KBL1_16), + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.dbBL1 = DB_OFF; + const int32_t b1Length = tilingIns_->bufferPool_.ubSize - a1Length; + l1Status.kBL1 = min(CalL1MaxLen(min(l1Status.bL1Size - biasSize - dequantSize, b1Length), + l1Status, l0Status, kbAlignValue, L1TilingType::KBL1_16), coreStatus.k); + } + l1Status.bL1Times = min(l1Status.kBL1 / l0Status.kL0, l1Status.maxKBL1); + GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); // tik-mm support no factor---ncheck + l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; + if (l1Status.kBL1 == coreStatus.k) { + l1Status.nBL1 = min(CalL1MaxLen(l1Status.bL1Size, l1Status, l0Status, kbAlignValue, L1TilingType::N_BL1), + l1Status.maxNBL1); + GetNearestFactor(nRepeat, l1Status.nBL1); + } + + const bool invalidL1Status = (l1Status.nBL1 == 0 || l1Status.kBL1 == 0) ? true : false; + const int32_t possibleMRepeat = (l1Status.kBL1 == coreStatus.k) ? 1 : mRepeat; + // m+n*m_repeat XXX---ncheck + l1Status.loadSize = invalidL1Status ? + INT32_MAX : + ((tilingIns_->aType_.pos == TPosition::TSCM ? 
0 : coreStatus.m) + possibleMRepeat * coreStatus.n); + if (g_tempCfg.l1DB == DB_ON && l1Status.kBL1 == coreStatus.k && l1Status.nBL1 * l0Status.nL0 == coreStatus.n) { + l1Status.dbBL1 = DB_OFF; + } + res[IDX_ONE][IDX_ZERO] = l1Status.kAL1; + res[IDX_ONE][IDX_ONE] = l1Status.mAL1; + res[IDX_ONE][IDX_TWO] = l1Status.dbAL1; + res[IDX_ONE][IDX_THREE] = l1Status.kBL1; + res[IDX_ONE][IDX_FOUR] = l1Status.nBL1; + res[IDX_ONE][IDX_FIVE] = l1Status.dbBL1; + res[IDX_ONE][IDX_SIX] = l1Status.loadSize; + } +} + +void MatmulTilingAlgorithm::L1StatusBl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const +{ + // if a martix in L1, then a matrix must full load, goto a matrix full load patch + if (tilingIns_->aType_.pos == TPosition::TSCM) { + return; + } + const int32_t mRepeat = MathUtil::CeilDivision(coreStatus.m, l0Status.mL0); + const int32_t nRepeat = MathUtil::CeilDivision(coreStatus.n, l0Status.nL0); + int32_t kaAlignValue = 1; + int32_t kbAlignValue = 1; + GetABL1KAlignValue(kaAlignValue, kbAlignValue); + l1Status.kBL1 = MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0; + const int32_t curL1Size = GetL1Size(l1Status, l0Status); + const int32_t b1Length = GetBL1UbSize(l1Status, l0Status); + if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size && b1Length < tilingIns_->bufferPool_.ubSize) { + l1Status.bL1FullLoad = true; + l1Status.bL1Size = + max(MathUtil::Align(coreStatus.k, kbAlignValue), MathUtil::Align(l1Status.kBL1, kbAlignValue)) * + max(l1Status.nBL1 * l0Status.nL0, coreStatus.n) * C0_SIZE * C0_BYTE_SIZE; + if (tilingIns_->bType_.pos == TPosition::TSCM) { + l1Status.aL1Size = tilingIns_->bufferPool_.l1Size; + } else { + l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; + } + if (g_tempCfg.l1DB == DB_ON) { + l1Status.dbAL1 = DB_ON; + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { + l1Status.dbAL1 = DB_OFF; + } + } + int32_t dequantSize = 0; + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; + } + const int32_t biasSize = + l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; + l1Status.kAL1 = min(CalL1MaxLen((l1Status.aL1Size - biasSize - dequantSize), l1Status, l0Status, kaAlignValue, + L1TilingType::KAL1_16), + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.dbAL1 = DB_OFF; + const int32_t a1Length = tilingIns_->bufferPool_.ubSize - b1Length; + l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), + l1Status, l0Status, kaAlignValue, L1TilingType::KAL1_16), coreStatus.k); + } + l1Status.aL1Times = min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1); + GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); // tik-mm support no factor---ncheck + l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; + if (l1Status.kAL1 == coreStatus.k) { + l1Status.mAL1 = + min(CalL1MaxLen(l1Status.aL1Size - biasSize, l1Status, l0Status, kaAlignValue, L1TilingType::M_AL1), + l1Status.maxMAL1); + GetNearestFactor(mRepeat, l1Status.mAL1); // tik-mm support no factor---ncheck + } + + const bool invalidL1Status = (l1Status.mAL1 == 0 || l1Status.kAL1 == 0) ? true : false; + const int32_t possibleNRepeat = (l1Status.kAL1 == coreStatus.k) ? 1 : nRepeat; // no repeat---ncheck + l1Status.loadSize = invalidL1Status ? + INT32_MAX : + ((tilingIns_->bType_.pos == TPosition::TSCM ? 
0 : coreStatus.n) + possibleNRepeat * coreStatus.m); + if (g_tempCfg.l1DB == DB_ON && l1Status.kAL1 == coreStatus.k && l1Status.mAL1 * l0Status.mL0 >= coreStatus.m) { + l1Status.dbAL1 = DB_OFF; + } + res[IDX_TWO][IDX_ZERO] = l1Status.kAL1; + res[IDX_TWO][IDX_ONE] = l1Status.mAL1; + res[IDX_TWO][IDX_TWO] = l1Status.dbAL1; + res[IDX_TWO][IDX_THREE] = l1Status.kBL1; + res[IDX_TWO][IDX_FOUR] = l1Status.nBL1; + res[IDX_TWO][IDX_FIVE] = l1Status.dbBL1; + res[IDX_TWO][IDX_SIX] = l1Status.loadSize; + } +} + +void MatmulTilingAlgorithm::L1StatusBothFullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const +{ + l1Status.kAL1 = MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0; + l1Status.kBL1 = MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0; + const int32_t curL1Size = GetL1Size(l1Status, l0Status); + const int32_t a1Length = GetAL1UbSize(l1Status, l0Status); + const int32_t b1Length = GetBL1UbSize(l1Status, l0Status); + if (tilingIns_->aType_.pos == TPosition::TSCM && tilingIns_->bType_.pos == TPosition::TSCM) { + l1Status.mAL1 = 1; + l1Status.nBL1 = 1; + } + if (((curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size) && + (a1Length + b1Length) <= tilingIns_->bufferPool_.ubSize) || + (tilingIns_->aType_.pos == TPosition::TSCM && tilingIns_->bType_.pos == TPosition::TSCM)) { + l1Status.bothFullLoad = true; + l1Status.loadSize = (tilingIns_->aType_.pos == TPosition::TSCM ? 0 : coreStatus.m) + + (tilingIns_->bType_.pos == TPosition::TSCM ? 0 : coreStatus.n); + res[IDX_ZERO][IDX_ZERO] = l1Status.kAL1; + res[IDX_ZERO][IDX_ONE] = l1Status.mAL1; + res[IDX_ZERO][IDX_TWO] = l1Status.dbAL1; + res[IDX_ZERO][IDX_THREE] = l1Status.kBL1; + res[IDX_ZERO][IDX_FOUR] = l1Status.nBL1; + res[IDX_ZERO][IDX_FIVE] = l1Status.dbBL1; + res[IDX_ZERO][IDX_SIX] = l1Status.loadSize; + } +} +void MatmulTilingAlgorithm::NeitherFullLoadDb(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status, const int32_t& kbl1Db) const +{ + const int32_t tmpKbl116 = l1Status.kBL1; + l1Status.kBL1 = kbl1Db; + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + (GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status)) > tilingIns_->bufferPool_.ubSize) { + l1Status.dbBL1 = DB_OFF; + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { + l1Status.dbAL1 = DB_OFF; + } + } + l1Status.kBL1 = coreStatus.k; + const bool bothDoubleBuffer = coreStatus.m != l0Status.mL0 && coreStatus.k > l0Status.kL0 && + (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + (GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status)) > tilingIns_->bufferPool_.ubSize); + l1Status.kBL1 = tmpKbl116; + if (bothDoubleBuffer) { + l1Status.dbAL1 = DB_ON; + l1Status.dbBL1 = DB_ON; + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + (GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status)) > tilingIns_->bufferPool_.ubSize) { + l1Status.dbBL1 = DB_OFF; + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { + l1Status.dbAL1 = DB_OFF; + } + } + } +} \ No newline at end of file -- Gitee From c90f93e2f370be96e82de8202cc640569b9fce63 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 
02:10:26 +0000 Subject: [PATCH 44/56] add Signed-off-by: jiangchengcheng-on --- .../tiling/matmul_tiling_algorithm_new.cpp | 661 ++++++++++++++++++ 1 file changed, 661 insertions(+) diff --git a/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp b/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp index aaa75b30..70db7cfc 100644 --- a/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp +++ b/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp @@ -892,4 +892,665 @@ void MatmulTilingAlgorithm::NeitherFullLoadDb(const CoreStatusPack& coreStatus, } } } +} +void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status) const +{ + const int32_t mRepeat = MathUtil::CeilDivision(coreStatus.m, l0Status.mL0); + int32_t nRepeat = MathUtil::CeilDivision(coreStatus.n, l0Status.nL0); + if (l0Status.dtypeBias == FP32_BYTES && l1Status.channelWiseTimes > 0) { + l1Status.channelWiseTimes++; + } + int32_t biasSize = l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * FP16_BYTES * l1Status.dbBL1; + int32_t dequantSize = 0; + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; + } + // Align value is used in FP32 in FP32 out data flow mode + int32_t kaAlignValue = 1; + int32_t kbAlignValue = 1; + GetABL1KAlignValue(kaAlignValue, kbAlignValue); + L1StatusPack l1Mfirst; + L1StatusPack l1Nfirst; + errno_t err = + memcpy_s(static_cast(&l1Mfirst), sizeof(l1Mfirst), static_cast(&l1Status), sizeof(l1Mfirst)); + if (err != EOK) { + TILING_LOG_ERROR("memcpy error"); + return; + } + err = memcpy_s(static_cast(&l1Nfirst), sizeof(l1Nfirst), static_cast(&l1Status), sizeof(l1Nfirst)); + if (err != EOK) { + TILING_LOG_ERROR("memcpy error"); + } + // default l1Status.nBL1 = 1 + // calculate M first condition + l1Mfirst.bL1Size = MathUtil::Align(l1Mfirst.kBL1, kbAlignValue) * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * + l1Mfirst.dbBL1; + l1Mfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.bL1Size; + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Mfirst, l0Status); + l1Mfirst.mAL1 = max(min(min( + CalL1MaxLen(l1Mfirst.aL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + l1Mfirst.maxMAL1), + mRepeat), + 1); + if (IsUbNd2Nz()) { + l1Mfirst.mAL1 = max(min(min( + CalL1MaxLen(min(l1Mfirst.aL1Size - biasSize - dequantSize, a1Length), l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + l1Mfirst.maxMAL1), + mRepeat), + 1); + } + GetNearestFactor(mRepeat, l1Mfirst.mAL1); // tik-mm support no factor ----ncheck + l1Mfirst.aL1Size = MathUtil::Align(l1Mfirst.kAL1, kaAlignValue) * l1Mfirst.mAL1 * l0Status.mL0 * C0_SIZE * + C0_BYTE_SIZE * l1Mfirst.dbAL1; + l1Mfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.aL1Size; + int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Mfirst, l0Status); + l1Mfirst.nBL1 = max(min(min( + CalL1MaxLen(l1Mfirst.bL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + l1Mfirst.maxNBL1), + nRepeat), + 1); + if (IsUbNd2Nz()) { + l1Mfirst.nBL1 = max(min(min( + CalL1MaxLen(min(l1Mfirst.bL1Size - biasSize - dequantSize, b1Length), l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + l1Mfirst.maxNBL1), + nRepeat), + 1); + } + GetNearestFactor(nRepeat, l1Mfirst.nBL1); + l1Mfirst.loadSize = + coreStatus.m + coreStatus.n * MathUtil::CeilDivision(coreStatus.m, l1Mfirst.mAL1 * l0Status.mL0); + + // calculate N first condition + 
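+    // The N-first candidate below mirrors the M-first candidate just computed, but chooses nBL1
+    // before mAL1 (the M-first path above chose mAL1 first). When both kAL1 and kBL1 cover the
+    // full k dimension, the comparison further below keeps whichever candidate has the smaller
+    // loadSize, i.e. fewer total blocks moved in from GM.
+    // Illustrative sketch with assumed values (not taken from any real tiling case):
+    //   coreStatus.m = 8, coreStatus.n = 16, mAL1 * mL0 = 4, nBL1 * nL0 = 8
+    //   M-first: loadSize = m + n * ceil(m / (mAL1 * mL0)) = 8 + 16 * 2 = 40
+    //   N-first: loadSize = m * ceil(n / (nBL1 * nL0)) + n = 8 * 2 + 16 = 32
+    //   so in this example the N-first factors would be copied into l1Status.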
l1Nfirst.aL1Size = MathUtil::Align(l1Nfirst.kAL1, kaAlignValue) * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * + l1Nfirst.dbAL1; + l1Nfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.aL1Size; + b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Nfirst, l0Status); + l1Nfirst.nBL1 = max(min(min( + CalL1MaxLen(l1Nfirst.bL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + l1Nfirst.maxNBL1), + nRepeat), + 1); + if (IsUbNd2Nz()) { + l1Nfirst.nBL1 = max(min(min( + CalL1MaxLen(min(l1Nfirst.bL1Size - biasSize - dequantSize, b1Length), l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + l1Nfirst.maxNBL1), + nRepeat), + 1); + } + GetNearestFactor(nRepeat, l1Nfirst.nBL1); + l1Nfirst.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Nfirst.nBL1 * l0Status.nL0 * C0_SIZE * + C0_BYTE_SIZE * l1Nfirst.dbBL1; + l1Nfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.bL1Size; + a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Nfirst, l0Status); + biasSize = biasSize * l1Nfirst.nBL1; + l1Nfirst.mAL1 = max(min(min( + CalL1MaxLen(l1Nfirst.aL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + l1Nfirst.maxMAL1), + mRepeat), + 1); + if (IsUbNd2Nz()) { + l1Nfirst.mAL1 = max(min(min( + CalL1MaxLen(min(l1Nfirst.aL1Size - biasSize - dequantSize, a1Length), l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + l1Nfirst.maxMAL1), + mRepeat), + 1); + } + GetNearestFactor(mRepeat, l1Nfirst.mAL1); + l1Nfirst.loadSize = + coreStatus.m * MathUtil::CeilDivision(coreStatus.n, l1Nfirst.nBL1 * l0Status.nL0) + coreStatus.n; + + if (l1Status.kAL1 >= coreStatus.k && l1Status.kBL1 >= coreStatus.k) { + if (l1Nfirst.loadSize > l1Mfirst.loadSize) { + const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Mfirst, sizeof(l1Status)); + if (errnoT != EOK) { + TILING_LOG_ERROR("memcpy error"); + return; + } + } else { + const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Nfirst, sizeof(l1Status)); + if (errnoT != EOK) { + TILING_LOG_ERROR("memcpy error"); + return; + } + } + } + if (l1Status.kAL1 >= coreStatus.k && l1Status.kBL1 < coreStatus.k) { + l1Mfirst.nBL1 = 1; + const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Mfirst, sizeof(l1Status)); + if (errnoT != EOK) { + TILING_LOG_ERROR("memcpy error"); + return; + } + } + if (l1Status.kAL1 < coreStatus.k && l1Status.kBL1 >= coreStatus.k) { + l1Nfirst.mAL1 = 1; + const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Nfirst, sizeof(l1Status)); + if (errnoT != EOK) { + TILING_LOG_ERROR("memcpy error"); + return; + } + } + if (l1Status.kAL1 < coreStatus.k && l1Status.kBL1 < coreStatus.k) { + l1Status.mAL1 = 1; + l1Status.nBL1 = 1; + l1Status.loadSize = coreStatus.m * MathUtil::CeilDivision(coreStatus.n, l1Mfirst.nBL1 * l0Status.nL0) + + coreStatus.n * MathUtil::CeilDivision(coreStatus.m, l1Mfirst.mAL1 * l0Status.mL0); + } +} + +void MatmulTilingAlgorithm::NeitherFullLoadKforNZ(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status) const +{ + l1Status.kBL1 = coreStatus.k; + const int32_t biasSize = + l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; + int32_t dequantSize = 0; + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; + } + int32_t kaAlignValue = 1; + int32_t kbAlignValue = 1; + GetABL1KAlignValue(kaAlignValue, kbAlignValue); + + if (GetL1Size(l1Status, l0Status) > 
0 && GetL1Size(l1Status, l0Status) <= tilingIns_->bufferPool_.l1Size) { + l1Status.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * + C0_BYTE_SIZE * l1Status.dbBL1; + l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); + l1Status.kAL1 = min(CalL1MaxLen(l1Status.aL1Size - biasSize - dequantSize, l1Status, l0Status, kaAlignValue, + L1TilingType::KAL1_16), + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), l1Status, l0Status, kaAlignValue, + L1TilingType::KAL1_16), + coreStatus.k); + } + + l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); + GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); + l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; + } else { + // when NeitherFullLoadMN change the nBL1 and mAL1 + int32_t perK = min((tilingIns_->bufferPool_.l1Size - biasSize - dequantSize) / + (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + + C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / + l0Status.kL0 * l0Status.kL0, + coreStatus.k); + if (IsUbNd2Nz()) { + perK = min(min(tilingIns_->bufferPool_.l1Size - biasSize - dequantSize, + tilingIns_->bufferPool_.ubSize) / + (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + + C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / + l0Status.kL0 * l0Status.kL0, + coreStatus.k); + } + const int32_t biasFactor = tilingIns_->isBias ? l1Status.nBL1 * l0Status.nL0 : 0; + const int32_t aAlignedPerK = MathUtil::Align(perK, kaAlignValue); + const int32_t bAlignedPerK = MathUtil::Align(perK, kbAlignValue); + if (tilingIns_->aType_.dataType == DataType::DT_FLOAT && + !CheckL1Size(l1Status.mAL1 * l0Status.mL0 * aAlignedPerK * l1Status.dbAL1, + l1Status.nBL1 * l0Status.nL0 * bAlignedPerK * l1Status.dbBL1, + biasFactor * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1 + dequantSize)) { + perK -= 1; + } + int32_t perTimes = min(perK / l0Status.kL0, max(l1Status.maxKAL1, l1Status.maxKBL1)); + GetNearestFactor(l1Status.allTimes, perTimes); + perTimes = min(perTimes, l1Status.allTimes); + perK = perTimes * l0Status.kL0; + l1Status.kAL1 = perK; + l1Status.kBL1 = perK; + } +} + +bool MatmulTilingAlgorithm::CheckL1Size(int32_t amat, int32_t bmat, int32_t curBiasL1Size) const +{ + const int64_t loadSizeBytes = (static_cast(amat + bmat) * C0_SIZE * C0_BYTE_SIZE + + static_cast(curBiasL1Size)); + return loadSizeBytes <= tilingIns_->bufferPool_.l1Size; +} + +void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status, const int32_t& kMaxAxis) const +{ + int32_t biasSize = + l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; + int32_t dequantSize = 0; + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; + } + int32_t alignValue = FP32_ALIGN_SIZE; + if (DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8)) { + alignValue = INT8_ALIGN_SIZE; + } else if (DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT4)) { + alignValue = INT4_ALIGN_SIZE; + } + const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + const int32_t alignM = MathUtil::CeilDivision(l1Status.mAL1 * C0_SIZE, 
alignValue) * alignValue; + const int32_t alignN = MathUtil::CeilDivision(l1Status.nBL1 * C0_SIZE, alignValue) * alignValue; + const int32_t alignK = MathUtil::CeilDivision(l0Status.kL0 * reduceSize, alignValue) * alignValue * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (kMaxAxis == 1) { + // first get k_al1, second get k_bl1 + l1Status.kBL1 = l0Status.kL0; + if ((tilingIns_->bType_.dataType == DataType::DT_FLOAT) || + (tilingIns_->aType_.isTrans && DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8))) { + l1Status.bL1Size = l1Status.kBL1 * l0Status.nL0 * C0_SIZE * alignK * l1Status.nBL1 * l1Status.dbBL1; + } else if (!tilingIns_->bType_.isTrans && (DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) + || DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT4))) { + l1Status.bL1Size = l1Status.kBL1 * l0Status.nL0 * alignK * alignN * l1Status.dbBL1; + } else { + l1Status.bL1Size = l1Status.kBL1 * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; + } + l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); + auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + coreStatus.k); + } + + l1Status.aL1Times = max(l1Status.kAL1 / l0Status.kL0, 1); + GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); // tik-mm support no factor ----ncheck + l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; + l1Status.aL1Size = l1Status.kAL1 * l1Status.mAL1 * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1; + l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; + int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); + if ((tilingIns_->bType_.dataType == DataType::DT_FLOAT) || (tilingIns_->aType_.isTrans && + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8))) { + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), + coreStatus.k); + } + } else if (!tilingIns_->bType_.isTrans && + (DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) || + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT4))) { + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / + (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), + coreStatus.k); + } + } else { + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize)/ + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length)/ + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), + 
coreStatus.k);} + } + l1Status.bL1Times = max(min(l1Status.kBL1 / l0Status.kL0, l1Status.maxKBL1), 1); + GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); + l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; + } + if (kMaxAxis == NUM_TWO) { + // first get k_bl1, second get k_al1 + l1Status.kAL1 = l0Status.kL0; + if ((tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) || + (!tilingIns_->aType_.isTrans && DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8))) { + l1Status.aL1Size = l1Status.kAL1 * l1Status.mAL1 * l0Status.mL0 * C0_SIZE * alignK * l1Status.dbAL1; + } else if (tilingIns_->aType_.isTrans && + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8)) { + l1Status.aL1Size = l1Status.kAL1 * alignM * l0Status.mL0 * alignK * l1Status.dbAL1; + } else { + l1Status.aL1Size = l1Status.kAL1 * l1Status.mAL1 * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1; + } + + l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; + int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), + coreStatus.k); + } + l1Status.bL1Times = max(l1Status.kBL1 / l0Status.kL0, 1); + GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); + l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; + l1Status.bL1Size = l1Status.kBL1 * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; + l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); + if ((tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) || + (!tilingIns_->aType_.isTrans && (DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) || + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT4)))) { + auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * alignK; + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + coreStatus.k); + } + } else if (tilingIns_->aType_.isTrans && DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8)) { + l1Status.kAL1 = min((l1Status.aL1Size - biasSize - dequantSize) / + (alignM * l0Status.mL0 * l1Status.dbAL1 * alignK), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / + (alignM * l0Status.mL0 * l1Status.dbAL1 * alignK), coreStatus.k); + } + l1Status.aL1Size = l1Status.kAL1 * alignM * l0Status.mL0 * alignK * l1Status.dbAL1; + } else { + auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, + coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = (factor == 0) ? 
coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + coreStatus.k); + } + } + l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); + GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); + l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; + } +} +void MatmulTilingAlgorithm::NeitherFullLoadK(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status) const +{ + if (l0Status.kL0 == coreStatus.k) { + return; + } + // 1 -> let k_al1 bigger, 2 -> let k_bl1 bigger, 0 -> no matter + int32_t kMaxAxis = 0; + if (!tilingIns_->aType_.isTrans && !tilingIns_->bType_.isTrans) { + kMaxAxis = 1; + } + if (tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { + kMaxAxis = 2; + } + if (!tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { + kMaxAxis = l0Status.mL0 > l0Status.nL0 ? 1 : 2; + } + // Not Support FP32 mode for NZ format and hardware with pre_ub + if (kMaxAxis != 0) { + NeitherFullLoadKforND(coreStatus, l0Status, l1Status, kMaxAxis); + } else { + NeitherFullLoadKforNZ(coreStatus, l0Status, l1Status); + } + if (g_tempCfg.factorSplit) { + if (l1Status.kAL1 > l1Status.kBL1 && l1Status.kAL1 % l1Status.kBL1 != 0) { + while (l1Status.kAL1 % l1Status.kBL1 != 0 || + (l1Status.kAL1 != l1Status.kBL1 && coreStatus.k % l1Status.kAL1 != 0)) { + l1Status.kAL1 -= 1; + } + } + if (l1Status.kAL1 < l1Status.kBL1 && l1Status.kBL1 % l1Status.kAL1 != 0) { + while (l1Status.kBL1 % l1Status.kAL1 != 0 || + (l1Status.kAL1 != l1Status.kBL1 && coreStatus.k % l1Status.kBL1 != 0)) { + l1Status.kBL1 -= 1; + } + } + } +} + +void MatmulTilingAlgorithm::L1StatusNeitherFullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, + L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const +{ + // if b martix in L1, then b matrix must full load, skip non-full process + if (tilingIns_->aType_.pos == TPosition::TSCM || tilingIns_->bType_.pos == TPosition::TSCM) { + return; + } + if (g_tempCfg.l1DB == DB_ON) { + NeitherFullLoadDb(coreStatus, l0Status, l1Status, DB_ON); + } + NeitherFullLoadK(coreStatus, l0Status, l1Status); + NeitherFullLoadMN(coreStatus, l0Status, l1Status); + + res[IDX_THREE][IDX_ZERO] = l1Status.kAL1; + res[IDX_THREE][IDX_ONE] = l1Status.mAL1; + res[IDX_THREE][IDX_TWO] = l1Status.dbAL1; + res[IDX_THREE][IDX_THREE] = l1Status.kBL1; + res[IDX_THREE][IDX_FOUR] = l1Status.nBL1; + res[IDX_THREE][IDX_FIVE] = l1Status.dbBL1; + res[IDX_THREE][IDX_SIX] = l1Status.loadSize; +} + +void MatmulTilingAlgorithm::GetL1Factors(const std::string& opType, const MatmulRunParas& param, + const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status) const +{ + (void)(opType); + (void)(param); + // get mAL1, nBL1, kAL1, kBL1 factors when L0, singlecore factor is know + // get al1, bl1 double buffer factors + const int32_t mte1Loop = MIN_MTE1_LOAD / + ((l0Status.nL0 == 1 ? 1 : l0Status.kL0) + (l0Status.kL0 == 1 ? 
1 : l0Status.mL0)); + int32_t res[IDX_FOUR][IDX_SEVEN] = {0}; + l1Status.allTimes = MathUtil::CeilDivision(coreStatus.k, l0Status.kL0); + l1Status.maxMAL1 = (coreStatus.m + l0Status.mL0 - 1) / l0Status.mL0; + l1Status.maxNBL1 = (coreStatus.n + l0Status.nL0 - 1) / l0Status.nL0; + l1Status.maxKAL1 = + max(mte1Loop, ((MIN_MTE1_LOAD + l0Status.mL0 - 1) / l0Status.mL0 + l0Status.kL0 - 1) / l0Status.kL0); + l1Status.maxKBL1 = + max(mte1Loop, ((MIN_MTE1_LOAD + l0Status.nL0 - 1) / l0Status.nL0 + l0Status.kL0 - 1) / l0Status.kL0); + if (tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + l1Status.channelWiseTimes++; + } + // both AL1 and Bl1 full load + int32_t bothFullLoadFactors[L1_FACTORS_LEN] = { coreStatus.k, coreStatus.k, l1Status.maxMAL1, + l1Status.maxNBL1, DB_OFF, DB_OFF }; + // Need to consider L1 extension in FP32 Mode + l1Status.SetStatus(bothFullLoadFactors); + L1StatusBothFullLoad(coreStatus, l0Status, l1Status, res); + // only AL1 full load + int32_t al1FullLoadFactors[L1_FACTORS_LEN] = {coreStatus.k, l0Status.kL0, l1Status.maxMAL1, 1, DB_OFF, DB_OFF}; + l1Status.SetStatus(al1FullLoadFactors); + L1StatusAl1FullLoad(coreStatus, l0Status, l1Status, res); + // only BL1 full load + int32_t bl1FullLoadFactors[L1_FACTORS_LEN] = {l0Status.kL0, coreStatus.k, 1, l1Status.maxNBL1, DB_OFF, DB_OFF}; + l1Status.SetStatus(bl1FullLoadFactors); + L1StatusBl1FullLoad(coreStatus, l0Status, l1Status, res); + // neither AL1 nor Bl1 full load + res[IDX_THREE][IDX_SIX] = INT_MAX; + int32_t neitherFullLoadFactors[L1_FACTORS_LEN] = {l0Status.kL0, l0Status.kL0, 1, 1, DB_ON, DB_ON}; + l1Status.SetStatus(neitherFullLoadFactors); + L1StatusNeitherFullLoad(coreStatus, l0Status, l1Status, res); + // choose the final factors + int32_t* tmpFactors = res[IDX_THREE]; + int32_t tmpLoadSize = tmpFactors[IDX_SIX]; + int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + const int32_t kAl1FactorOne = res[IDX_ONE][IDX_ZERO] > 0 ? MathUtil::CeilDivision( + MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_ONE][IDX_ZERO])) : + 1; + const int32_t kBl1FactorTwo = res[IDX_TWO][IDX_THREE] > 0 ? MathUtil::CeilDivision( + MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_TWO][IDX_THREE])) : + 1; + const int32_t kAl1FactorZero = res[IDX_ZERO][IDX_ZERO] > 0 ? MathUtil::CeilDivision( + MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_ZERO][IDX_ZERO])) : + 1; + const int32_t kBl1FactorZero = res[IDX_ZERO][IDX_THREE] > 0 ? MathUtil::CeilDivision( + MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_ZERO][IDX_THREE])) : + 1; + + const bool al1FullLoad = (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->bType_.type == CubeFormat::ND) ? + (l1Status.aL1FullLoad && kAl1FactorOne == 1) : + l1Status.aL1FullLoad; + const bool bl1FullLoad = (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->bType_.type == CubeFormat::ND) ? + (l1Status.bL1FullLoad && kBl1FactorTwo == 1) : + l1Status.bL1FullLoad; + const bool bothFullLoad = (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->bType_.type == CubeFormat::ND) ? 
+ (l1Status.bothFullLoad && kAl1FactorZero == 1 && kBl1FactorZero == 1) : + l1Status.bothFullLoad; + if (al1FullLoad && (res[IDX_ONE][IDX_SIX] < tmpLoadSize || (res[IDX_ONE][IDX_SIX] == tmpLoadSize && + res[IDX_ONE][IDX_ONE] + res[IDX_ONE][IDX_FOUR] >= tmpFactors[IDX_ONE] + tmpFactors[IDX_FOUR]))) { + tmpFactors = res[IDX_ONE]; + tmpLoadSize = tmpFactors[IDX_SIX]; + TILING_LOG_DEBUG("Select Mode One."); + } + if (bl1FullLoad && (res[IDX_TWO][IDX_SIX] < tmpLoadSize || (res[IDX_TWO][IDX_SIX] == tmpLoadSize && + res[IDX_TWO][IDX_ONE] + res[IDX_TWO][IDX_FOUR] >= tmpFactors[IDX_ONE] + tmpFactors[IDX_FOUR]))) { + tmpFactors = res[IDX_TWO]; + tmpLoadSize = tmpFactors[IDX_SIX]; + TILING_LOG_DEBUG("Select Mode Two."); + } + if (bothFullLoad && (res[IDX_ZERO][IDX_SIX] < tmpLoadSize || (res[IDX_ZERO][IDX_SIX] == tmpLoadSize && + res[IDX_ZERO][IDX_ONE] + res[IDX_ZERO][IDX_FOUR] >= tmpFactors[IDX_ONE] + tmpFactors[IDX_FOUR]))) { + tmpFactors = res[IDX_ZERO]; + TILING_LOG_DEBUG("Select Mode Zero."); + } + int32_t resL1Factors[L1_FACTORS_LEN] = {tmpFactors[IDX_ZERO], tmpFactors[IDX_THREE], tmpFactors[IDX_ONE], + tmpFactors[IDX_FOUR], tmpFactors[IDX_TWO], tmpFactors[IDX_FIVE]}; + l1Status.SetStatus(resL1Factors); +} + +void MatmulTilingAlgorithm::GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32_t& ubSize, + int32_t a1LengthCache, int32_t b1LengthCache) const +{ + const uint32_t aTypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + const uint32_t bTypeSize = DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); + const uint32_t cTypeSize = DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType); + const uint32_t biasTypeSize = DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); + + const int32_t a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * aTypeSize / BITS_PER_BYTE; + const int32_t b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * bTypeSize / BITS_PER_BYTE; + const int32_t c1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseM() * FP32_BYTES; + + if (tilingIns_->aType_.pos != TPosition::TSCM) { + l1Size += tilingIns_->tiling_.get_depthA1() * a1Length; + if (tilingIns_->enableL1CacheUB && tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { + l1Size += tilingIns_->tiling_.get_depthAL1CacheUB() * a1LengthCache; + } + } + if (tilingIns_->bType_.pos != TPosition::TSCM) { + l1Size += tilingIns_->tiling_.get_depthB1() * b1Length; + if (tilingIns_->enableL1CacheUB && tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { + l1Size += tilingIns_->tiling_.get_depthBL1CacheUB() * b1LengthCache; + } + } + + l0cSize += c1Length; + + if (static_cast(tilingIns_->tiling_.get_isBias())) { + if ((tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) && + tilingIns_->biasType_.pos != TPosition::TSCM) { + // for ascend910b1 bias: gm -> l1 -> bt, need extra l1 space, support bias transform + l1Size += tilingIns_->tiling_.get_baseN() * biasTypeSize; + } + } + + // in v100/v200, nd2nz and nz2nd was simulated with intrins, need extra ub space + // in V300, nd2nz was simulated with intrins, need extra ub space + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { + // case2: input ND(GM/VECCALC), ND -> NZ transform, for now A/B reuse, only process with tail block, need UB space 
+ // (1) input GM, format is ND, need do zero-fill to non-aligned tail block in ub + // (2) input VECCALC, format is ND, need do zero-fill to non-aligned tail block in ub + int32_t aUbLength = 0; + int32_t bUbLength = 0; + if (!tilingIns_->aType_.isTrans && ((tilingIns_->tiling_.get_singleCoreK() * aTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { + aUbLength = tilingIns_->tiling_.get_baseM() * C0_BYTE_SIZE; + } + if (tilingIns_->aType_.isTrans && + ((tilingIns_->tiling_.get_singleCoreM() * aTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { + aUbLength = tilingIns_->tiling_.get_baseK() * C0_BYTE_SIZE; + } + + if (!tilingIns_->bType_.isTrans && ((tilingIns_->tiling_.get_singleCoreN() * bTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { + bUbLength = tilingIns_->tiling_.get_baseK() * C0_BYTE_SIZE; + } + if (tilingIns_->bType_.isTrans && + ((tilingIns_->tiling_.get_singleCoreK() * bTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { + bUbLength = tilingIns_->tiling_.get_baseN() * C0_BYTE_SIZE; + } + if (tilingIns_->aType_.pos == TPosition::TSCM) { + aUbLength = 0; + } + if (tilingIns_->bType_.pos == TPosition::TSCM) { + bUbLength = 0; + } + + if ((tilingIns_->aType_.type == CubeFormat::ND || tilingIns_->bType_.type == CubeFormat::ND)) { + ubSize += max(aUbLength, bUbLength); + } + + // V300 only needs extra buffer when INPUT are at GM/UB. + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { + return; + } + + // case3: output GM/VECCAL, format is ND, for now not re-use input and ouput non-aligned, is related with db open + // (1) output GM, format is NZ/ND_ALIGN/ND, need restore in ub, ND and D is non-aligned , then add more 32B, ub->gm NZ->ND format and data type trnasform + // (2) output VECCALC,format is ND_ALIGN/ND(D alined), need doNZ->ND transform in ub + if (tilingIns_->cType_.pos == TPosition::GM) { + ubSize += tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseN() * cTypeSize; + if (tilingIns_->cType_.type == CubeFormat::ND && + (tilingIns_->tiling_.get_singleCoreN() * cTypeSize) % C0_BYTE_SIZE != 0) { + ubSize += C0_BYTE_SIZE; + } + } + if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type != CubeFormat::NZ) { + ubSize += tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseN() * cTypeSize; + } + // for V200/V100, if setquanttensor and output is not nd, need extra ubsize for copy tensor from gm to ub + if (tilingIns_->deqType == DequantType::TENSOR && tilingIns_->cType_.type == CubeFormat::NZ) { + ubSize += static_cast(tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(DataType::DT_UINT64)); + } + } + return; +} + +void MatmulTilingAlgorithm::GetBankConflictSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status, + int32_t& length, bool isAMatrix) const +{ + constexpr int blockSize = 32; + constexpr int bankLen = 512; + bool isBankConflict = false; + int bankConflictSize = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (isAMatrix) { + if (tilingIns_->aType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(l1Status.mAL1 * l0Status.mL0 * C0_SIZE, C0_SIZE) * + blockSize % bankLen == + 0 ? 
+ true : + false; + bankConflictSize = l0Status.kL0 * reduceSize * C0_SIZE * + MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0 * reduceSize, + C0_SIZE) * blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = l0Status.mL0 * C0_SIZE * C0_SIZE * l1Status.mAL1 * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } + } else { + if (tilingIns_->bType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0 * reduceSize, + C0_SIZE) * blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = l0Status.nL0 * C0_SIZE * C0_SIZE * l1Status.nBL1 * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(l1Status.nBL1 * l0Status.nL0 * C0_SIZE, C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = l0Status.kL0 * reduceSize * C0_SIZE * + MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } + } + if (isBankConflict) { + length = length + bankConflictSize; + } } \ No newline at end of file -- Gitee From 8d1d1fdad0a139f0dcd05e79fef35183ed76aab5 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 02:12:50 +0000 Subject: [PATCH 45/56] add2 Signed-off-by: jiangchengcheng-on --- .../tiling/matmul_tiling_algorithm_new2.cpp | 676 ++++++++++++++++++ 1 file changed, 676 insertions(+) create mode 100644 impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp diff --git a/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp b/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp new file mode 100644 index 00000000..b1717f8e --- /dev/null +++ b/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp @@ -0,0 +1,676 @@ +void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) const +{ + constexpr int blockSize = 32; + constexpr int bankLen = 512; + bool isBankConflict = false; + int bankConflictSize = 0; + if (isAMatrix) { + if (tilingIns_->aType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepM() * tilingIns_->tiling_.get_baseM(), C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKa() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseM() * C0_SIZE * tilingIns_->tiling_.get_stepM() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } + } else { + if (tilingIns_->bType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseN() * C0_SIZE * tilingIns_->tiling_.get_stepN() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepN() * tilingIns_->tiling_.get_baseN(), C0_SIZE) * + blockSize % bankLen == + 0 ? 
+ true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKb() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } + } + if (isBankConflict) { + length = length + bankConflictSize; + } +} + +int32_t MatmulTilingAlgorithm::GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const +{ + int32_t a1Length = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (IsUbNd2Nz()) { + // A matrix ND2NZ + if (tilingIns_->aType_.type == CubeFormat::ND) { + a1Length = l0Status.mL0 * C0_SIZE * l0Status.kL0 * reduceSize * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + a1Length = a1Length * MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l1Status.mAL1; + } + // bank conflict + GetBankConflictSize(l1Status, l0Status, a1Length, true); + } + } + return a1Length; +} + +int32_t MatmulTilingAlgorithm::GetBL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const +{ + int32_t b1Length = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (IsUbNd2Nz()) { + // B matrix ND2NZ + if (tilingIns_->bType_.type == CubeFormat::ND) { + b1Length = l0Status.nL0 * C0_SIZE * l0Status.kL0 * reduceSize * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + b1Length = b1Length * MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l1Status.nBL1; + } + // bank conflict + GetBankConflictSize(l1Status, l0Status, b1Length, false); + } + } + return b1Length; +} + +bool MatmulTilingAlgorithm::IsUbNd2Nz() const +{ + if (tilingIns_->enVecND2NZ && tilingIns_->mmConfigType == 1 && + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { + return true; + } + return false; +} + +void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const +{ + int32_t a1Length = 0; + int32_t b1Length = 0; + int32_t c1Length = 0; + int32_t biasLength = 0; + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { + // A matrix ND2NZ + if (tilingIns_->aType_.type == CubeFormat::ND) { + a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + a1Length = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); + } + // bank conflict + GetBankConflictSize(a1Length, true); + } + // B matrix ND2NZ + if (tilingIns_->bType_.type == CubeFormat::ND + || (DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) && + tilingIns_->bType_.type == CubeFormat::NZ && tilingIns_->bType_.isTrans == false)) { + b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + b1Length = b1Length * tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); + } + // bank conflict + GetBankConflictSize(b1Length, false); + } + // C matrix NZ2ND + if (tilingIns_->cType_.type == CubeFormat::ND || tilingIns_->cType_.pos == TPosition::GM) { + c1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseM() * + 
DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType);
+        }
+        // Bias
+        if (tilingIns_->isBias && tilingIns_->biasType_.pos != TPosition::VECCALC) {
+            biasLength = tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType);
+        }
+        // quant tensor
+        if (DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8)) {
+            int32_t quantLength = tilingIns_->tiling_.get_baseN() * sizeof(uint64_t);
+            biasLength = max(quantLength, biasLength);
+        }
+    }
+
+    transLength = max(max(a1Length, b1Length), max(c1Length, biasLength));
+}
+
+bool MatmulTilingAlgorithm::CheckBaseMN() const
+{
+    // check bias table
+    if ((tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B ||
+        tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) &&
+        tilingIns_->isBias && (tilingIns_->baseN > MAX_BIAS_N * C0_SIZE) && tilingIns_->isSupportL0c2Out) {
+        return false;
+    }
+    if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) {
+        return (tilingIns_->baseM * tilingIns_->baseN * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize &&
+            tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize &&
+            tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize);
+    }
+    if (tilingIns_->baseM != -1) {
+        return (tilingIns_->baseM * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize &&
+            tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize);
+    }
+    if (tilingIns_->baseN != -1) {
+        return (tilingIns_->baseN * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize &&
+            tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize);
+    }
+    return true;
+}
+
+int32_t MatmulTilingAlgorithm::GetIteratorOrder(const SingleCoreStatus& singleCoreStatus, const int32_t singleCoreM,
+    const int32_t singleCoreN, const int32_t singleCoreK) const
+{
+    if (tilingIns_->traverse_ != MatrixTraverse::NOSET) {
+        return static_cast(tilingIns_->traverse_) - 1;
+    }
+    const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE);
+    const bool fullkAL1Load =
+        (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kAL1 * reduceSize)) > 1.0 ? false : true;
+    bool fullkBL1Load =
+        (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kBL1 * reduceSize)) > 1.0 ? false : true;
+
+    // if neither KAL1 nor KBL1 can be fully loaded, the choice of M-first or N-first order does not matter
+    if (!fullkAL1Load && !fullkBL1Load) {
+        return static_cast(MatrixTraverse::FIRSTM) - 1;
+    } else if (fullkAL1Load && !fullkBL1Load) { // if KAL1 is fully loaded, select N-first order
+        return static_cast(MatrixTraverse::FIRSTN) - 1;
+    } else if (!fullkAL1Load && fullkBL1Load) { // if KBL1 is fully loaded, select M-first order
+        return static_cast(MatrixTraverse::FIRSTM) - 1;
+    } else {
+        // if AL1LoadSize is less than BL1LoadSize, select N-first order, and vice versa.
+        const int32_t mLoop = MathUtil::CeilDivision(singleCoreM,
+            singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l0Status.mL0 * C0_SIZE);
+        const int32_t nLoop = MathUtil::CeilDivision(singleCoreN,
+            singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l0Status.nL0 * C0_SIZE);
+        const int32_t aL1LoadSize = singleCoreM + singleCoreN * mLoop;
+        const int32_t bL1LoadSize = singleCoreN + singleCoreM * nLoop;
+        return aL1LoadSize < bL1LoadSize ?
1 : 0; + } +} + +void MatmulTilingAlgorithm::UpdateBlockDimCalculator(BlockDimCalculator& blockDimRes) const +{ + if (blockDimRes.totalLoadSize > blockDimRes.tmpLoadSize) { + blockDimRes.bmatSize = blockDimRes.tmpBmatSize; + blockDimRes.amatSize = blockDimRes.tmpAmatSize; + blockDimRes.totalLoadSize = blockDimRes.tmpLoadSize; + blockDimRes.tmpValue = 0; + } +} + +void MatmulTilingAlgorithm::CalcLoadSize(const DimFactor& blockDims, const CoreStatusPack& coreStatus, + BlockDimCalculator& blockDimRes, const MatmulRunParas& params) const +{ + blockDimRes.totalLoadSize = INT_MAX; + // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) + const int32_t totalSize = blockDimRes.amatSize + blockDimRes.bmatSize; // batch==1 + constexpr int32_t minMNSize = 16; + constexpr int32_t minKSize = 64; + constexpr int32_t minTotalSize = 128; + const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 + const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); + const int32_t k0 = (m0 != 0 && n0 != 0) ? + min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; + const int32_t dbBuffer = 2; + + // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) + // loadsize = K*(N*mdim+M*ndim) + const bool bothFullLoad = static_cast(totalSize) * static_cast(blockDimRes.kBytes) <= + static_cast(tilingIns_->bufferPool_.l1Size); + const bool afulloadPlsBKFullLoad = + static_cast(blockDimRes.amatSize + n0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= + static_cast(tilingIns_->bufferPool_.l1Size); + const bool bfulloadPlsaKFullLoad = + static_cast(blockDimRes.bmatSize + m0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= + static_cast(tilingIns_->bufferPool_.l1Size); + if (afulloadPlsBKFullLoad || bfulloadPlsaKFullLoad || bothFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + return; + } + + // A kdim not fullload + B kdim not fullload(9) + // loadsize = M*K*N*(1/m0+1/n0) + const bool aKNotfulloadPlsbKNotFullLoad = + (n0 * blockDimRes.kBytes + m0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > + tilingIns_->bufferPool_.l1Size && + (m0 * blockDimRes.kBytes + n0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > + tilingIns_->bufferPool_.l1Size; + if (aKNotfulloadPlsbKNotFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + return; + } + + // A kdim fullload + B kdim fullload(5) + // M*K*(ndim+N/m1) or N*K*(mdim+M/n1) + const bool aKfulloadPlsbKFullLoad = (m0 + n0) * blockDimRes.kBytes * dbBuffer <= tilingIns_->bufferPool_.l1Size; + if (aKfulloadPlsbKFullLoad) { + const int32_t m1 = MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - n0 * + blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * m0)) * m0; + const int32_t n1 = MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - m0 * + blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * n0)) * n0; + const int32_t mfirstLoad = + blockDimRes.oriAmatSize * blockDims.n + blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); + int32_t 
nfirstLoad = + blockDimRes.oriBmatSize * blockDims.m + blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); + if (mfirstLoad < nfirstLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); + } else { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; + } + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + return; + } + + // A fullload + B Kdim not fullload or A K fullload + B Kdim not fullload(3/6) + // mdim = coreNum; ndim = 1; + // loadsize = M*K*(ndim+N/m0) + const bool afulloadPlsbKNotFullLoad = (blockDimRes.amatSize * blockDimRes.kBytes + + n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + const bool aKfulloadPlsbKNotFullLoad = (m0 * blockDimRes.kBytes * dbBuffer + + n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + if (afulloadPlsbKNotFullLoad || aKfulloadPlsbKNotFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + } + + // A kdim not fullload + B fullload or A kdim not fullload + B kdim fullload(7/8) + // loadsize = N*K*(mdim+M/n0) + const bool aKNotfulloadPlsbFullLoad = (blockDimRes.bmatSize * blockDimRes.kBytes + + m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + const bool aKNotfulloadPlsbKFullLoad = (n0 * blockDimRes.kBytes * dbBuffer + + m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + if (aKNotfulloadPlsbFullLoad || aKNotfulloadPlsbKFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriBmatSize * blockDims.m; + blockDimRes.tmpBmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + } +} + +int32_t MatmulTilingAlgorithm::LoopNumFromSingleCoreToL0(const CoreStatusPack& coreStatus, + const DimFactor& blockDimsFactor) const +{ + if (!blockDimsFactor.IsValid()) { + return 0; + } + constexpr int32_t minTotalSize = 128; + constexpr int32_t minSize = 64; + constexpr int32_t minN0Size = 16; + int32_t n0 = min(min(minN0Size, coreStatus.n), minSize); + int32_t m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); + n0 = (m0 == 0) ? 0 : min(min(coreStatus.n, minTotalSize / m0), minSize); + m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); + const int32_t k0 = (m0 != 0 && n0 != 0) ? 
+ min(min(minSize / m0, minSize / n0), coreStatus.k) : coreStatus.k; + const int32_t loopNum = MathUtil::CeilDivision(coreStatus.m, m0) * MathUtil::CeilDivision(coreStatus.n, n0) * + MathUtil::CeilDivision(coreStatus.k, k0); + return loopNum; +} + +int32_t MatmulTilingAlgorithm::GetBigPackageCondition(const CoreStatusPack &coreStatus, + const BlockDimCalculator &blockDimRes, const MatmulRunParas ¶ms) const +{ + if (tilingIns_->bType_.isTrans == true && tilingIns_->aType_.isTrans == false) { + return ATTACH_FLAG_ZERO; + } + const int minSize = 16; + bool flag = true; + if (tilingIns_->bType_.isTrans == false) { + if (params.n32 >= minSize && coreStatus.n < minSize) { + flag = false; + } + } + if (tilingIns_->aType_.isTrans) { + if (params.m32 >= minSize && coreStatus.m < minSize) { + flag = false; + } + } + + if (!blockDimRes.bigPackage && !flag) { + return ATTACH_FLAG_ZERO; + } else if (!blockDimRes.bigPackage && flag) { + return ATTACH_FLAG_TWO; + } else if (blockDimRes.bigPackage && !flag) { + return ATTACH_FLAG_ONE; + } else { + return ATTACH_FLAG_ZERO; + } +} + +void MatmulTilingAlgorithm::GetBlockDimHelper(const DimFactor& blockDim, CoreStatusPack& coreStatus, + BlockDimCalculator& blockDimRes, const MatmulRunParas& params) +{ + blockDimRes.kNum = (blockDim.k == 0) ? 0 : (params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE); // contain k * 16 + blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 + coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDim.batch); + coreStatus.m = MathUtil::CeilDivision(params.m32, blockDim.m); + coreStatus.n = MathUtil::CeilDivision(params.n32, blockDim.n); + coreStatus.k = (blockDim.k == 0) ? 0 : (params.k32 / blockDim.k); + if (tilingIns_->enableSplitK_) { + if (params.kMapped != params.k32) { // need check--splitK + blockDimRes.kNum = params.kMapped / blockDim.k * NUM_TWO * C0_SIZE * REDUCE_BLOCK_SIZE; + coreStatus.k = params.kMapped / blockDim.k * NUM_TWO; + } + } + + // load size of A matrix is batch * m + // load size of B matrix is n + blockDimRes.oriAmatSize = params.batch32 * params.m32; + blockDimRes.oriBmatSize = params.oriShapeBbatch > 1 ? params.batch32 * params.n32 : params.n32; + blockDimRes.amatSize = coreStatus.batch * coreStatus.m; + blockDimRes.bmatSize = params.oriShapeBbatch > 1 ? coreStatus.batch * coreStatus.n : coreStatus.n; + blockDimRes.tmpValue = 0; + CalcLoadSize(blockDim, coreStatus, blockDimRes, params); + if (tilingIns_->enableSplitK_) { + blockDimRes.totalLoadSize *= coreStatus.k; + } + + // updateSolution: bool whether update to a new block factor solution + // has smaller LoadSize or the same LoadSize but batch + const int bigpackageFlag = GetBigPackageCondition(coreStatus, blockDimRes, params); + const bool updateConditionBp = bigpackageFlag == 0 ? false : true; + bool updateConditionBp2 = bigpackageFlag == 2 ? true : false; + bool updateConditionBp3 = bigpackageFlag == 1 ? false : true; + + const int32_t loopNum = LoopNumFromSingleCoreToL0(coreStatus, blockDim); + const bool updateConditionCoreUsed = (!updateConditionBp) && ((loopNum < blockDimRes.loopNumToL0) || + (blockDim.ReduceMul() > blockDimRes.coreUse && loopNum == blockDimRes.loopNumToL0)); + const bool updateConditionLoadsize = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse) && + blockDimRes.totalLoadSize < blockDimRes.minLoadSize; + const int32_t orgBatchM = params.oriShapeAbatch > 1 ? blockDimRes.batchDimFactor : blockDimRes.mDimFactor; + const int32_t curBatchM = params.oriShapeAbatch > 1 ? 
blockDim.batch : blockDim.m; + const bool updateConditionBatchNDim = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse && + blockDimRes.totalLoadSize == blockDimRes.minLoadSize) && + ((blockDimRes.nDimFactor * orgBatchM < curBatchM * blockDim.n) || + (blockDimRes.nDimFactor * orgBatchM == curBatchM * blockDim.n && + blockDimRes.batchDimFactor < blockDim.batch)); + + const bool policyCondition = + UserPolicy(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, + coreStatus, blockDimRes); + if ((updateConditionBp2 || updateConditionCoreUsed || updateConditionLoadsize || updateConditionBatchNDim) && + policyCondition && updateConditionBp3) { + blockDimRes.minLoadSize = blockDimRes.totalLoadSize; + blockDimRes.nDimFactor = blockDim.n; + blockDimRes.batchDimFactor = blockDim.batch; + blockDimRes.mDimFactor = blockDim.m; + blockDimRes.kDimFactor = blockDim.k; + blockDimRes.coreUse = blockDim.ReduceMul(); + blockDimRes.loopNumToL0 = loopNum; + blockDimRes.finalValue = blockDimRes.tmpValue; + const int32_t minSize = 16; + blockDimRes.bigPackage = (!tilingIns_->bType_.isTrans ? coreStatus.n >= minSize : true) && + (tilingIns_->aType_.isTrans ? coreStatus.m >= minSize : true) && (blockDim.n * blockDim.m * blockDim.k > 1); + splitCoreFlag_ = true; + } +} + +bool MatmulTilingAlgorithm::UserPolicy(const TilingPolicy policy, const CoreStatusPack& coreStatus, + const BlockDimCalculator& blockDimRes) const +{ + constexpr int32_t minMNSize = 16; + constexpr int32_t minKSize = 64; + constexpr int32_t minTotalSize = 128; + const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 + const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); + const int32_t k0 = (m0 != 0 && n0 != 0) ? 
min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; + + if (policy == TilingPolicy::FIXED_B_TSCM) { + const int32_t alignFactor = MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE); + if (coreStatus.n < alignFactor) { + return false; + } + const int32_t alignNLength = MathUtil::Align(coreStatus.n, alignFactor); + const int32_t bMatrixSize = alignNLength * blockDimRes.kBytes * 2; + int32_t aMatrixSize = m0 * k0 * C0_SIZE * C0_BYTE_SIZE; + int32_t biasSize = 0; + if (tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + biasSize = alignNLength * C0_SIZE * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); + } + if (bMatrixSize + aMatrixSize + biasSize <= tilingIns_->bufferPool_.l1Size) { + return true; + } else { + return false; + } + } else if (policy == TilingPolicy::FIXED_A_TSCM) { + return false; + } else if (policy == TilingPolicy::FIXED_A_B_TSCM) { + return false; + } else { + return true; + } +} + +bool MatmulTilingAlgorithm::PreProcessMiniShape(const std::string& opType, CoreStatusPack& coreStatus, + MatmulRunParas& params, const int32_t& coreNum, bool splitKFlag) const +{ + (void)(opType); + // experience value for mini shape + const int32_t miniL0cThreshold = tilingIns_->bufferPool_.l0CSize / MIN_FRACTAL_SIZE / FP32_BYTES; + const int32_t miniL0abThreshold = tilingIns_->bufferPool_.l0ASize / (C0_SIZE * C0_BYTE_SIZE); + // tend to use less cores for shapes with batch less than coreNum and m/k/n can full load in + // aicore buffers split_k is conflict with m/n shift_inwards + bool specialScenario = false; + if (params.n32 > MIN_MTE1_LOAD) { + specialScenario = specialScenario || + (splitKFlag && ((static_cast(params.nMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); + } + if (params.m32 > MIN_MTE1_LOAD) { + specialScenario = specialScenario || + (splitKFlag && ((static_cast(params.mMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); + } + + if (params.batch32 * params.n32 * params.m32 <= coreNum && params.m32 * params.k32 <= miniL0abThreshold && + params.n32 * params.k32 <= miniL0abThreshold && params.m32 * params.n32 <= miniL0cThreshold && + !specialScenario) { + coreStatus.batchDim = params.batch32; + coreStatus.nDim = params.n32 <= MIN_MTE1_LOAD ? 1 : params.nMapped / MIN_MTE1_LOAD; + coreStatus.mDim = params.m32 <= MIN_MTE1_LOAD ? 1 : params.mMapped / MIN_MTE1_LOAD; + int32_t kDimCandidate[2] = {0}; // storage 2 factors of k around kDim + GetTwoFactors(kDimCandidate, coreStatus.kDim, params.k32, coreNum); + coreStatus.kDim = (params.k32 <= MIN_MTE1_LOAD || !splitKFlag) ? + 1 : + (kDimCandidate[1] > 1 ? kDimCandidate[1] : kDimCandidate[0]); + coreStatus.batch = 1; + coreStatus.n = coreStatus.nDim == 1 ? params.n32 : MathUtil::CeilDivision(params.nMapped, coreStatus.nDim); + coreStatus.m = coreStatus.mDim == 1 ? params.m32 : MathUtil::CeilDivision(params.mMapped, coreStatus.mDim); + coreStatus.k = coreStatus.kDim == 1 ? params.k32 : MathUtil::CeilDivision(params.kMapped, coreStatus.kDim); + params.nonFactorK = (coreStatus.kDim == 0) ? false : (params.k32 % coreStatus.kDim == 0 ? 
false : true); + return true; + } + return false; +} +float MatmulTilingAlgorithm::CalculateBlockCycles(int32_t baseM, int32_t baseN, int32_t baseK) const +{ + const int32_t reduceBlockSize = C0_BYTE_SIZE * BITS_PER_BYTE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + return static_cast(baseM * baseN * baseK) / (C0_SIZE * C0_SIZE * reduceBlockSize); +} + +int32_t MatmulTilingAlgorithm::CalculateMemoryTraffic(int32_t baseM, int32_t baseN, int32_t baseK) const +{ + int32_t aMatrixSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + int32_t bMatrixSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + return aMatrixSize + bMatrixSize; +} + +bool MatmulTilingAlgorithm::AlignSingleShape(bool needAlign, int32_t orgShape, int32_t factor, int32_t alignSize, + int32_t &singleShape) const +{ + singleShape = MathUtil::CeilDivision(orgShape, factor); + if (!needAlign || alignSize == 0 || orgShape % alignSize != 0) { + return true; // orgShape not align, don't need to adjust + } + if (factor <= 1) { + return true; + } + int32_t maxSingleShape = MathUtil::CeilDivision(orgShape, factor - 1); + int32_t alignSingleShape = MathUtil::Align(singleShape, alignSize); + if (alignSingleShape >= maxSingleShape) { + return false; + } + singleShape = alignSingleShape; + return true; +} + +ComputeBaseBlock MatmulTilingAlgorithm::GetMultiCoreBasicBlock(const MatmulRunParas& params) const +{ + (void)params; + constexpr static int32_t l0c256KB = 262144; + constexpr static int32_t basicSize128 = 128; + constexpr static int32_t basicSize256 = 256; + int32_t basicM = basicSize128; + if (tilingIns_->bufferPool_.l0CSize == l0c256KB) { + basicM = basicSize256; + } + int32_t basicN = basicSize256; + int32_t aDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) != 0 ? + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) : 1; + int32_t basicK = basicSize128 * BITS_PER_BYTE / aDtypeSize; + ComputeBaseBlock basicBlock {basicM, basicN, basicK}; + // SetFixSplit + if (tilingIns_->baseM != -1) { + basicBlock.baseM = tilingIns_->baseM; + } + if (tilingIns_->baseN != -1) { + basicBlock.baseN = tilingIns_->baseN; + } + if (!tilingIns_->aType_.isTrans && !tilingIns_->bType_.isTrans) { + return basicBlock; + } + if (tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { + basicBlock.baseM = tilingIns_->baseM != -1 ? basicBlock.baseM : basicSize256; + basicBlock.baseN = tilingIns_->baseN != -1 ? 
basicBlock.baseN : basicSize128; + return basicBlock; + } + + return basicBlock; +} + +float MatmulTilingAlgorithm::CalcBaseBlockBandRatio(int32_t mDim, int32_t nDim, const ComputeBaseBlock &baseBlock) const +{ + float bandRatio = static_cast((numOfBlock_ - mDim) * baseBlock.baseM + (numOfBlock_ - nDim) * baseBlock.baseN) / + static_cast((baseBlock.baseM + baseBlock.baseN) * numOfBlock_); + return bandRatio; +} + +ComputeIntensity MatmulTilingAlgorithm::CalcComputeIntensity(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, + const std::pair &factor) const +{ + auto mFactor = factor.first; + auto nFactor = factor.second; + int32_t sm = 0; + int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + bool alignSuccA = AlignSingleShape(tilingIns_->aType_.isTrans, params.oriShapeM, mFactor, aAlignSize, sm); + int32_t sn = 0; + int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; + bool alignSuccB = AlignSingleShape(!tilingIns_->bType_.isTrans, params.oriShapeN, nFactor, bAlignSize, sn); + auto shapeM = MathUtil::DivideIntoMainAndTail(sm, baseBlock.baseM); + auto shapeN = MathUtil::DivideIntoMainAndTail(sn, baseBlock.baseN); + auto mainM = shapeM.first; + auto tailM = shapeM.second; + auto mainN = shapeN.first; + auto tailN = shapeN.second; + int32_t memoryRatio = (alignSuccA && alignSuccB) ? 1 : 2; + float bandRatio = CalcBaseBlockBandRatio(mFactor, nFactor, baseBlock); + std::vector blocks; + // Main Chunk + if (mainM > 0 && mainN > 0) { + int count = mainM * mainN; + float cycles = CalculateBlockCycles(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; + int32_t memory = memoryRatio * + CalculateMemoryTraffic(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; + blocks.push_back({count, cycles, memory}); + } + // N Tail Chunk + if (mainM > 0 && tailN > 0) { + float cycles = CalculateBlockCycles(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; + int32_t memory = memoryRatio * CalculateMemoryTraffic(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; + blocks.push_back({mainM, cycles, memory}); + } + // M Tail Chunk + if (tailM > 0 && mainN > 0) { + float cycles = CalculateBlockCycles(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; + int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; + blocks.push_back({mainN, cycles, memory}); + } + // M and N Tail Chunk + if (tailM > 0 && tailN > 0) { + float cycles = CalculateBlockCycles(tailM, tailN, baseBlock.baseK); + int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, tailN, baseBlock.baseK); + blocks.push_back({1, cycles, memory}); + } + float totalCycles = 0; + int32_t totalMemory = 0; + for (const auto& v : blocks) { + totalCycles += v.computeCycle; + totalMemory += v.memoryTraffic; + } + return { + {mFactor, nFactor}, totalCycles, (totalMemory != 0) ? 
totalCycles / totalMemory : 0, bandRatio}; +} + +MultiCoreScenario MatmulTilingAlgorithm::GetMultiCoreScenario(const MatmulRunParas& params) const +{ + if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { + return MultiCoreScenario::OTHERS; + } + if (tilingIns_->enableSplitK_ || tilingIns_->singleM != -1 || tilingIns_->singleN != -1) { + return MultiCoreScenario::OTHERS; + } + constexpr int64_t mnLimit = 26214; // 128 * 256 * 0.8 + constexpr int64_t mLimit = 128; + if (params.oriShapeM >= mLimit && params.oriShapeM * params.oriShapeN > mnLimit * numOfBlock_) { + return MultiCoreScenario::SPLIT_MN; + } + return MultiCoreScenario::OTHERS; +} + +void MatmulTilingAlgorithm::UpdateStepK(const ComputeBaseBlock &baseBlock, int32_t &stepK) const +{ + if (stepK * baseBlock.baseK >= GetSingleK()) { + return; + } + constexpr static int32_t baseBlockSize512 = 512; + constexpr static int32_t baseBlockSize256 = 256; + int32_t aTypeBitSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize512) { + if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0) && + (baseBlockSize512 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { + while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0 && stepK > 1) { + stepK--; + } + } + } else if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize256) { + if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0) && + (baseBlockSize256 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { + while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0 && stepK > 1) { + stepK--; + } + } + } +} \ No newline at end of file -- Gitee From 128614dd37caf3b9bf137f089c73877ea9952de1 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 02:13:16 +0000 Subject: [PATCH 46/56] add2 Signed-off-by: jiangchengcheng-on --- .../tiling/matmul_tiling_algorithm_new2.cpp | 226 ++++++++++++++++++ 1 file changed, 226 insertions(+) diff --git a/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp b/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp index b1717f8e..13fb7754 100644 --- a/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp +++ b/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp @@ -673,4 +673,230 @@ void MatmulTilingAlgorithm::UpdateStepK(const ComputeBaseBlock &baseBlock, int32 } } } +} + +void MatmulTilingAlgorithm::CalcL1Tiling(const ComputeBaseBlock &baseBlock, int32_t &depthA1, int32_t &depthB1, + int32_t &stepKa, int32_t &stepKb) +{ + int32_t l1Size = tilingIns_->bufferPool_.l1Size; + constexpr static int32_t reservedL1Size = 256; // l1 reserved 256B + int32_t depthA1Size = (l1Size / DB_ON / baseBlock.baseM / baseBlock.baseK) * BITS_PER_BYTE / + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + int32_t depthB1Size = ((l1Size + reservedL1Size) / DB_ON / baseBlock.baseN / baseBlock.baseK) * BITS_PER_BYTE / + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); + int32_t btSize = tilingIns_->isBias ? tilingIns_->bufferPool_.btSize / BITS_PER_BYTE : 0; + if (depthA1Size + depthB1Size > l1Size - btSize) { + if (baseBlock.baseM <= baseBlock.baseN) { + depthA1Size = depthA1Size / DB_ON; + } else { + depthB1Size = depthB1Size / DB_ON; + } + } + int32_t l1Db = g_tempCfg.l1DB == DB_OFF ? 
DB_OFF : DB_ON; + stepKa = depthA1Size / l1Db; + stepKb = depthB1Size / l1Db; + UpdateStepK(baseBlock, stepKa); + UpdateStepK(baseBlock, stepKb); + if (stepKa >= stepKb && stepKb != 0) { + stepKa = stepKa / stepKb * stepKb; + } else if (stepKa != 0) { + stepKb = stepKb / stepKa * stepKa; + } + depthA1 = stepKa * l1Db; + depthB1 = stepKb * l1Db; +} + +L0StatusPack MatmulTilingAlgorithm::GetL0CoreStatus(const ComputeBaseBlock &baseBlock) const +{ + L0StatusPack l0Status; + const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + l0Status.dbL0C = g_tempCfg.l0cDB; + if (baseBlock.baseM * baseBlock.baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { + l0Status.dbL0C = DB_OFF; + } + l0Status.dbL0A = DB_ON; + l0Status.dbL0B = DB_ON; + l0Status.mL0 = baseBlock.baseM / C0_SIZE; + l0Status.kL0 = baseBlock.baseK / reduceSize; + l0Status.nL0 = baseBlock.baseN / C0_SIZE; + return l0Status; +} + +L1StatusPack MatmulTilingAlgorithm::GetL1CoreStatus(const ComputeBaseBlock &baseBlock, int32_t depthA1, int32_t depthB1, + int32_t stepKa, int32_t stepKb) const +{ + L1StatusPack l1Status; + l1Status.mAL1 = 1; + l1Status.nBL1 = 1; + const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + l1Status.kAL1 = baseBlock.baseK / reduceSize * stepKa; + l1Status.kBL1 = baseBlock.baseK / reduceSize * stepKb; + l1Status.dbAL1 = depthA1 >= stepKa * DB_ON ? DB_ON : DB_OFF; + l1Status.dbBL1 = depthB1 >= stepKb * DB_ON ? DB_ON : DB_OFF; + return l1Status; +} + +void MatmulTilingAlgorithm::UpdateShapeAndLayout() const +{ + tilingIns_->tiling_.set_M(tilingIns_->orgM); + tilingIns_->tiling_.set_N(tilingIns_->orgN); + tilingIns_->tiling_.set_Ka(tilingIns_->orgKa); + tilingIns_->tiling_.set_Kb(tilingIns_->orgKb); + tilingIns_->tiling_.set_batchM(tilingIns_->batchM); + tilingIns_->tiling_.set_batchN(tilingIns_->batchN); + tilingIns_->tiling_.set_singleBatchM(tilingIns_->singleBatchM); + tilingIns_->tiling_.set_singleBatchN(tilingIns_->singleBatchN); + + tilingIns_->tiling_.set_ALayoutInfoB(tilingIns_->aLayoutInfoB); + tilingIns_->tiling_.set_ALayoutInfoS(tilingIns_->aLayoutInfoS); + tilingIns_->tiling_.set_ALayoutInfoN(tilingIns_->aLayoutInfoN); + tilingIns_->tiling_.set_ALayoutInfoG(tilingIns_->aLayoutInfoG); + tilingIns_->tiling_.set_ALayoutInfoD(tilingIns_->aLayoutInfoD); + + tilingIns_->tiling_.set_BLayoutInfoB(tilingIns_->bLayoutInfoB); + tilingIns_->tiling_.set_BLayoutInfoS(tilingIns_->bLayoutInfoS); + tilingIns_->tiling_.set_BLayoutInfoN(tilingIns_->bLayoutInfoN); + tilingIns_->tiling_.set_BLayoutInfoG(tilingIns_->bLayoutInfoG); + tilingIns_->tiling_.set_BLayoutInfoD(tilingIns_->bLayoutInfoD); + + tilingIns_->tiling_.set_CLayoutInfoB(tilingIns_->cLayoutInfoB); + tilingIns_->tiling_.set_CLayoutInfoS1(tilingIns_->cLayoutInfoS1); + tilingIns_->tiling_.set_CLayoutInfoN(tilingIns_->cLayoutInfoN); + tilingIns_->tiling_.set_CLayoutInfoG(tilingIns_->cLayoutInfoG); + tilingIns_->tiling_.set_CLayoutInfoS2(tilingIns_->cLayoutInfoS2); + tilingIns_->tiling_.set_BatchNum(tilingIns_->batchNum); + return; +} + +void MatmulTilingAlgorithm::UpdateUsedSize() const +{ + int32_t transLength = 0; + GetTransLength(transLength); + int32_t a1LengthCache = 0; + int32_t b1LengthCache = 0; + SetDepthL1CacheUBParams(a1LengthCache, b1LengthCache); + tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse in ub + tilingIns_->tiling_.set_shareMode(0); + int32_t l1Size = 0; + int32_t l0cSize = 0; + int32_t ubSize = 0; + GetUsedSize(l1Size, l0cSize, 
ubSize, a1LengthCache, b1LengthCache); + tilingIns_->tiling_.set_shareL1Size(l1Size); + tilingIns_->tiling_.set_shareL0CSize(l0cSize); + tilingIns_->tiling_.set_shareUbSize(ubSize); +} + +int64_t MatmulTilingAlgorithm::AdjustOuterProductL0Factor(const SingleCoreStatus& singleCoreStatus) const +{ + if (tilingIns_->scheduleType != ScheduleType::OUTER_PRODUCT) { + return 0; + } + // check whether OUTER_PRODUCT is supported + if ((tilingIns_->tiling_.get_baseK() < tilingIns_->tiling_.get_singleCoreK()) && + ((tilingIns_->mmConfigType == 1) || ((tilingIns_->mmConfigType == 0) && + (tilingIns_->batchNum != 0)))) { + TILING_LOG_WARNING("Unsupported scheduleType is OUTER_PRODUCT"); + return -1L; + } + int32_t newBaseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; + int32_t newBaseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; + // when scheduleType is OUTER_PRODUCT, each iteration computes 2 * basicBlock size of data + bool isL0CFullUsed = (newBaseM * newBaseN * NUM_TWO * + static_cast(DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType))) > + static_cast(tilingIns_->bufferPool_.l0CSize) ? true : false; + if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 0)) { + // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_M, N db in L0 + newBaseN = MathUtil::Align(newBaseN / NUM_TWO, C0_SIZE); + } else if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 1)) { + // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_N, M db in L0 + newBaseM = MathUtil::Align(newBaseM / NUM_TWO, C0_SIZE); + } + tilingIns_->tiling_.set_baseM(newBaseM); + tilingIns_->tiling_.set_baseN(newBaseN); + return 0; +} + +void MatmulTilingAlgorithm::AdjustFloatL1Factor(const SingleCoreStatus& singleCoreStatus) const +{ + if (DTYPE_BYTE_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BYTE_TAB.at(DataType::DT_FLOAT)) { + if (tilingIns_->tiling_.get_baseK() == DT_FLOAT_INVALID_BASEK) { + tilingIns_->tiling_.set_stepKb(1); + tilingIns_->tiling_.set_depthB1(singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1); + } + } +} + +int64_t MatmulTilingAlgorithm::UpdateTiling(const MatmulRunParas& param, const CoreStatusPack &coreStatus, SingleCoreStatus& singleCoreStatus) +{ + int32_t coreUse = singelBlockDim_ ? 
tilingIns_->blockDim : + coreStatus.batchDim * coreStatus.mDim * coreStatus.kDim * coreStatus.nDim; + int32_t singleCoreM; + int32_t singleCoreN; + int32_t singleCoreK; + GetSingleShape(coreStatus, param, singleCoreM, singleCoreN, singleCoreK); + if (!CheckSingleShape(singleCoreM, singleCoreN, singleCoreK)) { + return -1L; + } + tilingIns_->tiling_.set_usedCoreNum(coreUse); + tilingIns_->tiling_.set_singleCoreM(singleCoreM); + tilingIns_->tiling_.set_singleCoreN(singleCoreN); + tilingIns_->tiling_.set_singleCoreK(singleCoreK); + UpdateShapeAndLayout(); + tilingIns_->tiling_.set_baseM(singleCoreStatus.l0Status.mL0 * C0_SIZE); + tilingIns_->tiling_.set_baseN(singleCoreStatus.l0Status.nL0 * C0_SIZE); + const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); + tilingIns_->tiling_.set_iterateOrder(GetIteratorOrder(singleCoreStatus, singleCoreM, singleCoreN, singleCoreK)); + // check whether OUTER_PRODUCT is supported + if (AdjustOuterProductL0Factor(singleCoreStatus) != 0) { + return -1L; + } + tilingIns_->baseM = tilingIns_->tiling_.get_baseM(); + tilingIns_->baseN = tilingIns_->tiling_.get_baseN(); + tilingIns_->baseK = tilingIns_->tiling_.get_baseK(); + AdjustMxL1Factors(singleCoreStatus, reduceSize); + int32_t mxTypePara = 0; + GetMxScaleFactor(singleCoreStatus, reduceSize, mxTypePara); + tilingIns_->tiling_.set_mxTypePara(mxTypePara); + tilingIns_->tiling_.set_depthA1( + MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); + tilingIns_->tiling_.set_depthB1(UpdateDepthB1(singleCoreStatus)); + // if decrease depthB1, nBL1 must decrease to ensure nBL1 is less then depthB1 + singleCoreStatus.l1Status.nBL1 = min(singleCoreStatus.l1Status.nBL1, tilingIns_->tiling_.get_depthB1()); + tilingIns_->tiling_.set_stepM(singleCoreStatus.l1Status.mAL1); + tilingIns_->tiling_.set_stepN(singleCoreStatus.l1Status.nBL1); + tilingIns_->tiling_.set_stepKa( + MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0)); + tilingIns_->tiling_.set_stepKb( + MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0)); + AdjustFloatL1Factor(singleCoreStatus); + tilingIns_->tiling_.set_isBias(tilingIns_->isBias ? 
1 : 0);
+    tilingIns_->tiling_.set_dbL0A(singleCoreStatus.l0Status.dbL0A);
+    tilingIns_->tiling_.set_dbL0B(singleCoreStatus.l0Status.dbL0B);
+    tilingIns_->tiling_.set_dbL0C(singleCoreStatus.l0Status.dbL0C);
+    UpdateUsedSize();
+    return 0;
+}
+
+bool MatmulTilingAlgorithm::DoMultiCoreSplitMNTiling(const MatmulRunParas& params, CoreStatusPack& coreStatus,
+    BlockDimCalculator& blockDimRes)
+{
+    if (GetMultiCoreScenario(params) != MultiCoreScenario::SPLIT_MN) {
+        return false;
+    }
+    ComputeBaseBlock baseBlock = GetMultiCoreBasicBlock(params); // calc basic block
+    CalcMultiCoreBlockDims(params, baseBlock, coreStatus, blockDimRes);
+    SingleCoreStatus singleCoreStatus;
+    singleCoreStatus.l0Status = GetL0CoreStatus(baseBlock);
+    AdjustSparseL0Factors(singleCoreStatus);
+    AdjustMxL0Factors(singleCoreStatus);
+    int32_t depthA1;
+    int32_t depthB1;
+    int32_t stepKa;
+    int32_t stepKb;
+    CalcL1Tiling(baseBlock, depthA1, depthB1, stepKa, stepKb);
+    singleCoreStatus.l1Status = GetL1CoreStatus(baseBlock, depthA1, depthB1, stepKa, stepKb);
+    (void)UpdateTiling(params, coreStatus, singleCoreStatus);
+    return true;
+}
\ No newline at end of file
-- 
Gitee


From 44be6025e60c7dfe1218406823c67eb0ce9d58d8 Mon Sep 17 00:00:00 2001
From: jiangchengcheng-on
Date: Tue, 20 May 2025 02:13:33 +0000
Subject: [PATCH 47/56] add2

Signed-off-by: jiangchengcheng-on
---
 .../tiling/matmul_tiling_algorithm_new2.cpp   | 751 +++++++++++++++++-
 1 file changed, 750 insertions(+), 1 deletion(-)

diff --git a/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp b/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp
index 13fb7754..8a81abcc 100644
--- a/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp
+++ b/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp
@@ -899,4 +899,753 @@ bool MatmulTilingAlgorithm::DoMultiCoreSplitMNTiling(const MatmulRunParas& param
     singleCoreStatus.l1Status = GetL1CoreStatus(baseBlock, depthA1, depthB1, stepKa, stepKb);
     (void)UpdateTiling(params, coreStatus, singleCoreStatus);
     return true;
-} \ No newline at end of file
+}
+
+bool MatmulTilingAlgorithm::NeedOutputAlign(int32_t m, int32_t n, int32_t k) const
+{
+    int32_t aTypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType);
+    int32_t bTypeSize = DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType);
+    int32_t cTypeSize = DTYPE_BIT_TAB.at(tilingIns_->cType_.dataType);
+    constexpr static int32_t outputRatio = 2;
+    bool needAlign = static_cast(n * m) * static_cast(outputRatio * cTypeSize) >
+        static_cast(n * k * aTypeSize) + static_cast(m * k * bTypeSize);
+    return needAlign;
+}
+
+void MatmulTilingAlgorithm::CalcMultiCoreBlockDims(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock,
+    CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes)
+{
+    auto factors = MathUtil::GetFactorPairs(numOfBlock_);
+    std::vector results;
+    for (const auto& factor : factors) {
+        results.push_back(CalcComputeIntensity(params, baseBlock, factor));
+    }
+    // sort the candidate results
+    std::sort(results.begin(), results.end());
+    for (auto v : results) {
+        TILING_LOG_DEBUG("intensity:%f, cycle: %f, band: %f, mDim: %d, nDim: %d\n",
+            v.avgIntensity, v.computeCycle, v.bandRatio, v.dimFactor.first, v.dimFactor.second);
+    }
+    coreStatus.batchDim = 1;
+    blockDimRes.nDimFactor = results[0].dimFactor.second;
+    blockDimRes.mDimFactor = results[0].dimFactor.first;
+    blockDimRes.kDimFactor = 1;
+    coreStatus.mDim = results[0].dimFactor.first;
+    coreStatus.nDim = results[0].dimFactor.second;
+    coreStatus.kDim = 1;
+    const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped,
blockDimRes.nDimFactor, false); + const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false); + int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; + bool needOutputAlign = NeedOutputAlign(m, n, GetSingleK()); + (void)AlignSingleShape((!tilingIns_->bType_.isTrans || needOutputAlign), n, coreStatus.nDim, bAlignSize, coreStatus.n); + (void)AlignSingleShape(tilingIns_->aType_.isTrans, m, coreStatus.mDim, aAlignSize, coreStatus.m); + blockDimRes.kNum = params.k32 / coreStatus.kDim * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16 + blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 + coreStatus.batch = params.batch32; + coreStatus.k = params.k32 / coreStatus.kDim; + TILING_LOG_DEBUG("CalcMultiCoreBlockDims, coreStatus m: %d n: %d k: %d", coreStatus.m, coreStatus.n, coreStatus.k); + // load size of A matrix is batch * m + // load size of B matrix is n + DimFactor blockDim(1, blockDimRes.mDimFactor, blockDimRes.kDimFactor, blockDimRes.nDimFactor); + GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); + return; +} + +void MatmulTilingAlgorithm::UpdateMultiCore(const std::string& opType, const MatmulRunParas& params, + CoreStatusPack& coreStatus, const BlockDimCalculator& blockDimRes) const +{ + (void)(opType); + // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. + coreStatus.batchDim = min(MathUtil::CeilDivision(params.batch32, coreStatus.batch), numOfBlock_); + coreStatus.nDim = min(MathUtil::CeilDivision(params.n32, coreStatus.n), numOfBlock_); + coreStatus.mDim = min(MathUtil::CeilDivision(params.m32, coreStatus.m), numOfBlock_); + + if (tilingIns_->enableSplitK_) { + coreStatus.kDim = min(MathUtil::CeilDivision(params.k32, coreStatus.k), numOfBlock_); + } else { + coreStatus.kDim = blockDimRes.kDimFactor; + } + UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? 
TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, + coreStatus); +} + +void MatmulTilingAlgorithm::UpdateBufferSize(const TilingPolicy policy, const CoreStatusPack& coreStatus) const +{ + if (policy == TilingPolicy::NO_POLICY) { + return; + } else if (policy == TilingPolicy::FIXED_B_TSCM) { + const int32_t bMatrixSize = + MathUtil::Align(coreStatus.n, MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE)) * coreStatus.k * + C0_SIZE * C0_BYTE_SIZE * 2; + tilingIns_->bufferPool_.l1Size -= bMatrixSize; + } else if (policy == TilingPolicy::FIXED_A_TSCM) { + const int32_t aMatrixSize = coreStatus.m * coreStatus.k * C0_SIZE * C0_BYTE_SIZE * 2; + tilingIns_->bufferPool_.l1Size -= aMatrixSize; + } else { + return; + } +} + +bool MatmulTilingAlgorithm::IsInvalidFactor(int32_t factor) const +{ + return factor > numOfBlock_ || factor <= 0; +} + +void MatmulTilingAlgorithm::AddOptimalFactors(const std::string& opType, const MatmulRunParas& params, + BlockDimCalculator& blockDimRes) const +{ + (void)(opType); + const int32_t coreNum = numOfBlock_; + // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) + const int32_t mnCore = MathUtil::CeilDivision(coreNum, params.batch32); + if (mnCore > 1) { + const float optPoint = static_cast(sqrt((params.m32 + 0.0f) / params.n32 * mnCore)); + const int32_t mdim = static_cast(ceil(optPoint)); + const int32_t ndim = static_cast(ceil(mnCore / optPoint)); + MathUtil::AddFactor(blockDimRes.mDimFactors, mdim); + MathUtil::AddFactor(blockDimRes.mDimFactors, ndim == 0 ? 1 : mnCore / ndim); + MathUtil::AddFactor(blockDimRes.nDimFactors, ndim); + MathUtil::AddFactor(blockDimRes.nDimFactors, mdim == 0 ? 1 : mnCore / mdim); + } +} + +void MatmulTilingAlgorithm::GenBlockDimsMapFactors(const std::string& opType, MatmulRunParas& params, + BlockDimCalculator& blockDimRes) const +{ + const int32_t coreNum = numOfBlock_; + blockDimRes.batchDimFactors.reserve(coreNum); + blockDimRes.mDimFactors.reserve(coreNum); + blockDimRes.nDimFactors.reserve(coreNum); + blockDimRes.kDimFactors.reserve(coreNum); + MathUtil::GetBlockFactors(blockDimRes.batchDimFactors, params.batch32, params.batchMapped, coreNum, + min(coreNum, params.batch32)); + MathUtil::GetBlockFactors(blockDimRes.mDimFactors, params.m32, params.mMapped, coreNum, min(coreNum, params.m32)); + MathUtil::GetBlockFactors(blockDimRes.nDimFactors, params.n32, params.nMapped, coreNum, min(coreNum, params.n32)); + // first get kDim candidate + if (!tilingIns_->enableSplitK_) { + blockDimRes.kDimFactors.push_back(1); + params.kMapped = params.k32; + } else { + MathUtil::GetBlockFactors(blockDimRes.kDimFactors, params.k32, params.kMapped, coreNum, coreNum); + } + AddOptimalFactors(opType, params, blockDimRes); +} + +void MatmulTilingAlgorithm::GetBlockDim(const std::string& opType, MatmulRunParas& params, CoreStatusPack& coreStatus, + BlockDimCalculator& blockDimRes) +{ + // get batchDim, kDim, mDim and nDim for single core + // support multi cores slicing along kDim + // single core batchDim, mDim, nDim, kDim is a factor of input batch, m, n, k + // multi-core strategy for mini shape's is different from other situations and requires preprocess + if (PreProcessMiniShape(opType, coreStatus, params, numOfBlock_, tilingIns_->enableSplitK_)) { + // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. 
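+        // Each dim is recomputed as ceil(total / per-core amount), e.g. batch32 = 8 with batch = 2 per core gives batchDim = 4.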
+ coreStatus.batchDim = MathUtil::CeilDivision(params.batch32, coreStatus.batch); + coreStatus.nDim = MathUtil::CeilDivision(params.n32, coreStatus.n); + coreStatus.mDim = MathUtil::CeilDivision(params.m32, coreStatus.m); + coreStatus.kDim = MathUtil::CeilDivision(params.k32, coreStatus.k); + UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : + TilingPolicy::NO_POLICY, + coreStatus); + splitCoreFlag_ = true; + return; + } + GenBlockDimsMapFactors(opType, params, blockDimRes); + for (const int32_t bFactor : blockDimRes.batchDimFactors) { + for (const int32_t nFactor : blockDimRes.nDimFactors) { + if (IsInvalidFactor(bFactor * nFactor)) { + continue; + } + for (const int32_t mFactor : blockDimRes.mDimFactors) { + if (IsInvalidFactor(bFactor * nFactor * mFactor)) { + continue; + } + for (const int32_t kFactor : blockDimRes.kDimFactors) { + if (IsInvalidFactor(bFactor * nFactor * mFactor * kFactor)) { + continue; + } + DimFactor blockDim(bFactor, mFactor, kFactor, nFactor); + GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); + } + } + } + } + + coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDimRes.batchDimFactor); + coreStatus.n = MathUtil::CeilDivision(params.n32, blockDimRes.nDimFactor); + coreStatus.m = MathUtil::CeilDivision(params.m32, blockDimRes.mDimFactor); + coreStatus.k = MathUtil::CeilDivision(params.k32, blockDimRes.kDimFactor); + if (g_tempCfg.factorSplit) { + const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped, blockDimRes.nDimFactor, false); + const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false); + const int32_t k = MathUtil::FindBestSingleCore(params.k32, params.kMapped, blockDimRes.kDimFactor, true); + const int32_t needCoreNum = static_cast(MathUtil::CeilDivision(params.batch32, coreStatus.batch) * + MathUtil::CeilDivision(params.n32, n) * + MathUtil::CeilDivision(params.m32, m) * + MathUtil::CeilDivision(params.k32, k)); + if (IsInvalidFactor(needCoreNum) == false) { + coreStatus.n = n; + coreStatus.m = m; + coreStatus.k = k; + } + } + + params.nonFactorK = params.k32 == params.kMapped ? false : true; + UpdateMultiCore(opType, params, coreStatus, blockDimRes); +} + +void MatmulTilingAlgorithm::NonFactorMap(const std::string& opType, MatmulRunParas& param, + BlockDimCalculator& blockDimRes) const +{ + (void)(opType); + param.batchMapped = param.batch32; + param.mMapped = param.m32; + param.kMapped = param.k32; + param.nMapped = param.n32; + // Split k will introduce atomic_add which can't be used with shift_inwards. + // Thus in split k mode, batch/m/n/ can't use non-factorial segmentation. 
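+    // So only kMapped is remapped when split-k is enabled; otherwise batchMapped/mMapped/nMapped are remapped instead.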
+ if (tilingIns_->enableSplitK_) { + // it is only necessary to consider the non-factor splitting of k when splitKFlag is true + int32_t kFactorLess64Cnt = 0; + int32_t kFactorLess1024Cnt = 0; + MathUtil::GetFactorCnt(param.k32, kFactorLess64Cnt, 1, L0_FACTOR_LIMIT); + MathUtil::GetFactorCnt(param.k32, kFactorLess1024Cnt, L0_FACTOR_LIMIT + 1, L1_FACTOR_LIMIT); + if ((param.k32 > L0_FACTOR_LIMIT && kFactorLess64Cnt <= L0_FACTOR_NUM_LIMIT) || + (param.k32 > L1_FACTOR_LIMIT && kFactorLess64Cnt + kFactorLess1024Cnt <= L1_FACTOR_NUM_LIMIT)) { + // Non-factors of the k dimension use a down-aligned number of powers of 2 + param.kMapped = MathUtil::MapShape(param.k32, false); + } + } else { + MathUtil::GetFactorCnt(param.batch32, blockDimRes.batchFactorCnt, 1, numOfBlock_); + if (param.batch32 > 1 && blockDimRes.batchFactorCnt <= L0_FACTOR_NUM_LIMIT) { + param.batchMapped = MathUtil::MapShape(param.batch32); + } + param.mMapped = MathUtil::MapShape(param.m32); + param.nMapped = MathUtil::MapShape(param.n32); + } +} + +void MatmulTilingAlgorithm::FillParam(MatmulRunParas& param) +{ + param.oriShapeM = tilingIns_->orgM; + param.oriShapeN = tilingIns_->orgN; + param.oriShapeKa = tilingIns_->orgKa; + param.oriShapeKb = tilingIns_->orgKb; + int32_t realM = 1; + int32_t realN = 1; + int32_t realK = 1; + + if (tilingIns_->singleCoreM != -1 || tilingIns_->singleCoreK != -1 || tilingIns_->singleCoreN != -1) { + realM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM; + realK = tilingIns_->singleCoreK != -1 ? tilingIns_->singleCoreK : tilingIns_->singleK; + realN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN; + singelBlockDim_ = true; + numOfBlock_ = 1; + } else { + realM = GetSingleM(); + realK = GetSingleK(); + realN = GetSingleN(); + singelBlockDim_ = false; + numOfBlock_ = tilingIns_->blockDim; + } + + const int32_t reduceBlockSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + param.k32 = MathUtil::CeilDivision(realK, reduceBlockSize); + param.m32 = MathUtil::CeilDivision(realM, C0_SIZE); + param.n32 = MathUtil::CeilDivision(realN, C0_SIZE); + param.mMapped = MathUtil::MapShape(param.m32, true); + param.kMapped = MathUtil::MapShape(param.k32, true); + param.nMapped = MathUtil::MapShape(param.n32, true); +} + +bool MatmulTilingAlgorithm::CheckFinaleParams(const CoreStatusPack& coreStatus) const +{ + (void)coreStatus; + const int32_t stepM = tilingIns_->tiling_.get_stepM(); + const int32_t stepN = tilingIns_->tiling_.get_stepN(); + const int32_t depthA1 = tilingIns_->tiling_.get_depthA1(); + const int32_t depthB1 = tilingIns_->tiling_.get_depthB1(); + + const int32_t l1Size = tilingIns_->tiling_.get_shareL1Size(); + const int32_t l0CSize = tilingIns_->tiling_.get_shareL0CSize(); + const int32_t uBSize = tilingIns_->tiling_.get_shareUbSize(); + + if (stepM == 0 || stepN == 0 || depthA1 == 0 || depthB1 == 0) { + TILING_LOG_WARNING("stepM/N depthA1/B1 should greate then zeros"); + return false; + } + + if (stepM > depthA1 || stepN > depthB1) { + TILING_LOG_WARNING("stepM/N should less then depthA1/B1"); + return false; + } + + if (l1Size > tilingIns_->bufferPool_.l1Size || l0CSize > tilingIns_->bufferPool_.l0CSize || + uBSize > tilingIns_->bufferPool_.ubSize) { + TILING_LOG_WARNING("L1/L0C/UB used size should less then L1Size/L0CSize/UbSize"); + return false; + } + + int dateDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + int32_t biasL1Size = tilingIns_->isBias ? 
+ tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE : 0; + if (!tilingIns_->isBMNKBmm && tilingIns_->tiling_.get_BatchNum() > 0 && + ((tilingIns_->tiling_.get_singleCoreM() * tilingIns_->tiling_.get_singleCoreK() + + tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK()) * + tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE + biasL1Size > + tilingIns_->bufferPool_.l1Size)) { + TILING_LOG_WARNING("a/b matrix size of batch mm should less then L1Size"); + return false; + } + + return true; +} + +void MatmulTilingAlgorithm::CheckL0DB(SingleCoreStatus& singleCoreStatus, const int32_t baseK) const +{ + int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; + int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; + if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.isTrans && + tilingIns_->aType_.scalePos == TPosition::TSCM) { + baseM = MathUtil::Align(singleCoreStatus.l0Status.mL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; + } + if (tilingIns_->bType_.type == CubeFormat::ND && !tilingIns_->bType_.isTrans && + tilingIns_->bType_.scalePos == TPosition::TSCM) { + baseN = MathUtil::Align(singleCoreStatus.l0Status.nL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; + } + if (baseM * baseK > tilingIns_->bufferPool_.l0ASize / DB_ON) { + singleCoreStatus.l0Status.dbL0A = DB_OFF; + } + if (baseN * baseK > tilingIns_->bufferPool_.l0BSize / DB_ON) { + singleCoreStatus.l0Status.dbL0B = DB_OFF; + } + if (baseM * baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { + singleCoreStatus.l0Status.dbL0C = DB_OFF; + } +} + +void MatmulTilingAlgorithm::GetMxUsedL1Size(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, + int32_t& dataUsedL1Size, int32_t& scaleUsedL1Size, int32_t& biasUsedL1Size) const +{ + int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; + int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; + int32_t baseK = singleCoreStatus.l0Status.kL0 * k0Size; + + int32_t depthA1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1; + int32_t depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; + dataUsedL1Size = depthA1 * baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE + + depthB1 * baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + // scale is fp8e8m0 + scaleUsedL1Size = depthA1 * baseM * baseK / SCALE_K_SIZE + + depthB1 * baseN * baseK / SCALE_K_SIZE; + // bias is fp32 + int32_t bias = tilingIns_->isBias ? 
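+ // bias is 1 when enabled and 0 otherwise; the bias buffer then holds baseN elements of the bias dtype.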
1 : 0; + biasUsedL1Size = bias * baseN * DTYPE_BIT_TAB.at(tilingIns_->biasType_.dataType) / BITS_PER_BYTE; +} + +void MatmulTilingAlgorithm::AdjustSparseL0Factors(SingleCoreStatus& singleCoreStatus) const +{ + // determine whether the scenario is sparse + if (!tilingIns_->isSparse_) { + TILING_LOG_DEBUG("Not sparse scenario does not need to adjust L0Factors."); + return; + } + + int32_t baseK = + singleCoreStatus.l0Status.kL0 * (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + constexpr int32_t sparseBaseKFac = 64; // baseK need to align to 64 on Sparse + if (baseK <= sparseBaseKFac) { + baseK = sparseBaseKFac; + } else { + baseK = MathUtil::AlignDown(baseK, sparseBaseKFac); + } + singleCoreStatus.l0Status.kL0 = + baseK / (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + + // check L0A/L0B/L0Csize for L0 DB + CheckL0DB(singleCoreStatus, baseK); +} + +void MatmulTilingAlgorithm::AdjustMxL0Factors(SingleCoreStatus& singleCoreStatus) const +{ + // Determine wherther the scenario is MX. + if (tilingIns_->madType_ != MatrixMadType::MXMODE) { + return; + } + if (!tilingIns_->aType_.hasSetScaleType) { + tilingIns_->aType_.scalePos = tilingIns_->aType_.pos; + tilingIns_->aType_.scaleType = tilingIns_->aType_.type; + tilingIns_->aType_.isScaleTrans = tilingIns_->aType_.isTrans; + } + if (!tilingIns_->bType_.hasSetScaleType) { + tilingIns_->bType_.scalePos = tilingIns_->bType_.pos; + tilingIns_->bType_.scaleType = tilingIns_->bType_.type; + tilingIns_->bType_.isScaleTrans = tilingIns_->bType_.isTrans; + } + // In the NZ scenario, ensure that the base size of the inner axis is 64-aligned downwards. + constexpr int32_t l0Factor = INT4_ALIGN_SIZE / C0_SIZE; + if (tilingIns_->aType_.type == CubeFormat::NZ && tilingIns_->aType_.isTrans) { + if (singleCoreStatus.l0Status.mL0 > l0Factor) { + singleCoreStatus.l0Status.mL0 = singleCoreStatus.l0Status.mL0 / l0Factor * l0Factor; + } + } + if (tilingIns_->bType_.type == CubeFormat::NZ && !tilingIns_->bType_.isTrans) { + if (singleCoreStatus.l0Status.nL0 > l0Factor) { + singleCoreStatus.l0Status.nL0 = singleCoreStatus.l0Status.nL0 / l0Factor * l0Factor; + } + } + // FP8 baseK need must be 64 element aligned + int32_t baseK = + singleCoreStatus.l0Status.kL0 * (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if ((tilingIns_->aType_.dataType == DataType::DT_FLOAT8_E5M2 || + tilingIns_->aType_.dataType == DataType::DT_FLOAT8_E4M3FN) && + (tilingIns_->bType_.dataType == DataType::DT_FLOAT8_E5M2 || + tilingIns_->bType_.dataType == DataType::DT_FLOAT8_E4M3FN)) { + baseK = baseK <= MX_BASEK_FACTOR ? 
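+ // Round baseK up to MX_BASEK_FACTOR when it is smaller, otherwise align it down to a multiple of MX_BASEK_FACTOR.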
MX_BASEK_FACTOR : MathUtil::AlignDown(baseK, MX_BASEK_FACTOR);
+ singleCoreStatus.l0Status.kL0 =
+ baseK / (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE);
+ }
+ bool mL0NeedAlign = tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.isTrans &&
+ tilingIns_->aType_.scalePos == TPosition::TSCM;
+ if (mL0NeedAlign) {
+ singleCoreStatus.l0Status.mL0 = MathUtil::Align(singleCoreStatus.l0Status.mL0, L0_FACTOR_NUM_LIMIT);
+ }
+ bool nL0NeedAlign = tilingIns_->bType_.type == CubeFormat::ND && !tilingIns_->bType_.isTrans &&
+ tilingIns_->bType_.scalePos == TPosition::TSCM;
+ if (nL0NeedAlign) {
+ singleCoreStatus.l0Status.nL0 = MathUtil::Align(singleCoreStatus.l0Status.nL0, L0_FACTOR_NUM_LIMIT);
+ }
+ // check L0A/L0B/L0CSize for L0DB
+ CheckL0DB(singleCoreStatus, baseK);
+}
+
+void MatmulTilingAlgorithm::AdjustMxL1Factors(SingleCoreStatus& singleCoreStatus, const int32_t k0Size) const
+{
+ // determine whether the scenario is MX
+ if (tilingIns_->madType_ != MatrixMadType::MXMODE) {
+ return;
+ }
+ int32_t dataUsedL1Size = 0;
+ int32_t scaleUsedL1Size = 0;
+ int32_t biasUsedL1Size = 0;
+ GetMxUsedL1Size(singleCoreStatus, k0Size, dataUsedL1Size, scaleUsedL1Size, biasUsedL1Size);
+ // If the current tiling policy would make L1 usage exceed the L1 capacity, fall back to the basic
+ // policy, i.e. only baseM * baseK + baseN * baseK is cached in L1.
+ if (dataUsedL1Size + scaleUsedL1Size + biasUsedL1Size > tilingIns_->bufferPool_.l1Size) {
+ // The tiling is invalid here, so use the minimum tiling policy.
+ singleCoreStatus.l1Status.kAL1 = singleCoreStatus.l0Status.kL0;
+ singleCoreStatus.l1Status.kBL1 = singleCoreStatus.l0Status.kL0;
+ singleCoreStatus.l1Status.mAL1 = 1;
+ singleCoreStatus.l1Status.nBL1 = 1;
+ }
+}
+
+void MatmulTilingAlgorithm::GetMxScaleFactor(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, int32_t& mxTypePara) const
+{
+ // determine whether the scenario is MX
+ if (tilingIns_->madType_ != MatrixMadType::MXMODE) {
+ return;
+ }
+ int32_t dataUsedL1Size = 0;
+ int32_t scaleUsedL1Size = 0;
+ int32_t biasUsedL1Size = 0;
+ GetMxUsedL1Size(singleCoreStatus, k0Size, dataUsedL1Size, scaleUsedL1Size, biasUsedL1Size);
+
+ uint8_t scaleFactorA = 1;
+ uint8_t scaleFactorB = 1;
+ int32_t remainedL1Size = tilingIns_->bufferPool_.l1Size - (dataUsedL1Size + biasUsedL1Size);
+ int32_t singleCoreK = tilingIns_->tiling_.get_singleCoreK();
+ int32_t stepKa = MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0);
+ int32_t stepKb = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0);
+ int32_t baseK = singleCoreStatus.l0Status.kL0 * k0Size;
+ int32_t kStep = MathUtil::CeilDivision(singleCoreK, baseK);
+ uint8_t maxScaleFactorA = static_cast<uint8_t>(MathUtil::CeilDivision(kStep, stepKa));
+ uint8_t maxScaleFactorB = static_cast<uint8_t>(MathUtil::CeilDivision(kStep, stepKb));
+ int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE;
+ int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE;
+
+ // Scale caching is only supported in the K direction; scale double buffering matches the data.
+ scaleFactorA = static_cast<uint8_t>(remainedL1Size / MX_L1_BUFFER_NUM / (stepKa * baseM * baseK / SCALE_K_SIZE));
+ scaleFactorB = static_cast<uint8_t>(remainedL1Size / MX_L1_BUFFER_NUM / (stepKb * baseN * baseK / SCALE_K_SIZE));
+ scaleFactorA = scaleFactorA > maxScaleFactorA ? maxScaleFactorA : scaleFactorA;
+ scaleFactorB = scaleFactorB > maxScaleFactorB ?
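+ // Clamp the B scale factor to its upper bound, mirroring the clamp applied to the A scale factor above.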
maxScaleFactorB : scaleFactorB; + + // scaleFactor is in range of [1, 127] + scaleFactorA = scaleFactorA >= static_cast(1) ? scaleFactorA : static_cast(1); + scaleFactorB = scaleFactorB >= static_cast(1) ? scaleFactorB : static_cast(1); + scaleFactorA = scaleFactorA <= SCALE_FACTOR_MAX_VALUE ? scaleFactorA : SCALE_FACTOR_MAX_VALUE; + scaleFactorB = scaleFactorB <= SCALE_FACTOR_MAX_VALUE ? scaleFactorB : SCALE_FACTOR_MAX_VALUE; + + // 8bit: 0~6bit:scaleFactor, 7bit(reserved):double buffer flag + scaleFactorA = scaleFactorA & static_cast(0x7f); + scaleFactorB = scaleFactorB & static_cast(0x7F); + mxTypePara = static_cast(static_cast(mxTypePara) | scaleFactorA); + mxTypePara = static_cast(static_cast(mxTypePara) | static_cast(scaleFactorB << 8U)); +} + +void MatmulTilingAlgorithm::PreprocessL0DB() +{ + dbL0A_ = g_tempCfg.l0aDB; + dbL0B_ = g_tempCfg.l0bDB; + dbL0C_ = g_tempCfg.l0cDB; + if (tilingIns_->baseM != -1) { + const int32_t baseLeftSize = tilingIns_->baseM * C0_BYTE_SIZE; + if (baseLeftSize > tilingIns_->bufferPool_.l0ASize / DB_ON) { + dbL0A_ = DB_OFF; + } + } + if (tilingIns_->baseN != -1) { + const int32_t baseRightSize = tilingIns_->baseN * C0_BYTE_SIZE; + if (baseRightSize > tilingIns_->bufferPool_.l0BSize / DB_ON) { + dbL0B_ = DB_OFF; + } + } + if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) { + const int32_t baseMatrixSize = tilingIns_->baseM * tilingIns_->baseN * C0_BYTE_SIZE; + if (baseMatrixSize > tilingIns_->bufferPool_.l0CSize / DB_ON) { + dbL0C_ = DB_OFF; + } + } + return; +} + +void MatmulTilingAlgorithm::SetDepthL1CacheUBParams(int32_t &a1LengthCache, int32_t &b1LengthCache) const +{ + if (!tilingIns_->enableL1CacheUB || + tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND310P) { + return; + } + int32_t a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + int32_t b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + a1LengthCache = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); + b1LengthCache = b1Length * tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); + int32_t freeL1Size = tilingIns_->bufferPool_.l1Size - tilingIns_->tiling_.get_depthA1() * a1Length - + tilingIns_->tiling_.get_depthB1() * b1Length; + if (freeL1Size <= 0) { + return; + } + const int32_t splitNum = 2; + int32_t aOrgShapeSize = tilingIns_->tiling_.get_singleCoreM() * tilingIns_->tiling_.get_singleCoreK(); + int32_t bOrgShapeSize = tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK(); + + if ((tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) && + (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM)) { + bool aFullLoad = false; + bool bFullLoad = false; + aFullLoad = aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size / splitNum; + bFullLoad = bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size / splitNum; + if (aFullLoad && bFullLoad) { + tilingIns_->tiling_.set_depthAL1CacheUB(1); + tilingIns_->tiling_.set_depthBL1CacheUB(1); + a1LengthCache = aOrgShapeSize; // update + b1LengthCache = bOrgShapeSize; + } else if (aFullLoad) { + tilingIns_->tiling_.set_depthAL1CacheUB(1); + a1LengthCache = aOrgShapeSize; + int32_t depthL1CacheUB = b1LengthCache > 0 ? 
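+ // Cache depth for B in the L1 space left after fully caching A; 0 when b1LengthCache is 0.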
(freeL1Size - aOrgShapeSize) / b1LengthCache : 0; + tilingIns_->tiling_.set_depthBL1CacheUB(depthL1CacheUB); + } else if (bFullLoad) { + tilingIns_->tiling_.set_depthBL1CacheUB(1); + b1LengthCache = bOrgShapeSize; + int32_t depthL1CacheUB = a1LengthCache > 0 ? (freeL1Size - bOrgShapeSize) / a1LengthCache : 0; + tilingIns_->tiling_.set_depthAL1CacheUB(depthL1CacheUB); + } else { + if (a1LengthCache > freeL1Size) { + int32_t depthBL1CacheUB = b1LengthCache > 0 ? freeL1Size / b1LengthCache : 0; + tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); + } else if (b1LengthCache > freeL1Size) { + int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / a1LengthCache : 0; + tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); + } else if (a1LengthCache <= freeL1Size / splitNum && b1LengthCache <= freeL1Size / splitNum) { + int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / splitNum / a1LengthCache : 0; + int32_t depthBL1CacheUB = b1LengthCache > 0 ? freeL1Size / splitNum / b1LengthCache : 0; + tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); + tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); + } else { + // can only cache one matrix + if (a1LengthCache <= b1LengthCache) { + tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); + } else { + tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); + } + } + } + } else if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) { + if (aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size) { + tilingIns_->tiling_.set_depthAL1CacheUB(1); + a1LengthCache = aOrgShapeSize; + } else if (a1LengthCache > 0) { + tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); + } + } else if (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM) { + if (bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size) { + tilingIns_->tiling_.set_depthBL1CacheUB(1); + b1LengthCache = bOrgShapeSize; + } else if (b1LengthCache > 0) { + tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); + } + } else { + return; + } +} + +int MatmulTilingAlgorithm::UpdateDepthB1(const SingleCoreStatus& singleCoreStatus) const +{ + int depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; + // only bType is f32 need update + if (tilingIns_->bType_.dataType != DataType::DT_FLOAT + || tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { + return depthB1; + } + uint16_t alignedBaseK = MathUtil::CeilDivision(tilingIns_->baseK, FP32_ALIGN_SIZE) * FP32_ALIGN_SIZE; + uint16_t alignedBaseKN = alignedBaseK * tilingIns_->baseN; + + uint16_t alignedBaseKM = tilingIns_->baseK * tilingIns_->baseM; + if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) { + alignedBaseKM = alignedBaseK * tilingIns_->baseM; + } + // if L1 size is overflow, decrease depthB1 + if ((tilingIns_->tiling_.get_depthA1() *alignedBaseKM + alignedBaseKN * depthB1) * sizeof(float) + > static_cast(tilingIns_->bufferPool_.l1Size)) { + depthB1 = tilingIns_->baseN * tilingIns_->baseK * depthB1 / alignedBaseKN; + depthB1 = depthB1 < 1 ? 1 : depthB1; + } + return depthB1; +} +int32_t MatmulTilingAlgorithm::GetSingleM() const +{ + return tilingIns_->singleM != -1 ? tilingIns_->singleM : tilingIns_->orgM; +} +int32_t MatmulTilingAlgorithm::GetSingleN() const +{ + return tilingIns_->singleN != -1 ? 
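+ // Return the user-specified singleN when it is set, otherwise fall back to the original N.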
tilingIns_->singleN : tilingIns_->orgN; +} +int32_t MatmulTilingAlgorithm::GetSingleK() const +{ + return tilingIns_->singleK != -1 ? tilingIns_->singleK : tilingIns_->orgKa; +} +void MatmulTilingAlgorithm::GetSingleShape(const CoreStatusPack &coreStatus, const MatmulRunParas ¶m, + int32_t &singleCoreM, int32_t &singleCoreN, int32_t &singleCoreK) const +{ + singleCoreM = GetSingleM(); + singleCoreM = MathUtil::CeilDivision(singleCoreM, coreStatus.mDim); + singleCoreN = GetSingleN(); + singleCoreN = MathUtil::CeilDivision(singleCoreN, coreStatus.nDim); + singleCoreK = GetSingleK(); + singleCoreK = MathUtil::CeilDivision(singleCoreK, coreStatus.kDim); + if (singelBlockDim_) { + singleCoreM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM; + singleCoreN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN; + singleCoreK = tilingIns_->singleCoreK != -1 ? tilingIns_->singleCoreK : tilingIns_->singleK; + } + if (numOfBlock_ > 1) { + int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; + bool needAlign = GetMultiCoreScenario(param) == MultiCoreScenario::SPLIT_MN; + bool needOutputAlign = NeedOutputAlign(singleCoreM, singleCoreN, singleCoreK); + (void)AlignSingleShape(needAlign && (!tilingIns_->bType_.isTrans || needOutputAlign), param.n32 * C0_SIZE, coreStatus.nDim, + bAlignSize, singleCoreN); + (void)AlignSingleShape(needAlign && tilingIns_->aType_.isTrans, param.m32 * C0_SIZE, coreStatus.mDim, + aAlignSize, singleCoreM); + if (tilingIns_->enableSplitK_) { + if (tilingIns_->aType_.dataType == DataType::DT_FLOAT || + tilingIns_->bType_.dataType == DataType::DT_FLOAT) { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * FLOAT32_REDUCE_BLOCK_SIZE; + } else if ((tilingIns_->aType_.dataType == DataType::DT_INT8 || + tilingIns_->bType_.dataType == DataType::DT_INT8)) { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT8_REDUCE_BLOCK_SIZE; + } else if ((tilingIns_->aType_.dataType == DataType::DT_INT4 || + tilingIns_->bType_.dataType == DataType::DT_INT4)) { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT4_REDUCE_BLOCK_SIZE; + } else { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * REDUCE_BLOCK_SIZE; + } + } + } +} + +bool MatmulTilingAlgorithm::CheckSingleShape(int32_t singleCoreM, int32_t singleCoreN, int32_t singleCoreK) const +{ + (void)singleCoreM; + (void)singleCoreK; + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { + // ub only can process with 32B aligned, if format is ND, and D non-aligned output can't pad + if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type == CubeFormat::ND && + (singleCoreN * DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType)) % C0_BYTE_SIZE != 0) { + TILING_LOG_INFO("for ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " + "aligned, matrix c not support ND format"); + return false; + } + } + return true; +} + +int64_t MatmulTilingAlgorithm::Process() +{ + PreprocessL0DB(); + if (!CheckBaseMN()) { + TILING_LOG_WARNING("check baseM/baseN not pass"); + return -1; + } + singelBlockDim_ = false; + splitCoreFlag_ = false; + CoreStatusPack coreStatus; + SingleCoreStatus singleCoreStatus; + MatmulRunParas param; + BlockDimCalculator 
blockDimRes; + FillParam(param); + + std::string opType = "MatMul"; + if (numOfBlock_ != 1) { + NonFactorMap(opType, param, blockDimRes); + if (DoMultiCoreSplitMNTiling(param, coreStatus, blockDimRes)) { + return 0; + } + GetBlockDim(opType, param, coreStatus, blockDimRes); + } else { + if (!g_tempCfg.factorSplit) { + coreStatus.m = param.m32; + coreStatus.k = param.k32; + coreStatus.n = param.n32; + } else { + coreStatus.m = MathUtil::FindBestSingleCore(param.m32, param.mMapped, 1, false); + coreStatus.k = MathUtil::FindBestSingleCore(param.k32, param.kMapped, 1, false); + coreStatus.n = MathUtil::FindBestSingleCore(param.n32, param.nMapped, 1, false); + } + coreStatus.batchDim = 1; + coreStatus.mDim = 1; + coreStatus.kDim = 1; + coreStatus.nDim = 1; + } + + if (numOfBlock_ != 1 && tilingIns_->bType_.pos == TPosition::TSCM) { + if (!splitCoreFlag_) { + TILING_LOG_WARNING("Multi core split B TSCM full loaded is not sucess."); + return 1; + } + } + // single-core logic + GetL0Factors(opType, param, coreStatus, singleCoreStatus); + AdjustSparseL0Factors(singleCoreStatus); + AdjustMxL0Factors(singleCoreStatus); + if (singleCoreStatus.l0Status.mL0 == 0 || singleCoreStatus.l0Status.nL0 == 0 || + singleCoreStatus.l0Status.kL0 == 0) { + TILING_LOG_WARNING("ml0/nl0/kl0 is zero"); + return -1; + } + GetL1Factors(opType, param, coreStatus, singleCoreStatus.l0Status, singleCoreStatus.l1Status); + if (UpdateTiling(param, coreStatus, singleCoreStatus) == -1L) { + return -1L; + } + const bool ans = CheckFinaleParams(coreStatus); + return ans ? 0 : -1; +} +} // namespace matmul_tiling \ No newline at end of file -- Gitee From e8ac8f6d3076be372d37d0bf38c26f9c74b18590 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 02:14:52 +0000 Subject: [PATCH 48/56] =?UTF-8?q?=E5=88=AA=E9=99=A4=E6=96=87=E4=BB=B6=20im?= =?UTF-8?q?pl/matmul/tiling/matmul=5Ftiling=5Falgorithm.cpp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../matmul/tiling/matmul_tiling_algorithm.cpp | 3047 ----------------- 1 file changed, 3047 deletions(-) delete mode 100644 impl/matmul/tiling/matmul_tiling_algorithm.cpp diff --git a/impl/matmul/tiling/matmul_tiling_algorithm.cpp b/impl/matmul/tiling/matmul_tiling_algorithm.cpp deleted file mode 100644 index f4328ad9..00000000 --- a/impl/matmul/tiling/matmul_tiling_algorithm.cpp +++ /dev/null @@ -1,3047 +0,0 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file matmul_tiling_algorithm.cpp - * \brief - */ -#include "matmul_tiling_algorithm.h" - -#include -#include -#include -#include -#include - -#include "securec.h" - -#include "impl/host_log.h" -#include "math_util.h" - -using namespace std; - -namespace matmul_tiling { -constexpr uint32_t IDX_ZERO = 0; -constexpr uint32_t IDX_ONE = 1; -constexpr uint32_t IDX_TWO = 2; -constexpr uint32_t IDX_THREE = 3; -constexpr uint32_t IDX_FOUR = 4; -constexpr uint32_t IDX_FIVE = 5; -constexpr uint32_t IDX_SIX = 6; -constexpr uint32_t IDX_SEVEN = 7; -constexpr int32_t MAX_BIAS_N = 16; -constexpr int32_t MTE1_L0A_BANDWIDTH = 256; -constexpr int32_t MTE1_L0B_BANDWIDTH = 128; -constexpr int32_t INPUTDTYPE_BYTES = 2; -constexpr int32_t MIN_MTE1_LOAD = 32; -constexpr int32_t REDUCE_BLOCK_SIZE = 16; -constexpr int32_t INT8_REDUCE_BLOCK_SIZE = 32; -constexpr int32_t INT4_REDUCE_BLOCK_SIZE = 64; -constexpr int32_t FLOAT32_REDUCE_BLOCK_SIZE = 8; -constexpr int32_t MIN_FRACTAL_SIZE = C0_SIZE * REDUCE_BLOCK_SIZE; -constexpr uint32_t BEST_VALUE_LENGTH = 13; -constexpr int32_t BEST_VALUE_LIST[BEST_VALUE_LENGTH] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; -constexpr uint32_t DIM_FACTOR_LENGTH = 4; -constexpr uint64_t UINT64_TYPES = 8; -constexpr int32_t L0B_ALIGN_SIZE = 2; - -// minimum factor number requirement for the data amount in single-core -constexpr int32_t L0_FACTOR_NUM_LIMIT = 2; -constexpr int32_t L1_FACTOR_NUM_LIMIT = 4; -// the lower bound of the factor number check -constexpr int32_t L0_FACTOR_LIMIT = 64; -constexpr int32_t L1_FACTOR_LIMIT = 128; - -static MatmulTemplateCfg g_tempCfg; - -constexpr int32_t MTE1_FIXPIPE_BANDWIDTH = 128; // 128 Bytes per cycle - -constexpr int32_t NUM_TWO = 2; - -constexpr int32_t ATTACH_FLAG_ZERO = 0; -constexpr int32_t ATTACH_FLAG_ONE = 1; -constexpr int32_t ATTACH_FLAG_TWO = 2; - -constexpr int32_t INT8_ALIGN_SIZE = 32; -constexpr int32_t FP32_ALIGN_SIZE = 16; -constexpr int32_t INT4_ALIGN_SIZE = 64; -constexpr int32_t DATA_COPY_ALIGN_SIZE = 256; // 256B - -constexpr int DT_FLOAT_INVALID_BASEK = 8; - -MatmulTilingAlgorithm::MatmulTilingAlgorithm(MatmulApiTilingBase* tilingIns) -{ - ASCENDC_HOST_ASSERT(tilingIns != nullptr, {}, "tiling instance is null"); - tilingIns_ = tilingIns; -} - -int32_t MatmulTilingAlgorithm::GetBestValue(int32_t base) const -{ - for (uint32_t i = 0; i < BEST_VALUE_LENGTH; ++i) { - if (i == 0 || BEST_VALUE_LIST[i] <= base) { - continue; - } - return BEST_VALUE_LIST[i - 1]; - } - return BEST_VALUE_LIST[BEST_VALUE_LENGTH - 1]; -} - -void MatmulTilingAlgorithm::GetTwoFactors(int32_t (&res)[2], int32_t base, int32_t dim, int32_t maxNum) const -{ - if (dim == 1) { - res[0] = 1; - res[1] = 1; - return; - } - - res[0] = 0; - res[1] = 0; - - int cnt = 0; - for (auto up = base + 1; up <= maxNum && up <= dim; ++up) { - if (dim % up == 0) { - res[cnt++] = up; - break; - } - } - - for (auto down = base; down >= 1; --down) { - if (dim % down == 0) { - res[cnt++] = down; - if (cnt == sizeof(res) / sizeof(res[0])) { - break; - } - } - } -} - -void MatmulTilingAlgorithm::GetABL1KAlignValue(int32_t& kaAlignValue, int32_t& kbAlignValue) const -{ - kaAlignValue = 1; - kbAlignValue = 1; - if (tilingIns_->aType_.dataType == DataType::DT_FLOAT || tilingIns_->bType_.dataType == DataType::DT_FLOAT) { - // when in FP32 mode, k_a must be an even number if k-alignment is needed. So make ka_align_value as 2. - kaAlignValue = tilingIns_->aType_.isTrans ? 
2 : 1; - // Same as previous one, make kb_align_value as 2 when k-alignment is needed - kbAlignValue = (tilingIns_->aType_.isTrans || !tilingIns_->bType_.isTrans) ? 2 : 1; - } -} - -void MatmulTilingAlgorithm::GetL0StatusFromParasCombo(L0StatusPack& l0Status, int32_t* parasCombo) const -{ - l0Status.InitLoadStatus(); - size_t kIdx = 0; - l0Status.dbL0A = parasCombo[kIdx++]; - l0Status.dbL0B = parasCombo[kIdx++]; - l0Status.dbL0C = parasCombo[kIdx++]; - l0Status.maxMk = parasCombo[kIdx++]; - l0Status.maxNk = parasCombo[kIdx++]; - l0Status.maxMn = parasCombo[kIdx++]; - l0Status.maxAxisIdx = parasCombo[kIdx++]; - l0Status.maxAxisNum = parasCombo[kIdx++]; - l0Status.maxAxisPnt = parasCombo[kIdx++]; - l0Status.maxN = parasCombo[kIdx++]; - l0Status.maxAxisPnt = min(l0Status.maxAxisPnt, l0Status.maxAxisNum); -} - -void MatmulTilingAlgorithm::SetResFactors(L0Factors& resFactors, const L0StatusPack& l0Status) const -{ - resFactors.finalML0 = l0Status.finalML0; - resFactors.finalKL0 = l0Status.finalKL0; - resFactors.finalNL0 = l0Status.finalNL0; - resFactors.finalLoadSize = l0Status.finalLoadSize; - resFactors.finalL0cUse = l0Status.finalL0cUse; - resFactors.finalMte1Loop = l0Status.finalMte1Loop; - resFactors.finalMul = l0Status.finalMul; - resFactors.finalMte1Cycles = l0Status.finalMte1Cycles; -} - -int32_t MatmulTilingAlgorithm::GetLoadSize(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status) const -{ - const bool al0FullLoad = - (static_cast(coreStatus.m * coreStatus.k) * static_cast(C0_SIZE * C0_BYTE_SIZE)) <= - static_cast(tilingIns_->bufferPool_.l0ASize); - const bool bl0FullLoad = - (static_cast(coreStatus.n * coreStatus.k) * static_cast(C0_SIZE * C0_BYTE_SIZE)) <= - static_cast(tilingIns_->bufferPool_.l0BSize); - const bool kFullLoad = (l0Status.kL0 >= coreStatus.k); - if (al0FullLoad || bl0FullLoad) { - return coreStatus.m + coreStatus.n; - } else if (kFullLoad) { - return min(coreStatus.n + MathUtil::CeilDivision(coreStatus.n, l0Status.nL0) * coreStatus.m, - coreStatus.m + MathUtil::CeilDivision(coreStatus.m, l0Status.mL0) * coreStatus.n); - } else { - return MathUtil::CeilDivision(coreStatus.m, l0Status.mL0) * coreStatus.n + - MathUtil::CeilDivision(coreStatus.n, l0Status.nL0) * coreStatus.m; - } -} - -bool MatmulTilingAlgorithm::CheckBaseMNKL1Size(SingleCoreStatus& singleCoreStatus) const -{ - L0StatusPack& l0Status = singleCoreStatus.l0Status; - int32_t a1Length = static_cast(l0Status.mL0 * l0Status.kL0 * C0_SIZE * C0_BYTE_SIZE); - int32_t b1Length = static_cast(l0Status.nL0 * l0Status.kL0 * C0_SIZE * C0_BYTE_SIZE); - int32_t biasLength = (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) - ? l0Status.nL0 * C0_SIZE * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType) - : 0; - int32_t dequantSize = 0; - if (tilingIns_->deqType == DequantType::TENSOR) { - dequantSize = l0Status.nL0 * C0_SIZE * UINT64_TYPES; - } - if (tilingIns_->aType_.pos == TPosition::TSCM) { - a1Length = 0; - } - if (tilingIns_->bType_.pos == TPosition::TSCM) { - b1Length = 0; - } - // Only V220/V300 bias uses L1 space. 
- if (tilingIns_->biasType_.pos == TPosition::TSCM || !tilingIns_->isBias) { - biasLength = 0; - } - const int32_t totalLength = a1Length + b1Length + biasLength + dequantSize; - return totalLength <= tilingIns_->bufferPool_.l1Size; -} - -bool MatmulTilingAlgorithm::CheckK0Align(int32_t k0) const -{ - if ((tilingIns_->aType_.dataType == DataType::DT_FLOAT && tilingIns_->aType_.type == CubeFormat::NZ && - tilingIns_->aType_.isTrans) || - (tilingIns_->bType_.dataType == DataType::DT_FLOAT && tilingIns_->bType_.type == CubeFormat::NZ && - !tilingIns_->bType_.isTrans)) { - return k0 % NUM_TWO == 0; - } - return true; -} - -void MatmulTilingAlgorithm::GetFinalMkn(SingleCoreStatus& singleCoreStatus, const CoreStatusPack& coreStatus, - const int32_t& k0, const int32_t& majorDimFactor, const int32_t& minorDimFactor) const -{ - if (k0 == 0) { - return; - } - L0StatusPack& l0Status = singleCoreStatus.l0Status; - if (l0Status.maxAxisIdx == 0) { - l0Status.mL0 = majorDimFactor; - l0Status.nL0 = minorDimFactor; - } else { - l0Status.mL0 = minorDimFactor; - l0Status.nL0 = majorDimFactor; - } - l0Status.kL0 = k0; - const float tmpL0cUse = static_cast(l0Status.mL0 * l0Status.nL0 * l0Status.dbL0C * - MIN_FRACTAL_SIZE * FP32_BYTES * 1.0 / tilingIns_->bufferPool_.l0CSize); - // NUM_TWO means L0A and L0B double buffer is default-on. - - const int32_t tmpMte1Cycle = - max(2 * 3, l0Status.mL0 * l0Status.kL0 * C0_SIZE * C0_BYTE_SIZE / MTE1_L0A_BANDWIDTH) + - max(2 * 3, l0Status.kL0 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE / MTE1_L0B_BANDWIDTH); - const int32_t tmpMadCycle = l0Status.mL0 * l0Status.kL0 * l0Status.nL0; // (m<=4 or n<=2:tmpMte1Cycle > tmpMadCycle) - const int32_t tmpLoadSize = GetLoadSize(coreStatus, l0Status); - // calculate load2d loop: A splitK for K loop; B split K for m loop as to V100 - const int32_t tmpMte1Loop = ((l0Status.nL0 != 1) ? l0Status.kL0 : 1) + ((l0Status.kL0 != 1) ? l0Status.mL0 : 1); - - const bool condition1 = l0Status.finalML0 == 0; - const bool condition2 = - (tmpLoadSize < l0Status.finalLoadSize) || (tmpMte1Cycle < tmpMadCycle && !l0Status.updateUsingMte1); - const bool condition3 = (tmpLoadSize == l0Status.finalLoadSize && tmpMadCycle > l0Status.finalMul && - tmpMadCycle * tmpL0cUse >= l0Status.finalMul * l0Status.finalL0cUse); - const bool condition4 = tmpMadCycle == l0Status.finalMul && tmpLoadSize == l0Status.finalLoadSize && - tmpMte1Loop < l0Status.finalMte1Loop; - // Considering pipeline parallelism between MTE1 and MAD - const bool condition5 = ((tmpMte1Cycle < tmpMadCycle && l0Status.updateUsingMte1) || !l0Status.updateUsingMte1); - const bool condition6 = CheckBaseMNKL1Size(singleCoreStatus); - int32_t lastReduceDim = (tilingIns_->aType_.dataType == DataType::DT_FLOAT || - tilingIns_->bType_.dataType == DataType::DT_FLOAT) ? 
FLOAT32_REDUCE_BLOCK_SIZE : REDUCE_BLOCK_SIZE; - - const bool condition7 = (tilingIns_->baseN != -1) || (!(coreStatus.n >= lastReduceDim && l0Status.nL0 < - lastReduceDim)); - - const bool condition8 = CheckK0Align(l0Status.kL0); - - const bool validL0 = (condition1 || condition2 || condition3 || condition4) && condition5 && - condition6 && condition7 && condition8; - if (validL0) { - l0Status.finalML0 = l0Status.mL0; - l0Status.finalKL0 = l0Status.kL0; - l0Status.finalNL0 = l0Status.nL0; - l0Status.finalLoadSize = tmpLoadSize; - l0Status.finalL0cUse = tmpL0cUse; - l0Status.finalMul = tmpMadCycle; - l0Status.finalMte1Cycles = tmpMte1Cycle; - l0Status.finalMte1Loop = tmpMte1Loop; - l0Status.updateUsingMte1 = l0Status.updateUsingMte1 || (tmpMte1Cycle < tmpMadCycle); - } -} - -void MatmulTilingAlgorithm::GetL0bAlign(std::vector& factors) const -{ - constexpr int32_t alignSize = 2; - if (tilingIns_->bType_.dataType == DataType::DT_INT8 || tilingIns_->bType_.dataType == DataType::DT_INT4) { - for (auto& num : factors) { - num = MathUtil::Align(num, alignSize); - } - } - return; -} - -void MatmulTilingAlgorithm::GetL0FactorsCand(L0Factors& resFactors, const CoreStatusPack& coreStatus, - SingleCoreStatus& singleCoreStatus, int32_t* parasCombo, const MatmulRunParas& param) const -{ - (void)(param); - L0StatusPack& l0Status = singleCoreStatus.l0Status; - GetL0StatusFromParasCombo(l0Status, parasCombo); - int32_t l0bAlignSize = 1; - if (tilingIns_->bType_.dataType == DataType::DT_INT8 || tilingIns_->bType_.dataType == DataType::DT_INT4) { - l0bAlignSize = L0B_ALIGN_SIZE; - } - int32_t majorDim = coreStatus.m; - int32_t minorDim = MathUtil::Align(coreStatus.n, l0bAlignSize); - int32_t majorDimK = l0Status.maxMk; - int32_t minorDimK = l0Status.maxNk; - int32_t maxN = l0Status.maxN; - int32_t dimFactors[2] = {0}; - if (l0Status.maxAxisIdx != 0) { - majorDim = MathUtil::Align(coreStatus.n, l0bAlignSize); - minorDim = coreStatus.m; - majorDimK = l0Status.maxNk; - minorDimK = l0Status.maxMk; - } - - std::vector majorDimFactors(DIM_FACTOR_LENGTH, 0); - if (tilingIns_->baseN != -1 && l0Status.maxAxisIdx != 0) { - majorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseN, C0_SIZE); - } else if (tilingIns_->baseM != -1 && l0Status.maxAxisIdx == 0) { - majorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseM, C0_SIZE); - } else { - // n dim condition - if (l0Status.maxAxisIdx != 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - GetTwoFactors(dimFactors, min(l0Status.maxAxisPnt, maxN), majorDim, min(l0Status.maxAxisNum, maxN)); - } else { - GetTwoFactors(dimFactors, l0Status.maxAxisPnt, majorDim, l0Status.maxAxisNum); - } - majorDimFactors[0] = dimFactors[0]; - majorDimFactors[1] = dimFactors[1]; - const int32_t majorAmend = GetBestValue(majorDim); - if (l0Status.maxAxisIdx != 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - GetTwoFactors(dimFactors, min(l0Status.maxAxisPnt, maxN), majorAmend, min(l0Status.maxAxisNum, maxN)); - } else { - GetTwoFactors(dimFactors, l0Status.maxAxisPnt, majorAmend, l0Status.maxAxisNum); - } - majorDimFactors[IDX_TWO] = dimFactors[0]; - majorDimFactors[IDX_THREE] = dimFactors[1]; - if (l0Status.maxAxisIdx != 0) { - GetL0bAlign(majorDimFactors); - } - } - sort(majorDimFactors.rbegin(), majorDimFactors.rend()); - majorDimFactors.erase(unique(majorDimFactors.begin(), majorDimFactors.end()), majorDimFactors.end()); - for (auto& majorDimFactor : majorDimFactors) { - if (majorDimFactor == 0 || majorDimFactor > l0Status.maxMn || majorDimFactor > majorDimK 
|| - majorDimFactor > majorDim) { - continue; - } - const int32_t minorFactorMax = min(l0Status.maxMn / majorDimFactor, minorDimK); - std::vector minorDimFactors(DIM_FACTOR_LENGTH, 0); - if (tilingIns_->baseN != -1 && l0Status.maxAxisIdx == 0) { - minorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseN, C0_SIZE); - } else if (tilingIns_->baseM != -1 && l0Status.maxAxisIdx != 0) { - minorDimFactors[0] = MathUtil::CeilDivision(tilingIns_->baseM, C0_SIZE); - } else { - if (l0Status.maxAxisIdx == 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - GetTwoFactors(dimFactors, min(minorFactorMax, maxN), minorDim, min(minorFactorMax, maxN)); - } else { - GetTwoFactors(dimFactors, minorFactorMax, minorDim, minorFactorMax); - } - minorDimFactors[0] = dimFactors[0]; - minorDimFactors[1] = dimFactors[1]; - const int32_t minorAmend = GetBestValue(minorDim); - if (l0Status.maxAxisIdx == 0 && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - GetTwoFactors(dimFactors, min(minorFactorMax, maxN), minorAmend, min(minorFactorMax, maxN)); - } else { - GetTwoFactors(dimFactors, minorFactorMax, minorAmend, minorFactorMax); - } - minorDimFactors[IDX_TWO] = dimFactors[0]; - minorDimFactors[IDX_THREE] = dimFactors[1]; - if (l0Status.maxAxisIdx == 0) { - GetL0bAlign(minorDimFactors); - } - } - sort(minorDimFactors.rbegin(), minorDimFactors.rend()); - minorDimFactors.erase(unique(minorDimFactors.begin(), minorDimFactors.end()), minorDimFactors.end()); - for (auto& minorDimFactor : minorDimFactors) { - if (minorDimFactor == 0 || minorDimFactor * majorDimFactor > l0Status.maxMn || minorDimFactor > minorDimK || - (minorDimFactor > minorDim) || (minorDimFactor > majorDimK)) { - continue; - } - // consider bias table buffer - constexpr int32_t maxN0 = 64; - // in V220/V300, consider bias table buffer limit - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - maxN = tilingIns_->bufferPool_.btSize / C0_SIZE / FP32_BYTES / l0Status.dbL0C; - } - if (l0Status.maxAxisIdx != 0) { - // Major is n0 axis - if ((majorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - continue; - } - } else { - // Major is m0 axis - if ((minorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - continue; - } - } - - const int32_t k0Max = min(majorDimK / majorDimFactor, minorDimK / minorDimFactor); - std::vector k0Factors(DIM_FACTOR_LENGTH, 0); - GetTwoFactors(dimFactors, k0Max, coreStatus.k, k0Max); - k0Factors[0] = dimFactors[0]; - k0Factors[1] = dimFactors[1]; - const int32_t kAmend = GetBestValue(coreStatus.k); - GetTwoFactors(dimFactors, k0Max, kAmend, l0Status.maxAxisNum); - k0Factors[IDX_TWO] = dimFactors[0]; - k0Factors[IDX_THREE] = dimFactors[1]; - sort(k0Factors.rbegin(), k0Factors.rend()); - k0Factors.erase(unique(k0Factors.begin(), k0Factors.end()), k0Factors.end()); - for (auto& k0 : k0Factors) { - if (k0 == 0 || minorDimFactor * k0 > minorDimK || majorDimFactor * k0 > majorDimK) { - continue; - } - - // Check if the buffer size allocated exceed the hardware buffer size in Float Mode - if (tilingIns_->aType_.dataType == DataType::DT_FLOAT) { - int32_t mL0 = majorDimFactor; - int32_t nL0 = minorDimFactor; - if (l0Status.maxAxisIdx != 0) { - nL0 = majorDimFactor; - mL0 = minorDimFactor; - } - - const int32_t l0aBufferSize = tilingIns_->aType_.isTrans ? 
- MathUtil::Align(k0, 2) * C0_BYTE_SIZE * mL0 * C0_SIZE * DB_ON : - k0 * C0_BYTE_SIZE * mL0 * C0_SIZE * DB_ON; - const int32_t l0bBufferSize = (tilingIns_->aType_.isTrans || !tilingIns_->bType_.isTrans) ? - MathUtil::Align(k0, 2) * C0_BYTE_SIZE * nL0 * C0_SIZE * DB_ON : - k0 * C0_BYTE_SIZE * nL0 * C0_SIZE * DB_ON; - if (l0aBufferSize > tilingIns_->bufferPool_.l0ASize || - l0bBufferSize > tilingIns_->bufferPool_.l0BSize) { - continue; - } - } else if (tilingIns_->aType_.dataType == DataType::DT_INT8 || - tilingIns_->aType_.dataType == DataType::DT_INT4) { - int32_t mL0 = majorDimFactor; - int32_t nL0 = minorDimFactor; - if (l0Status.maxAxisIdx != 0) { - nL0 = majorDimFactor; - mL0 = minorDimFactor; - } - - const int32_t l0aBufferSize = tilingIns_->aType_.isTrans ? - k0 * C0_BYTE_SIZE * MathUtil::Align(mL0, 2) * C0_SIZE * DB_ON : - k0 * C0_BYTE_SIZE * mL0 * C0_SIZE * DB_ON; - int32_t l0bBufferSize = (tilingIns_->bType_.isTrans) ? - k0 * C0_BYTE_SIZE * nL0 * C0_SIZE * DB_ON : - k0 * C0_BYTE_SIZE * MathUtil::Align(nL0, 2) * C0_SIZE * DB_ON; - if (l0aBufferSize > tilingIns_->bufferPool_.l0ASize || - l0bBufferSize > tilingIns_->bufferPool_.l0BSize) { - continue; - } - } - GetFinalMkn(singleCoreStatus, coreStatus, k0, majorDimFactor, minorDimFactor); - } - } - } - if (l0Status.finalML0 != 0 && l0Status.finalKL0 != 0 && l0Status.finalNL0 != 0) { - SetResFactors(resFactors, l0Status); - } -} - -MKNParasCombo MatmulTilingAlgorithm::GetParasCombo(const int32_t& index, const MatmulRunParas& param) const -{ - (void)(param); - std::map parasComboMap; - const int32_t mnMax = tilingIns_->bufferPool_.l0CSize / (C0_SIZE * C0_SIZE) / FP32_BYTES; - int32_t maxN = 64; - // in V220/V300, consider bias table buffer limit - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - maxN = tilingIns_->bufferPool_.btSize / C0_SIZE / FP32_BYTES; - } - const bool biasBt = tilingIns_->isSupportL0c2Out && (tilingIns_->isBias); - const int32_t leftSize = min(tilingIns_->bufferPool_.l1Size, tilingIns_->bufferPool_.l0ASize / dbL0A_); - const int32_t rightSize = min(tilingIns_->bufferPool_.l1Size, tilingIns_->bufferPool_.l0BSize / dbL0B_); - const int32_t maxMk = tilingIns_->aType_.pos == TPosition::TSCM ? 64 : (leftSize / C0_SIZE / C0_BYTE_SIZE); - const int32_t maxNK = tilingIns_->bType_.pos == TPosition::TSCM ? 64 : (rightSize / C0_SIZE / C0_BYTE_SIZE); - // dbL0A, dbL0B, dbL0C, maxMk, maxNk, maxMn, maxAxisIdx, maxAxisNum, maxAxisPnt, maxN - MKNParasCombo comboZero = { 2, 2, 2, maxMk, maxNK, mnMax / DB_ON, 0, 64, 8, biasBt ? maxN / DB_ON : 64 }; - MKNParasCombo comboOne = { dbL0A_, dbL0B_, 1, maxMk, maxNK, mnMax, 0, 64, 11, biasBt ? 
maxN : 64 }; - parasComboMap = { { 0, comboZero }, { 1, comboOne } }; - - return parasComboMap[index]; -} - -void MatmulTilingAlgorithm::GetL0cDB(const L0Factors (&resFactors)[L0PARAS_COMBO_LEN], const CoreStatusPack& coreStatus, - L0StatusPack& l0Status) const -{ - const int32_t dbAOnBOnCOnIdx = 0; - const int32_t dbAOnBOnCOffIdx = 1; - // check both L0C utilization and loadsize to control LOC LOA LOB DB - const int32_t m0L0cDbOn = resFactors[dbAOnBOnCOnIdx].finalML0; - const int32_t k0L0cDbOn = resFactors[dbAOnBOnCOnIdx].finalKL0; - const int32_t n0L0cDbOn = resFactors[dbAOnBOnCOnIdx].finalNL0; - const int32_t loadSizeL0cDbOn = resFactors[dbAOnBOnCOnIdx].finalLoadSize; - const int32_t mte1CyclesL0cDbOn = resFactors[dbAOnBOnCOnIdx].finalMte1Cycles; - - const int32_t m0L0cDbOff = resFactors[dbAOnBOnCOffIdx].finalML0; - const int32_t k0L0cDbOff = resFactors[dbAOnBOnCOffIdx].finalKL0; - const int32_t n0L0cDbOff = resFactors[dbAOnBOnCOffIdx].finalNL0; - const int32_t loadSizeL0cDbOff = resFactors[dbAOnBOnCOffIdx].finalLoadSize; - const int32_t mte1CyclesL0cDbOff = resFactors[dbAOnBOnCOffIdx].finalMte1Cycles; - - const int32_t mte3CostDbOn = - m0L0cDbOn * n0L0cDbOn * MIN_FRACTAL_SIZE * FP16_BYTES * 1 / MTE1_FIXPIPE_BANDWIDTH; - const int32_t mte3CostDbOff = - m0L0cDbOff * n0L0cDbOff * MIN_FRACTAL_SIZE * FP16_BYTES * 1 / MTE1_FIXPIPE_BANDWIDTH; - - const int32_t madCylesDbOn = max(m0L0cDbOn * k0L0cDbOn * n0L0cDbOn, static_cast(mte1CyclesL0cDbOn * 0.7)); - const int32_t madCylesDbOff = - max(m0L0cDbOff * k0L0cDbOff * n0L0cDbOff, static_cast(mte1CyclesL0cDbOff * 0.7)); - int32_t dbOnPipeTime = MathUtil::CeilDivision(coreStatus.m, m0L0cDbOn) * - MathUtil::CeilDivision(coreStatus.n, n0L0cDbOn) * - ((MathUtil::CeilDivision(coreStatus.k, k0L0cDbOn) - 1) * madCylesDbOn + max(madCylesDbOn, mte3CostDbOn)); - int32_t dbOffPipeTime = MathUtil::CeilDivision(coreStatus.m, m0L0cDbOff) * - MathUtil::CeilDivision(coreStatus.n, n0L0cDbOff) * - (MathUtil::CeilDivision(coreStatus.k, k0L0cDbOff) * madCylesDbOff + mte3CostDbOff); - dbOnPipeTime = dbOnPipeTime == 0 ? INT32_MAX : dbOnPipeTime; - dbOffPipeTime = dbOffPipeTime == 0 ? 
INT32_MAX : dbOffPipeTime; - - if ((dbOffPipeTime < dbOnPipeTime) || (loadSizeL0cDbOff < loadSizeL0cDbOn)) { - l0Status.dbL0C = 1; - l0Status.dbL0A = dbL0A_; - l0Status.dbL0B = dbL0B_; - l0Status.mL0 = m0L0cDbOff; - l0Status.kL0 = k0L0cDbOff; - l0Status.nL0 = n0L0cDbOff; - } else { - l0Status.dbL0C = DB_ON; - l0Status.dbL0A = dbL0A_; - l0Status.dbL0B = dbL0B_; - l0Status.mL0 = m0L0cDbOn; - l0Status.kL0 = k0L0cDbOn; - l0Status.nL0 = n0L0cDbOn; - } -} - -void MatmulTilingAlgorithm::GetL0Factors(const std::string& opType, const MatmulRunParas& param, - const CoreStatusPack& coreStatus, SingleCoreStatus& singleCoreStatus) const -{ - (void)(opType); - // get mL0, nL0, kL0 factor when singlecore m, n, k is know - // mL0, nL0, kL0 is a factor of single core m, n, k - L0StatusPack& l0Status = singleCoreStatus.l0Status; - if (tilingIns_->isBias) { - l0Status.dtypeBias = DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); - } - L0Factors resFactors[L0PARAS_COMBO_LEN]; - for (int32_t i = 0; i < L0PARAS_COMBO_LEN; ++i) { - if (i == 0 && g_tempCfg.l0cDB == DB_OFF) { - continue; - } - MKNParasCombo mknParasCombo = GetParasCombo(i, param); - for (int32_t j = 0; j < L0PARAS_COMBO_LEN; ++j) { - mknParasCombo.parasCombo[IDX_SIX] = j; - GetL0FactorsCand(resFactors[i], coreStatus, singleCoreStatus, mknParasCombo.parasCombo, param); - } - } - if (g_tempCfg.l0cDB == DB_OFF) { - l0Status.dbL0C = DB_OFF; - l0Status.dbL0A = dbL0A_; - l0Status.dbL0B = dbL0B_; - l0Status.mL0 = resFactors[1].finalML0; - l0Status.kL0 = resFactors[1].finalKL0; - l0Status.nL0 = resFactors[1].finalNL0; - } else { - GetL0cDB(resFactors, coreStatus, l0Status); - } -} - -bool MatmulTilingAlgorithm::IsNeedAlign(bool IsA) const -{ - if (IsA) { - return tilingIns_->aType_.dataType == DataType::DT_FLOAT || (tilingIns_->aType_.dataType == DataType::DT_INT8 && tilingIns_->aType_.isTrans); - } else { - return tilingIns_->bType_.dataType == DataType::DT_FLOAT || (tilingIns_->bType_.dataType == DataType::DT_INT8 && !tilingIns_->bType_.isTrans); - } -} - -int32_t MatmulTilingAlgorithm::GetL1Size(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const -{ - int32_t curAL1Size = 0; - int32_t curBL1Size = 0; - int32_t channelWiseL1Size = 0; - int32_t aL1Const = C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1; - if (IsNeedAlign(true)) { - aL1Const *= NUM_TWO; - } - // 5/8 means 1/2(B Matrix size) + 1/8(Index Matrix size) - int32_t bL1Const = tilingIns_->isSparse_ ? 
C0_SIZE * (C0_BYTE_SIZE / 8) * 5 * l1Status.dbBL1 : - C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; - if (IsNeedAlign(false)) { - bL1Const *= NUM_TWO; - } - const int32_t channelWiseL1Const = l1Status.channelWiseTimes * C0_SIZE * l1Status.dbBL1 * l0Status.dtypeBias; - int32_t dequantSize = 0; - - int32_t kaAlignValue = 1; - int32_t kbAlignValue = 1; - GetABL1KAlignValue(kaAlignValue, kbAlignValue); - if (!MathUtil::CheckMulOverflow(l1Status.mAL1, l0Status.mL0, curAL1Size) || - !MathUtil::CheckMulOverflow(curAL1Size, aL1Const, curAL1Size) || - !MathUtil::CheckMulOverflow(curAL1Size, MathUtil::Align(l1Status.kAL1, kaAlignValue), curAL1Size)) { - return 0; - } - if (!MathUtil::CheckMulOverflow(l1Status.nBL1, l0Status.nL0, curBL1Size) || - !MathUtil::CheckMulOverflow(curBL1Size, bL1Const, curBL1Size) || - !MathUtil::CheckMulOverflow(curBL1Size, MathUtil::Align(l1Status.kBL1, kbAlignValue), curBL1Size)) { - return 0; - } - - if (l1Status.channelWiseTimes > 0) { - if (!MathUtil::CheckMulOverflow(l1Status.nBL1, l0Status.nL0, channelWiseL1Size) || - !MathUtil::CheckMulOverflow(channelWiseL1Size, channelWiseL1Const, channelWiseL1Size)) { - return 0; - } - } - - if (tilingIns_->deqType == DequantType::TENSOR) { - dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; - } - - if (tilingIns_->aType_.pos == TPosition::TSCM) { - curAL1Size = 0; - } - if (tilingIns_->bType_.pos == TPosition::TSCM) { - curBL1Size = 0; - } - if (tilingIns_->biasType_.pos == TPosition::TSCM) { - channelWiseL1Size = 0; - } - - const int64_t totalSize = static_cast(curAL1Size) + static_cast(curBL1Size) + - static_cast(channelWiseL1Size) + static_cast(dequantSize); - return totalSize > INT_MAX ? INT_MAX : static_cast(totalSize); -} - -int32_t MatmulTilingAlgorithm::CalL1MaxLen(int32_t resL1Size, L1StatusPack& l1Status, const L0StatusPack& l0Status, - const int32_t alignValue, const L1TilingType axisName) const -{ - int32_t axisMaxLen = 1; - if (axisName == L1TilingType::KAL1_16) { - axisMaxLen = resL1Size / - (l1Status.mAL1 * l0Status.mL0 * l1Status.dbAL1 * C0_SIZE * C0_BYTE_SIZE); - } - if (axisName == L1TilingType::KBL1_16) { - axisMaxLen = resL1Size / - (l1Status.nBL1 * l0Status.nL0 * l1Status.dbBL1 * C0_SIZE * C0_BYTE_SIZE); - } - axisMaxLen = MathUtil::AlignDown(axisMaxLen, alignValue); - if (axisName == L1TilingType::M_AL1) { - axisMaxLen = resL1Size / (MathUtil::Align(l1Status.kAL1, alignValue) * l0Status.mL0 * l1Status.dbAL1 * C0_SIZE * - C0_BYTE_SIZE); - } - if (axisName == L1TilingType::N_BL1) { - axisMaxLen = resL1Size / (MathUtil::Align(l1Status.kBL1, alignValue) * l0Status.nL0 * l1Status.dbBL1 * C0_SIZE * - C0_BYTE_SIZE + l1Status.channelWiseTimes * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE); - } - return axisMaxLen; -} - -/* - brief: - if factor greater then base, then factor = base - if factor less than base, then get thr max factor of base, i.e. 
base 10, factor 9, then res factor = 5 -*/ -void MatmulTilingAlgorithm::GetNearestFactor(const int32_t& base, int32_t& factor, int32_t capValue) const -{ - if (!g_tempCfg.factorSplit) { - return; - } - if (capValue == INT32_MAX) { - capValue = base; - } - while ((factor > capValue) || (factor > 0 && base % factor != 0)) { - factor--; - } -} - -void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const -{ - // if b martix in L1, then b matrix must full load, goto b matrix full load patch - if (tilingIns_->bType_.pos == TPosition::TSCM) { - return; - } - const int32_t mRepeat = MathUtil::CeilDivision(coreStatus.m, l0Status.mL0); - const int32_t nRepeat = MathUtil::CeilDivision(coreStatus.n, l0Status.nL0); - int32_t kaAlignValue = 1; - int32_t kbAlignValue = 1; - GetABL1KAlignValue(kaAlignValue, kbAlignValue); - l1Status.kAL1 = MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0; - const int32_t curL1Size = GetL1Size(l1Status, l0Status); - const int32_t a1Length = GetAL1UbSize(l1Status, l0Status); - if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size && a1Length < tilingIns_->bufferPool_.ubSize) { - l1Status.aL1FullLoad = true; - l1Status.aL1Size = - max(MathUtil::Align(coreStatus.k, kaAlignValue), MathUtil::Align(l1Status.kAL1, kaAlignValue)) * - max(l1Status.mAL1 * l0Status.mL0, coreStatus.m) * C0_SIZE * C0_BYTE_SIZE; - if (tilingIns_->aType_.pos == TPosition::TSCM) { - l1Status.bL1Size = tilingIns_->bufferPool_.l1Size; - } else { - l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; - } - if (g_tempCfg.l1DB == DB_ON) { - l1Status.dbBL1 = DB_ON; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { - l1Status.dbBL1 = DB_OFF; - } - } - const int32_t biasSize = - l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; - int32_t dequantSize = 0; - if (tilingIns_->deqType == DequantType::TENSOR) { - dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; - } - l1Status.kBL1 = min(CalL1MaxLen((l1Status.bL1Size - biasSize - dequantSize), l1Status, l0Status, kbAlignValue, - L1TilingType::KBL1_16), - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.dbBL1 = DB_OFF; - const int32_t b1Length = tilingIns_->bufferPool_.ubSize - a1Length; - l1Status.kBL1 = min(CalL1MaxLen(min(l1Status.bL1Size - biasSize - dequantSize, b1Length), - l1Status, l0Status, kbAlignValue, L1TilingType::KBL1_16), coreStatus.k); - } - l1Status.bL1Times = min(l1Status.kBL1 / l0Status.kL0, l1Status.maxKBL1); - GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); // tik-mm support no factor---ncheck - l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; - if (l1Status.kBL1 == coreStatus.k) { - l1Status.nBL1 = min(CalL1MaxLen(l1Status.bL1Size, l1Status, l0Status, kbAlignValue, L1TilingType::N_BL1), - l1Status.maxNBL1); - GetNearestFactor(nRepeat, l1Status.nBL1); - } - - const bool invalidL1Status = (l1Status.nBL1 == 0 || l1Status.kBL1 == 0) ? true : false; - const int32_t possibleMRepeat = (l1Status.kBL1 == coreStatus.k) ? 1 : mRepeat; - // m+n*m_repeat XXX---ncheck - l1Status.loadSize = invalidL1Status ? - INT32_MAX : - ((tilingIns_->aType_.pos == TPosition::TSCM ? 
0 : coreStatus.m) + possibleMRepeat * coreStatus.n); - if (g_tempCfg.l1DB == DB_ON && l1Status.kBL1 == coreStatus.k && l1Status.nBL1 * l0Status.nL0 == coreStatus.n) { - l1Status.dbBL1 = DB_OFF; - } - res[IDX_ONE][IDX_ZERO] = l1Status.kAL1; - res[IDX_ONE][IDX_ONE] = l1Status.mAL1; - res[IDX_ONE][IDX_TWO] = l1Status.dbAL1; - res[IDX_ONE][IDX_THREE] = l1Status.kBL1; - res[IDX_ONE][IDX_FOUR] = l1Status.nBL1; - res[IDX_ONE][IDX_FIVE] = l1Status.dbBL1; - res[IDX_ONE][IDX_SIX] = l1Status.loadSize; - } -} - -void MatmulTilingAlgorithm::L1StatusBl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const -{ - // if a martix in L1, then a matrix must full load, goto a matrix full load patch - if (tilingIns_->aType_.pos == TPosition::TSCM) { - return; - } - const int32_t mRepeat = MathUtil::CeilDivision(coreStatus.m, l0Status.mL0); - const int32_t nRepeat = MathUtil::CeilDivision(coreStatus.n, l0Status.nL0); - int32_t kaAlignValue = 1; - int32_t kbAlignValue = 1; - GetABL1KAlignValue(kaAlignValue, kbAlignValue); - l1Status.kBL1 = MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0; - const int32_t curL1Size = GetL1Size(l1Status, l0Status); - const int32_t b1Length = GetBL1UbSize(l1Status, l0Status); - if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size && b1Length < tilingIns_->bufferPool_.ubSize) { - l1Status.bL1FullLoad = true; - l1Status.bL1Size = - max(MathUtil::Align(coreStatus.k, kbAlignValue), MathUtil::Align(l1Status.kBL1, kbAlignValue)) * - max(l1Status.nBL1 * l0Status.nL0, coreStatus.n) * C0_SIZE * C0_BYTE_SIZE; - if (tilingIns_->bType_.pos == TPosition::TSCM) { - l1Status.aL1Size = tilingIns_->bufferPool_.l1Size; - } else { - l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; - } - if (g_tempCfg.l1DB == DB_ON) { - l1Status.dbAL1 = DB_ON; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { - l1Status.dbAL1 = DB_OFF; - } - } - int32_t dequantSize = 0; - if (tilingIns_->deqType == DequantType::TENSOR) { - dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; - } - const int32_t biasSize = - l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; - l1Status.kAL1 = min(CalL1MaxLen((l1Status.aL1Size - biasSize - dequantSize), l1Status, l0Status, kaAlignValue, - L1TilingType::KAL1_16), - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.dbAL1 = DB_OFF; - const int32_t a1Length = tilingIns_->bufferPool_.ubSize - b1Length; - l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), - l1Status, l0Status, kaAlignValue, L1TilingType::KAL1_16), coreStatus.k); - } - l1Status.aL1Times = min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1); - GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); // tik-mm support no factor---ncheck - l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; - if (l1Status.kAL1 == coreStatus.k) { - l1Status.mAL1 = - min(CalL1MaxLen(l1Status.aL1Size - biasSize, l1Status, l0Status, kaAlignValue, L1TilingType::M_AL1), - l1Status.maxMAL1); - GetNearestFactor(mRepeat, l1Status.mAL1); // tik-mm support no factor---ncheck - } - - const bool invalidL1Status = (l1Status.mAL1 == 0 || l1Status.kAL1 == 0) ? true : false; - const int32_t possibleNRepeat = (l1Status.kAL1 == coreStatus.k) ? 1 : nRepeat; // no repeat---ncheck - l1Status.loadSize = invalidL1Status ? - INT32_MAX : - ((tilingIns_->bType_.pos == TPosition::TSCM ? 
0 : coreStatus.n) + possibleNRepeat * coreStatus.m); - if (g_tempCfg.l1DB == DB_ON && l1Status.kAL1 == coreStatus.k && l1Status.mAL1 * l0Status.mL0 >= coreStatus.m) { - l1Status.dbAL1 = DB_OFF; - } - res[IDX_TWO][IDX_ZERO] = l1Status.kAL1; - res[IDX_TWO][IDX_ONE] = l1Status.mAL1; - res[IDX_TWO][IDX_TWO] = l1Status.dbAL1; - res[IDX_TWO][IDX_THREE] = l1Status.kBL1; - res[IDX_TWO][IDX_FOUR] = l1Status.nBL1; - res[IDX_TWO][IDX_FIVE] = l1Status.dbBL1; - res[IDX_TWO][IDX_SIX] = l1Status.loadSize; - } -} - -void MatmulTilingAlgorithm::L1StatusBothFullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const -{ - l1Status.kAL1 = MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0; - l1Status.kBL1 = MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0; - const int32_t curL1Size = GetL1Size(l1Status, l0Status); - const int32_t a1Length = GetAL1UbSize(l1Status, l0Status); - const int32_t b1Length = GetBL1UbSize(l1Status, l0Status); - if (tilingIns_->aType_.pos == TPosition::TSCM && tilingIns_->bType_.pos == TPosition::TSCM) { - l1Status.mAL1 = 1; - l1Status.nBL1 = 1; - } - if (((curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size) && - (a1Length + b1Length) <= tilingIns_->bufferPool_.ubSize) || - (tilingIns_->aType_.pos == TPosition::TSCM && tilingIns_->bType_.pos == TPosition::TSCM)) { - l1Status.bothFullLoad = true; - l1Status.loadSize = (tilingIns_->aType_.pos == TPosition::TSCM ? 0 : coreStatus.m) + - (tilingIns_->bType_.pos == TPosition::TSCM ? 0 : coreStatus.n); - res[IDX_ZERO][IDX_ZERO] = l1Status.kAL1; - res[IDX_ZERO][IDX_ONE] = l1Status.mAL1; - res[IDX_ZERO][IDX_TWO] = l1Status.dbAL1; - res[IDX_ZERO][IDX_THREE] = l1Status.kBL1; - res[IDX_ZERO][IDX_FOUR] = l1Status.nBL1; - res[IDX_ZERO][IDX_FIVE] = l1Status.dbBL1; - res[IDX_ZERO][IDX_SIX] = l1Status.loadSize; - } -} -void MatmulTilingAlgorithm::NeitherFullLoadDb(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status, const int32_t& kbl1Db) const -{ - const int32_t tmpKbl116 = l1Status.kBL1; - l1Status.kBL1 = kbl1Db; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || - (GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status)) > tilingIns_->bufferPool_.ubSize) { - l1Status.dbBL1 = DB_OFF; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || - GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { - l1Status.dbAL1 = DB_OFF; - } - } - l1Status.kBL1 = coreStatus.k; - const bool bothDoubleBuffer = coreStatus.m != l0Status.mL0 && coreStatus.k > l0Status.kL0 && - (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || - (GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status)) > tilingIns_->bufferPool_.ubSize); - l1Status.kBL1 = tmpKbl116; - if (bothDoubleBuffer) { - l1Status.dbAL1 = DB_ON; - l1Status.dbBL1 = DB_ON; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || - (GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status)) > tilingIns_->bufferPool_.ubSize) { - l1Status.dbBL1 = DB_OFF; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || - GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { - l1Status.dbAL1 = DB_OFF; - } - } - } -} - -void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status) const -{ - const 
int32_t mRepeat = MathUtil::CeilDivision(coreStatus.m, l0Status.mL0); - int32_t nRepeat = MathUtil::CeilDivision(coreStatus.n, l0Status.nL0); - if (l0Status.dtypeBias == FP32_BYTES && l1Status.channelWiseTimes > 0) { - l1Status.channelWiseTimes++; - } - int32_t biasSize = l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * FP16_BYTES * l1Status.dbBL1; - int32_t dequantSize = 0; - if (tilingIns_->deqType == DequantType::TENSOR) { - dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; - } - // Align value is used in FP32 in FP32 out data flow mode - int32_t kaAlignValue = 1; - int32_t kbAlignValue = 1; - GetABL1KAlignValue(kaAlignValue, kbAlignValue); - L1StatusPack l1Mfirst; - L1StatusPack l1Nfirst; - errno_t err = - memcpy_s(static_cast(&l1Mfirst), sizeof(l1Mfirst), static_cast(&l1Status), sizeof(l1Mfirst)); - if (err != EOK) { - TILING_LOG_ERROR("memcpy error"); - return; - } - err = memcpy_s(static_cast(&l1Nfirst), sizeof(l1Nfirst), static_cast(&l1Status), sizeof(l1Nfirst)); - if (err != EOK) { - TILING_LOG_ERROR("memcpy error"); - } - // default l1Status.nBL1 = 1 - // calculate M first condition - l1Mfirst.bL1Size = MathUtil::Align(l1Mfirst.kBL1, kbAlignValue) * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * - l1Mfirst.dbBL1; - l1Mfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.bL1Size; - int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Mfirst, l0Status); - l1Mfirst.mAL1 = max(min(min( - CalL1MaxLen(l1Mfirst.aL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), - l1Mfirst.maxMAL1), - mRepeat), - 1); - if (IsUbNd2Nz()) { - l1Mfirst.mAL1 = max(min(min( - CalL1MaxLen(min(l1Mfirst.aL1Size - biasSize - dequantSize, a1Length), l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), - l1Mfirst.maxMAL1), - mRepeat), - 1); - } - GetNearestFactor(mRepeat, l1Mfirst.mAL1); // tik-mm support no factor ----ncheck - l1Mfirst.aL1Size = MathUtil::Align(l1Mfirst.kAL1, kaAlignValue) * l1Mfirst.mAL1 * l0Status.mL0 * C0_SIZE * - C0_BYTE_SIZE * l1Mfirst.dbAL1; - l1Mfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.aL1Size; - int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Mfirst, l0Status); - l1Mfirst.nBL1 = max(min(min( - CalL1MaxLen(l1Mfirst.bL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), - l1Mfirst.maxNBL1), - nRepeat), - 1); - if (IsUbNd2Nz()) { - l1Mfirst.nBL1 = max(min(min( - CalL1MaxLen(min(l1Mfirst.bL1Size - biasSize - dequantSize, b1Length), l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), - l1Mfirst.maxNBL1), - nRepeat), - 1); - } - GetNearestFactor(nRepeat, l1Mfirst.nBL1); - l1Mfirst.loadSize = - coreStatus.m + coreStatus.n * MathUtil::CeilDivision(coreStatus.m, l1Mfirst.mAL1 * l0Status.mL0); - - // calculate N first condition - l1Nfirst.aL1Size = MathUtil::Align(l1Nfirst.kAL1, kaAlignValue) * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * - l1Nfirst.dbAL1; - l1Nfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.aL1Size; - b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Nfirst, l0Status); - l1Nfirst.nBL1 = max(min(min( - CalL1MaxLen(l1Nfirst.bL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), - l1Nfirst.maxNBL1), - nRepeat), - 1); - if (IsUbNd2Nz()) { - l1Nfirst.nBL1 = max(min(min( - CalL1MaxLen(min(l1Nfirst.bL1Size - biasSize - dequantSize, b1Length), l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), - l1Nfirst.maxNBL1), - nRepeat), - 1); - } - GetNearestFactor(nRepeat, 
l1Nfirst.nBL1); - l1Nfirst.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Nfirst.nBL1 * l0Status.nL0 * C0_SIZE * - C0_BYTE_SIZE * l1Nfirst.dbBL1; - l1Nfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.bL1Size; - a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Nfirst, l0Status); - biasSize = biasSize * l1Nfirst.nBL1; - l1Nfirst.mAL1 = max(min(min( - CalL1MaxLen(l1Nfirst.aL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), - l1Nfirst.maxMAL1), - mRepeat), - 1); - if (IsUbNd2Nz()) { - l1Nfirst.mAL1 = max(min(min( - CalL1MaxLen(min(l1Nfirst.aL1Size - biasSize - dequantSize, a1Length), l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), - l1Nfirst.maxMAL1), - mRepeat), - 1); - } - GetNearestFactor(mRepeat, l1Nfirst.mAL1); - l1Nfirst.loadSize = - coreStatus.m * MathUtil::CeilDivision(coreStatus.n, l1Nfirst.nBL1 * l0Status.nL0) + coreStatus.n; - - if (l1Status.kAL1 >= coreStatus.k && l1Status.kBL1 >= coreStatus.k) { - if (l1Nfirst.loadSize > l1Mfirst.loadSize) { - const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Mfirst, sizeof(l1Status)); - if (errnoT != EOK) { - TILING_LOG_ERROR("memcpy error"); - return; - } - } else { - const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Nfirst, sizeof(l1Status)); - if (errnoT != EOK) { - TILING_LOG_ERROR("memcpy error"); - return; - } - } - } - if (l1Status.kAL1 >= coreStatus.k && l1Status.kBL1 < coreStatus.k) { - l1Mfirst.nBL1 = 1; - const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Mfirst, sizeof(l1Status)); - if (errnoT != EOK) { - TILING_LOG_ERROR("memcpy error"); - return; - } - } - if (l1Status.kAL1 < coreStatus.k && l1Status.kBL1 >= coreStatus.k) { - l1Nfirst.mAL1 = 1; - const errno_t errnoT = memcpy_s(&l1Status, sizeof(l1Status), &l1Nfirst, sizeof(l1Status)); - if (errnoT != EOK) { - TILING_LOG_ERROR("memcpy error"); - return; - } - } - if (l1Status.kAL1 < coreStatus.k && l1Status.kBL1 < coreStatus.k) { - l1Status.mAL1 = 1; - l1Status.nBL1 = 1; - l1Status.loadSize = coreStatus.m * MathUtil::CeilDivision(coreStatus.n, l1Mfirst.nBL1 * l0Status.nL0) + - coreStatus.n * MathUtil::CeilDivision(coreStatus.m, l1Mfirst.mAL1 * l0Status.mL0); - } -} - -void MatmulTilingAlgorithm::NeitherFullLoadKforNZ(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status) const -{ - l1Status.kBL1 = coreStatus.k; - const int32_t biasSize = - l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; - int32_t dequantSize = 0; - if (tilingIns_->deqType == DequantType::TENSOR) { - dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; - } - int32_t kaAlignValue = 1; - int32_t kbAlignValue = 1; - GetABL1KAlignValue(kaAlignValue, kbAlignValue); - - if (GetL1Size(l1Status, l0Status) > 0 && GetL1Size(l1Status, l0Status) <= tilingIns_->bufferPool_.l1Size) { - l1Status.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * - C0_BYTE_SIZE * l1Status.dbBL1; - l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; - int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); - l1Status.kAL1 = min(CalL1MaxLen(l1Status.aL1Size - biasSize - dequantSize, l1Status, l0Status, kaAlignValue, - L1TilingType::KAL1_16), - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), l1Status, l0Status, kaAlignValue, - L1TilingType::KAL1_16), - 
coreStatus.k); - } - - l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); - GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); - l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; - } else { - // when NeitherFullLoadMN change the nBL1 and mAL1 - int32_t perK = min((tilingIns_->bufferPool_.l1Size - biasSize - dequantSize) / - (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + - C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / - l0Status.kL0 * l0Status.kL0, - coreStatus.k); - if (IsUbNd2Nz()) { - perK = min(min(tilingIns_->bufferPool_.l1Size - biasSize - dequantSize, - tilingIns_->bufferPool_.ubSize) / - (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + - C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / - l0Status.kL0 * l0Status.kL0, - coreStatus.k); - } - const int32_t biasFactor = tilingIns_->isBias ? l1Status.nBL1 * l0Status.nL0 : 0; - const int32_t aAlignedPerK = MathUtil::Align(perK, kaAlignValue); - const int32_t bAlignedPerK = MathUtil::Align(perK, kbAlignValue); - if (tilingIns_->aType_.dataType == DataType::DT_FLOAT && - !CheckL1Size(l1Status.mAL1 * l0Status.mL0 * aAlignedPerK * l1Status.dbAL1, - l1Status.nBL1 * l0Status.nL0 * bAlignedPerK * l1Status.dbBL1, - biasFactor * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1 + dequantSize)) { - perK -= 1; - } - int32_t perTimes = min(perK / l0Status.kL0, max(l1Status.maxKAL1, l1Status.maxKBL1)); - GetNearestFactor(l1Status.allTimes, perTimes); - perTimes = min(perTimes, l1Status.allTimes); - perK = perTimes * l0Status.kL0; - l1Status.kAL1 = perK; - l1Status.kBL1 = perK; - } -} - -bool MatmulTilingAlgorithm::CheckL1Size(int32_t amat, int32_t bmat, int32_t curBiasL1Size) const -{ - const int64_t loadSizeBytes = (static_cast(amat + bmat) * C0_SIZE * C0_BYTE_SIZE + - static_cast(curBiasL1Size)); - return loadSizeBytes <= tilingIns_->bufferPool_.l1Size; -} - -void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status, const int32_t& kMaxAxis) const -{ - int32_t biasSize = - l1Status.channelWiseTimes * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l0Status.dtypeBias * l1Status.dbBL1; - int32_t dequantSize = 0; - if (tilingIns_->deqType == DequantType::TENSOR) { - dequantSize = l1Status.nBL1 * l0Status.nL0 * C0_SIZE * UINT64_TYPES; - } - int32_t alignValue = FP32_ALIGN_SIZE; - if (tilingIns_->aType_.dataType == DataType::DT_INT8) { - alignValue = INT8_ALIGN_SIZE; - } else if (tilingIns_->aType_.dataType == DataType::DT_INT4) { - alignValue = INT4_ALIGN_SIZE; - } - const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - const int32_t alignM = MathUtil::CeilDivision(l1Status.mAL1 * C0_SIZE, alignValue) * alignValue; - const int32_t alignN = MathUtil::CeilDivision(l1Status.nBL1 * C0_SIZE, alignValue) * alignValue; - const int32_t alignK = MathUtil::CeilDivision(l0Status.kL0 * reduceSize, alignValue) * alignValue * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - if (kMaxAxis == 1) { - // first get k_al1, second get k_bl1 - l1Status.kBL1 = l0Status.kL0; - if ((tilingIns_->bType_.dataType == DataType::DT_FLOAT) || - (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8)) { - l1Status.bL1Size = l1Status.kBL1 * l0Status.nL0 * C0_SIZE * alignK * l1Status.nBL1 * l1Status.dbBL1; - } else if (!tilingIns_->bType_.isTrans && (tilingIns_->bType_.dataType == DataType::DT_INT8 - || tilingIns_->bType_.dataType == 
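CheckL1Size above converts the fractal counts back into bytes before comparing against the L1 budget. A standalone equivalent with the constants spelled out, assuming the usual C0_SIZE = 16 elements and C0_BYTE_SIZE = 32 bytes used elsewhere in this file:

    #include <cstdint>

    // amat / bmat count 16x16 fractal blocks staged in L1; bias is already in bytes.
    inline bool FitsInL1(int32_t amat, int32_t bmat, int32_t biasBytes, int32_t l1SizeBytes)
    {
        constexpr int64_t kC0Size = 16;      // fractal edge, in elements
        constexpr int64_t kC0ByteSize = 32;  // bytes per 16 fp16 elements
        const int64_t loadBytes =
            static_cast<int64_t>(amat + bmat) * kC0Size * kC0ByteSize + biasBytes;
        return loadBytes <= l1SizeBytes;     // 64-bit math avoids overflow for large tiles
    }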
DataType::DT_INT4)) { - l1Status.bL1Size = l1Status.kBL1 * l0Status.nL0 * alignK * alignN * l1Status.dbBL1; - } else { - l1Status.bL1Size = l1Status.kBL1 * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; - } - l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; - int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); - auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, - coreStatus.k); - } - - l1Status.aL1Times = max(l1Status.kAL1 / l0Status.kL0, 1); - GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); // tik-mm support no factor ----ncheck - l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; - l1Status.aL1Size = l1Status.kAL1 * l1Status.mAL1 * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1; - l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; - int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); - if ((tilingIns_->bType_.dataType == DataType::DT_FLOAT) || - (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8)) { - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / - (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / - (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), - coreStatus.k); - } - } else if (!tilingIns_->bType_.isTrans && (tilingIns_->bType_.dataType == DataType::DT_INT8 - || tilingIns_->bType_.dataType == DataType::DT_INT4)) { - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / - (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / - (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), - coreStatus.k); - } - } else { - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize)/ - (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length)/ - (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), - coreStatus.k);} - } - l1Status.bL1Times = max(min(l1Status.kBL1 / l0Status.kL0, l1Status.maxKBL1), 1); - GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); - l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; - } - if (kMaxAxis == NUM_TWO) { - // first get k_bl1, second get k_al1 - l1Status.kAL1 = l0Status.kL0; - if ((tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) || - (!tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8)) { - l1Status.aL1Size = l1Status.kAL1 * l1Status.mAL1 * l0Status.mL0 * C0_SIZE * alignK * l1Status.dbAL1; - } else if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8) { - l1Status.aL1Size = l1Status.kAL1 * alignM * l0Status.mL0 * alignK * l1Status.dbAL1; - } else { - l1Status.aL1Size = l1Status.kAL1 * l1Status.mAL1 * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1; - } - - l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; - int32_t b1Length = 
tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / - (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / - (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), - coreStatus.k); - } - l1Status.bL1Times = max(l1Status.kBL1 / l0Status.kL0, 1); - GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); - l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; - l1Status.bL1Size = l1Status.kBL1 * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; - l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; - int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); - if ((tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) || - (!tilingIns_->aType_.isTrans && (tilingIns_->aType_.dataType == DataType::DT_INT8 || - tilingIns_->aType_.dataType == DataType::DT_INT4))) { - auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * alignK; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, - coreStatus.k); - } - } else if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8) { - l1Status.kAL1 = min((l1Status.aL1Size - biasSize - dequantSize) / - (alignM * l0Status.mL0 * l1Status.dbAL1 * alignK), coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kAL1 = min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / - (alignM * l0Status.mL0 * l1Status.dbAL1 * alignK), coreStatus.k); - } - l1Status.aL1Size = l1Status.kAL1 * alignM * l0Status.mL0 * alignK * l1Status.dbAL1; - } else { - auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, - coreStatus.k); - if (IsUbNd2Nz()) { - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, - coreStatus.k); - } - } - l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); - GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); - l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; - } -} -void MatmulTilingAlgorithm::NeitherFullLoadK(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status) const -{ - if (l0Status.kL0 == coreStatus.k) { - return; - } - // 1 -> let k_al1 bigger, 2 -> let k_bl1 bigger, 0 -> no matter - int32_t kMaxAxis = 0; - if (!tilingIns_->aType_.isTrans && !tilingIns_->bType_.isTrans) { - kMaxAxis = 1; - } - if (tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { - kMaxAxis = 2; - } - if (!tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { - kMaxAxis = l0Status.mL0 > l0Status.nL0 ? 
1 : 2; - } - // Not Support FP32 mode for NZ format and hardware with pre_ub - if (kMaxAxis != 0) { - NeitherFullLoadKforND(coreStatus, l0Status, l1Status, kMaxAxis); - } else { - NeitherFullLoadKforNZ(coreStatus, l0Status, l1Status); - } - if (g_tempCfg.factorSplit) { - if (l1Status.kAL1 > l1Status.kBL1 && l1Status.kAL1 % l1Status.kBL1 != 0) { - while (l1Status.kAL1 % l1Status.kBL1 != 0 || - (l1Status.kAL1 != l1Status.kBL1 && coreStatus.k % l1Status.kAL1 != 0)) { - l1Status.kAL1 -= 1; - } - } - if (l1Status.kAL1 < l1Status.kBL1 && l1Status.kBL1 % l1Status.kAL1 != 0) { - while (l1Status.kBL1 % l1Status.kAL1 != 0 || - (l1Status.kAL1 != l1Status.kBL1 && coreStatus.k % l1Status.kBL1 != 0)) { - l1Status.kBL1 -= 1; - } - } - } -} - -void MatmulTilingAlgorithm::L1StatusNeitherFullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, - L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const -{ - // if b martix in L1, then b matrix must full load, skip non-full process - if (tilingIns_->aType_.pos == TPosition::TSCM || tilingIns_->bType_.pos == TPosition::TSCM) { - return; - } - if (g_tempCfg.l1DB == DB_ON) { - NeitherFullLoadDb(coreStatus, l0Status, l1Status, DB_ON); - } - NeitherFullLoadK(coreStatus, l0Status, l1Status); - NeitherFullLoadMN(coreStatus, l0Status, l1Status); - - res[IDX_THREE][IDX_ZERO] = l1Status.kAL1; - res[IDX_THREE][IDX_ONE] = l1Status.mAL1; - res[IDX_THREE][IDX_TWO] = l1Status.dbAL1; - res[IDX_THREE][IDX_THREE] = l1Status.kBL1; - res[IDX_THREE][IDX_FOUR] = l1Status.nBL1; - res[IDX_THREE][IDX_FIVE] = l1Status.dbBL1; - res[IDX_THREE][IDX_SIX] = l1Status.loadSize; -} - -void MatmulTilingAlgorithm::GetL1Factors(const std::string& opType, const MatmulRunParas& param, - const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status) const -{ - (void)(opType); - (void)(param); - // get mAL1, nBL1, kAL1, kBL1 factors when L0, singlecore factor is know - // get al1, bl1 double buffer factors - const int32_t mte1Loop = MIN_MTE1_LOAD / - ((l0Status.nL0 == 1 ? 1 : l0Status.kL0) + (l0Status.kL0 == 1 ? 
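The factor-split branch at the end of NeitherFullLoadK above keeps the larger of kAL1/kBL1 a multiple of the smaller one and a divisor of the single-core K, so the two L1 input streams stay in lockstep. A compact standalone sketch of the same adjustment (the helper name is illustrative):

    #include <cstdint>

    // Shrink the larger K tile until it is a multiple of the smaller one and, unless the
    // two become equal, also divides the single-core K extent. Both tiles are assumed >= 1.
    inline void HarmonizeKTiles(int32_t& kAL1, int32_t& kBL1, int32_t singleCoreK)
    {
        if (kAL1 > kBL1 && kAL1 % kBL1 != 0) {
            while (kAL1 % kBL1 != 0 || (kAL1 != kBL1 && singleCoreK % kAL1 != 0)) {
                --kAL1;
            }
        } else if (kAL1 < kBL1 && kBL1 % kAL1 != 0) {
            while (kBL1 % kAL1 != 0 || (kAL1 != kBL1 && singleCoreK % kBL1 != 0)) {
                --kBL1;
            }
        }
    }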
1 : l0Status.mL0)); - int32_t res[IDX_FOUR][IDX_SEVEN] = {0}; - l1Status.allTimes = MathUtil::CeilDivision(coreStatus.k, l0Status.kL0); - l1Status.maxMAL1 = (coreStatus.m + l0Status.mL0 - 1) / l0Status.mL0; - l1Status.maxNBL1 = (coreStatus.n + l0Status.nL0 - 1) / l0Status.nL0; - l1Status.maxKAL1 = - max(mte1Loop, ((MIN_MTE1_LOAD + l0Status.mL0 - 1) / l0Status.mL0 + l0Status.kL0 - 1) / l0Status.kL0); - l1Status.maxKBL1 = - max(mte1Loop, ((MIN_MTE1_LOAD + l0Status.nL0 - 1) / l0Status.nL0 + l0Status.kL0 - 1) / l0Status.kL0); - if (tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - l1Status.channelWiseTimes++; - } - // both AL1 and Bl1 full load - int32_t bothFullLoadFactors[L1_FACTORS_LEN] = { coreStatus.k, coreStatus.k, l1Status.maxMAL1, - l1Status.maxNBL1, DB_OFF, DB_OFF }; - // Need to consider L1 extension in FP32 Mode - l1Status.SetStatus(bothFullLoadFactors); - L1StatusBothFullLoad(coreStatus, l0Status, l1Status, res); - // only AL1 full load - int32_t al1FullLoadFactors[L1_FACTORS_LEN] = {coreStatus.k, l0Status.kL0, l1Status.maxMAL1, 1, DB_OFF, DB_OFF}; - l1Status.SetStatus(al1FullLoadFactors); - L1StatusAl1FullLoad(coreStatus, l0Status, l1Status, res); - // only BL1 full load - int32_t bl1FullLoadFactors[L1_FACTORS_LEN] = {l0Status.kL0, coreStatus.k, 1, l1Status.maxNBL1, DB_OFF, DB_OFF}; - l1Status.SetStatus(bl1FullLoadFactors); - L1StatusBl1FullLoad(coreStatus, l0Status, l1Status, res); - // neither AL1 nor Bl1 full load - res[IDX_THREE][IDX_SIX] = INT_MAX; - int32_t neitherFullLoadFactors[L1_FACTORS_LEN] = {l0Status.kL0, l0Status.kL0, 1, 1, DB_ON, DB_ON}; - l1Status.SetStatus(neitherFullLoadFactors); - L1StatusNeitherFullLoad(coreStatus, l0Status, l1Status, res); - // choose the final factors - int32_t* tmpFactors = res[IDX_THREE]; - int32_t tmpLoadSize = tmpFactors[IDX_SIX]; - int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - const int32_t kAl1FactorOne = res[IDX_ONE][IDX_ZERO] > 0 ? MathUtil::CeilDivision( - MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_ONE][IDX_ZERO])) : - 1; - const int32_t kBl1FactorTwo = res[IDX_TWO][IDX_THREE] > 0 ? MathUtil::CeilDivision( - MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_TWO][IDX_THREE])) : - 1; - const int32_t kAl1FactorZero = res[IDX_ZERO][IDX_ZERO] > 0 ? MathUtil::CeilDivision( - MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_ZERO][IDX_ZERO])) : - 1; - const int32_t kBl1FactorZero = res[IDX_ZERO][IDX_THREE] > 0 ? MathUtil::CeilDivision( - MathUtil::CeilDivision(GetSingleK(), reduceSize), (coreStatus.kDim * res[IDX_ZERO][IDX_THREE])) : - 1; - - const bool al1FullLoad = (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->bType_.type == CubeFormat::ND) ? - (l1Status.aL1FullLoad && kAl1FactorOne == 1) : - l1Status.aL1FullLoad; - const bool bl1FullLoad = (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->bType_.type == CubeFormat::ND) ? - (l1Status.bL1FullLoad && kBl1FactorTwo == 1) : - l1Status.bL1FullLoad; - const bool bothFullLoad = (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->bType_.type == CubeFormat::ND) ? 
- (l1Status.bothFullLoad && kAl1FactorZero == 1 && kBl1FactorZero == 1) : - l1Status.bothFullLoad; - if (al1FullLoad && (res[IDX_ONE][IDX_SIX] < tmpLoadSize || (res[IDX_ONE][IDX_SIX] == tmpLoadSize && - res[IDX_ONE][IDX_ONE] + res[IDX_ONE][IDX_FOUR] >= tmpFactors[IDX_ONE] + tmpFactors[IDX_FOUR]))) { - tmpFactors = res[IDX_ONE]; - tmpLoadSize = tmpFactors[IDX_SIX]; - TILING_LOG_DEBUG("Select Mode One."); - } - if (bl1FullLoad && (res[IDX_TWO][IDX_SIX] < tmpLoadSize || (res[IDX_TWO][IDX_SIX] == tmpLoadSize && - res[IDX_TWO][IDX_ONE] + res[IDX_TWO][IDX_FOUR] >= tmpFactors[IDX_ONE] + tmpFactors[IDX_FOUR]))) { - tmpFactors = res[IDX_TWO]; - tmpLoadSize = tmpFactors[IDX_SIX]; - TILING_LOG_DEBUG("Select Mode Two."); - } - if (bothFullLoad && (res[IDX_ZERO][IDX_SIX] < tmpLoadSize || (res[IDX_ZERO][IDX_SIX] == tmpLoadSize && - res[IDX_ZERO][IDX_ONE] + res[IDX_ZERO][IDX_FOUR] >= tmpFactors[IDX_ONE] + tmpFactors[IDX_FOUR]))) { - tmpFactors = res[IDX_ZERO]; - TILING_LOG_DEBUG("Select Mode Zero."); - } - int32_t resL1Factors[L1_FACTORS_LEN] = {tmpFactors[IDX_ZERO], tmpFactors[IDX_THREE], tmpFactors[IDX_ONE], - tmpFactors[IDX_FOUR], tmpFactors[IDX_TWO], tmpFactors[IDX_FIVE]}; - l1Status.SetStatus(resL1Factors); -} - -void MatmulTilingAlgorithm::GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32_t& ubSize, - int32_t a1LengthCache, int32_t b1LengthCache) const -{ - const uint32_t aTypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - const uint32_t bTypeSize = DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); - const uint32_t cTypeSize = DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType); - const uint32_t biasTypeSize = DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); - - const int32_t a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * aTypeSize / BITS_PER_BYTE; - const int32_t b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * bTypeSize / BITS_PER_BYTE; - const int32_t c1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseM() * FP32_BYTES; - - if (tilingIns_->aType_.pos != TPosition::TSCM) { - l1Size += tilingIns_->tiling_.get_depthA1() * a1Length; - if (tilingIns_->enableL1CacheUB && tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - l1Size += tilingIns_->tiling_.get_depthAL1CacheUB() * a1LengthCache; - } - } - if (tilingIns_->bType_.pos != TPosition::TSCM) { - l1Size += tilingIns_->tiling_.get_depthB1() * b1Length; - if (tilingIns_->enableL1CacheUB && tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - l1Size += tilingIns_->tiling_.get_depthBL1CacheUB() * b1LengthCache; - } - } - - l0cSize += c1Length; - - if (static_cast(tilingIns_->tiling_.get_isBias())) { - if ((tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) && - tilingIns_->biasType_.pos != TPosition::TSCM) { - // for ascend910b1 bias: gm -> l1 -> bt, need extra l1 space, support bias transform - l1Size += tilingIns_->tiling_.get_baseN() * biasTypeSize; - } - } - - // in v100/v200, nd2nz and nz2nd was simulated with intrins, need extra ub space - // in V300, nd2nz was simulated with intrins, need extra ub space - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - // case2: input ND(GM/VECCALC), ND -> NZ transform, for now A/B reuse, only process with tail block, need UB space 
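GetL1Factors above keeps four candidate rows in res[][] and the selection prefers the smallest load size, breaking ties toward a larger mAL1 + nBL1. A simplified standalone version of that selection; the row layout mirrors res[][IDX_SEVEN] (kAL1, mAL1, dbAL1, kBL1, nBL1, dbBL1, loadSize):

    #include <array>
    #include <cstdint>
    #include <initializer_list>

    // candidates[i] = {kAL1, mAL1, dbAL1, kBL1, nBL1, dbBL1, loadSize};
    // eligible[i] marks rows whose full-load precondition actually held.
    // Row 3 (neither full load) is the fallback, as in the original selection.
    inline int PickL1Candidate(const std::array<std::array<int32_t, 7>, 4>& candidates,
                               const std::array<bool, 4>& eligible)
    {
        int best = 3;
        for (int idx : {1, 2, 0}) {  // AL1 full, BL1 full, both full: same order as above
            if (!eligible[idx]) {
                continue;
            }
            const auto& c = candidates[idx];
            const auto& b = candidates[best];
            if (c[6] < b[6] || (c[6] == b[6] && c[1] + c[4] >= b[1] + b[4])) {
                best = idx;
            }
        }
        return best;
    }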
- // (1) input GM, format is ND, need do zero-fill to non-aligned tail block in ub - // (2) input VECCALC, format is ND, need do zero-fill to non-aligned tail block in ub - int32_t aUbLength = 0; - int32_t bUbLength = 0; - if (!tilingIns_->aType_.isTrans && ((tilingIns_->tiling_.get_singleCoreK() * aTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { - aUbLength = tilingIns_->tiling_.get_baseM() * C0_BYTE_SIZE; - } - if (tilingIns_->aType_.isTrans && - ((tilingIns_->tiling_.get_singleCoreM() * aTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { - aUbLength = tilingIns_->tiling_.get_baseK() * C0_BYTE_SIZE; - } - - if (!tilingIns_->bType_.isTrans && ((tilingIns_->tiling_.get_singleCoreN() * bTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { - bUbLength = tilingIns_->tiling_.get_baseK() * C0_BYTE_SIZE; - } - if (tilingIns_->bType_.isTrans && - ((tilingIns_->tiling_.get_singleCoreK() * bTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { - bUbLength = tilingIns_->tiling_.get_baseN() * C0_BYTE_SIZE; - } - if (tilingIns_->aType_.pos == TPosition::TSCM) { - aUbLength = 0; - } - if (tilingIns_->bType_.pos == TPosition::TSCM) { - bUbLength = 0; - } - - if ((tilingIns_->aType_.type == CubeFormat::ND || tilingIns_->bType_.type == CubeFormat::ND)) { - ubSize += max(aUbLength, bUbLength); - } - - // V300 only needs extra buffer when INPUT are at GM/UB. - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - return; - } - - // case3: output GM/VECCAL, format is ND, for now not re-use input and ouput non-aligned, is related with db open - // (1) output GM, format is NZ/ND_ALIGN/ND, need restore in ub, ND and D is non-aligned , then add more 32B, ub->gm NZ->ND format and data type trnasform - // (2) output VECCALC,format is ND_ALIGN/ND(D alined), need doNZ->ND transform in ub - if (tilingIns_->cType_.pos == TPosition::GM) { - ubSize += tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseN() * cTypeSize; - if (tilingIns_->cType_.type == CubeFormat::ND && - (tilingIns_->tiling_.get_singleCoreN() * cTypeSize) % C0_BYTE_SIZE != 0) { - ubSize += C0_BYTE_SIZE; - } - } - if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type != CubeFormat::NZ) { - ubSize += tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseN() * cTypeSize; - } - // for V200/V100, if setquanttensor and output is not nd, need extra ubsize for copy tensor from gm to ub - if (tilingIns_->deqType == DequantType::TENSOR && tilingIns_->cType_.type == CubeFormat::NZ) { - ubSize += static_cast(tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(DataType::DT_UINT64)); - } - } - return; -} - -void MatmulTilingAlgorithm::GetBankConflictSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status, - int32_t& length, bool isAMatrix) const -{ - constexpr int blockSize = 32; - constexpr int bankLen = 512; - bool isBankConflict = false; - int bankConflictSize = 0; - const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - if (isAMatrix) { - if (tilingIns_->aType_.isTrans) { - isBankConflict = - MathUtil::CeilDivision(l1Status.mAL1 * l0Status.mL0 * C0_SIZE, C0_SIZE) * - blockSize % bankLen == - 0 ? 
- true : - false; - bankConflictSize = l0Status.kL0 * reduceSize * C0_SIZE * - MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - } else { - isBankConflict = - MathUtil::CeilDivision(MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0 * reduceSize, - C0_SIZE) * blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = l0Status.mL0 * C0_SIZE * C0_SIZE * l1Status.mAL1 * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - } - } else { - if (tilingIns_->bType_.isTrans) { - isBankConflict = - MathUtil::CeilDivision(MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0 * reduceSize, - C0_SIZE) * blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = l0Status.nL0 * C0_SIZE * C0_SIZE * l1Status.nBL1 * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - } else { - isBankConflict = - MathUtil::CeilDivision(l1Status.nBL1 * l0Status.nL0 * C0_SIZE, C0_SIZE) * - blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = l0Status.kL0 * reduceSize * C0_SIZE * - MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - } - } - if (isBankConflict) { - length = length + bankConflictSize; - } -} - -void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) const -{ - constexpr int blockSize = 32; - constexpr int bankLen = 512; - bool isBankConflict = false; - int bankConflictSize = 0; - if (isAMatrix) { - if (tilingIns_->aType_.isTrans) { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepM() * tilingIns_->tiling_.get_baseM(), C0_SIZE) * - blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKa() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - } else { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * - blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseM() * C0_SIZE * tilingIns_->tiling_.get_stepM() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - } - } else { - if (tilingIns_->bType_.isTrans) { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * - blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseN() * C0_SIZE * tilingIns_->tiling_.get_stepN() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - } else { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepN() * tilingIns_->tiling_.get_baseN(), C0_SIZE) * - blockSize % bankLen == - 0 ? 
- true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKb() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - } - } - if (isBankConflict) { - length = length + bankConflictSize; - } -} - -int32_t MatmulTilingAlgorithm::GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const -{ - int32_t a1Length = 0; - const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - if (IsUbNd2Nz()) { - // A matrix ND2NZ - if (tilingIns_->aType_.type == CubeFormat::ND) { - a1Length = l0Status.mL0 * C0_SIZE * l0Status.kL0 * reduceSize * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - a1Length = a1Length * MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l1Status.mAL1; - } - // bank conflict - GetBankConflictSize(l1Status, l0Status, a1Length, true); - } - } - return a1Length; -} - -int32_t MatmulTilingAlgorithm::GetBL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const -{ - int32_t b1Length = 0; - const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - if (IsUbNd2Nz()) { - // B matrix ND2NZ - if (tilingIns_->bType_.type == CubeFormat::ND) { - b1Length = l0Status.nL0 * C0_SIZE * l0Status.kL0 * reduceSize * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - b1Length = b1Length * MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l1Status.nBL1; - } - // bank conflict - GetBankConflictSize(l1Status, l0Status, b1Length, false); - } - } - return b1Length; -} - -bool MatmulTilingAlgorithm::IsUbNd2Nz() const -{ - if (tilingIns_->enVecND2NZ && tilingIns_->mmConfigType == 1 && - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - return true; - } - return false; -} - -void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const -{ - int32_t a1Length = 0; - int32_t b1Length = 0; - int32_t c1Length = 0; - int32_t biasLength = 0; - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - // A matrix ND2NZ - if (tilingIns_->aType_.type == CubeFormat::ND) { - a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - a1Length = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); - } - // bank conflict - GetBankConflictSize(a1Length, true); - } - // B matrix ND2NZ - if (tilingIns_->bType_.type == CubeFormat::ND - || (tilingIns_->bType_.dataType == DataType::DT_INT8 && tilingIns_->bType_.type == CubeFormat::NZ && - tilingIns_->bType_.isTrans == false)) { - b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - b1Length = b1Length * tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); - } - // bank conflict - GetBankConflictSize(b1Length, false); - } - // C matrix NZ2ND - if (tilingIns_->cType_.type == CubeFormat::ND || tilingIns_->cType_.pos == TPosition::GM) { - c1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseM() * - DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType); - 
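Both GetBankConflictSize overloads above apply the same rule when sizing the ND2NZ staging buffers: if a tile's row count times the 32-byte block lands on a multiple of the 512-byte bank length, successive rows hit the same bank, so one extra block row of padding is budgeted. A reduced sketch of that check (helper name and parameters are illustrative):

    #include <cstdint>

    // rows       : number of 16-element rows staged for the tile
    // lengthBytes: unpadded staging-buffer length
    // padBytes   : extra row added when the stride is bank-aligned
    inline int32_t PadForBankConflict(int32_t rows, int32_t lengthBytes, int32_t padBytes)
    {
        constexpr int32_t kBlockBytes = 32;  // one 16-element fp16 row
        constexpr int32_t kBankBytes = 512;  // bank length used in the checks above
        const bool conflict = (rows * kBlockBytes) % kBankBytes == 0;
        return conflict ? lengthBytes + padBytes : lengthBytes;
    }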
} - // Bias - if (tilingIns_->isBias && tilingIns_->biasType_.pos != TPosition::VECCALC) { - biasLength = tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); - } - // quant tensor - if (tilingIns_->aType_.dataType == DataType::DT_INT8) { - int32_t quantLength = tilingIns_->tiling_.get_baseN() * sizeof(uint64_t); - biasLength = max(quantLength, biasLength); - } - } - - transLength = max(max(a1Length, b1Length), max(c1Length, biasLength)); -} - -bool MatmulTilingAlgorithm::CheckBaseMN() const -{ - // check bias table - if ((tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) && - tilingIns_->isBias && (tilingIns_->baseN > MAX_BIAS_N * C0_SIZE) && tilingIns_->isSupportL0c2Out) { - return false; - } - if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) { - return (tilingIns_->baseM * tilingIns_->baseN * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && - tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize && - tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize); - } - if (tilingIns_->baseM != -1) { - return (tilingIns_->baseM * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && - tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize); - } - if (tilingIns_->baseN != -1) { - return (tilingIns_->baseN * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && - tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize); - } - return true; -} - -int32_t MatmulTilingAlgorithm::GetIteratorOrder(const SingleCoreStatus& singleCoreStatus, const int32_t singleCoreM, - const int32_t singleCoreN, const int32_t singleCoreK) const -{ - if (tilingIns_->traverse_ != MatrixTraverse::NOSET) { - return static_cast(tilingIns_->traverse_) - 1; - } - const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - const bool fullkAL1Load = - (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kAL1 * reduceSize)) > 1.0 ? false : true; - bool fullkBL1Load = - (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kBL1 * reduceSize)) > 1.0 ? false : true; - - // if KAL1 and KBL1 both can not be full loaded, then select m or n which is no matter - if (!fullkAL1Load && !fullkBL1Load) { - return static_cast(MatrixTraverse::FIRSTM) - 1; - } else if (fullkAL1Load && !fullkBL1Load) { // if KAL1 is full loaded, then select the order N fist - return static_cast(MatrixTraverse::FIRSTN) - 1; - } else if (!fullkAL1Load && fullkBL1Load) { // if KBL1 is full loaded, then select the order M fist - return static_cast(MatrixTraverse::FIRSTM) - 1; - } else { - // if AL1LoadSize less then BL1LoadSize, then select order N first, vice versa. - const int32_t mLoop = MathUtil::CeilDivision(singleCoreM, - singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l0Status.mL0 * C0_SIZE); - const int32_t nLoop = MathUtil::CeilDivision(singleCoreN, - singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l0Status.nL0 * C0_SIZE); - const int32_t aL1LoadSize = singleCoreM + singleCoreN * mLoop; - const int32_t bL1LoadSize = singleCoreN + singleCoreM * nLoop; - return aL1LoadSize < bL1LoadSize ? 
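GetIteratorOrder above reduces to a small decision tree: keep the operand whose K tile already covers singleCoreK resident and iterate the other axis first; when both cover K, pick the order with the smaller re-load volume. A standalone sketch of that decision, returning 0 for M-first and 1 for N-first to mirror MatrixTraverse - 1:

    #include <cstdint>

    inline int32_t ChooseIterateOrder(bool kAL1CoversK, bool kBL1CoversK,
                                      int64_t aL1LoadSize, int64_t bL1LoadSize)
    {
        if (kAL1CoversK && !kBL1CoversK) {
            return 1;  // A stays resident along K: walk N first and reuse A
        }
        if (!kAL1CoversK && kBL1CoversK) {
            return 0;  // B stays resident along K: walk M first and reuse B
        }
        if (!kAL1CoversK && !kBL1CoversK) {
            return 0;  // neither covers K: default to M-first, as above
        }
        return aL1LoadSize < bL1LoadSize ? 1 : 0;  // both cover K: cheaper reload wins
    }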
1 : 0; - } -} - -void MatmulTilingAlgorithm::UpdateBlockDimCalculator(BlockDimCalculator& blockDimRes) const -{ - if (blockDimRes.totalLoadSize > blockDimRes.tmpLoadSize) { - blockDimRes.bmatSize = blockDimRes.tmpBmatSize; - blockDimRes.amatSize = blockDimRes.tmpAmatSize; - blockDimRes.totalLoadSize = blockDimRes.tmpLoadSize; - blockDimRes.tmpValue = 0; - } -} - -void MatmulTilingAlgorithm::CalcLoadSize(const DimFactor& blockDims, const CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes, const MatmulRunParas& params) const -{ - blockDimRes.totalLoadSize = INT_MAX; - // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) - const int32_t totalSize = blockDimRes.amatSize + blockDimRes.bmatSize; // batch==1 - constexpr int32_t minMNSize = 16; - constexpr int32_t minKSize = 64; - constexpr int32_t minTotalSize = 128; - const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 - const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); - const int32_t k0 = (m0 != 0 && n0 != 0) ? - min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; - const int32_t dbBuffer = 2; - - // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) - // loadsize = K*(N*mdim+M*ndim) - const bool bothFullLoad = static_cast(totalSize) * static_cast(blockDimRes.kBytes) <= - static_cast(tilingIns_->bufferPool_.l1Size); - const bool afulloadPlsBKFullLoad = - static_cast(blockDimRes.amatSize + n0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= - static_cast(tilingIns_->bufferPool_.l1Size); - const bool bfulloadPlsaKFullLoad = - static_cast(blockDimRes.bmatSize + m0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= - static_cast(tilingIns_->bufferPool_.l1Size); - if (afulloadPlsBKFullLoad || bfulloadPlsaKFullLoad || bothFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - return; - } - - // A kdim not fullload + B kdim not fullload(9) - // loadsize = M*K*N*(1/m0+1/n0) - const bool aKNotfulloadPlsbKNotFullLoad = - (n0 * blockDimRes.kBytes + m0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > - tilingIns_->bufferPool_.l1Size && - (m0 * blockDimRes.kBytes + n0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > - tilingIns_->bufferPool_.l1Size; - if (aKNotfulloadPlsbKNotFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - return; - } - - // A kdim fullload + B kdim fullload(5) - // M*K*(ndim+N/m1) or N*K*(mdim+M/n1) - const bool aKfulloadPlsbKFullLoad = (m0 + n0) * blockDimRes.kBytes * dbBuffer <= tilingIns_->bufferPool_.l1Size; - if (aKfulloadPlsbKFullLoad) { - const int32_t m1 = MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - n0 * - blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * m0)) * m0; - const int32_t n1 = MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - m0 * - blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * n0)) * n0; - const int32_t mfirstLoad = - blockDimRes.oriAmatSize * blockDims.n + blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); - int32_t 
nfirstLoad = - blockDimRes.oriBmatSize * blockDims.m + blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); - if (mfirstLoad < nfirstLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); - } else { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; - } - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - return; - } - - // A fullload + B Kdim not fullload or A K fullload + B Kdim not fullload(3/6) - // mdim = coreNum; ndim = 1; - // loadsize = M*K*(ndim+N/m0) - const bool afulloadPlsbKNotFullLoad = (blockDimRes.amatSize * blockDimRes.kBytes + - n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; - const bool aKfulloadPlsbKNotFullLoad = (m0 * blockDimRes.kBytes * dbBuffer + - n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; - if (afulloadPlsbKNotFullLoad || aKfulloadPlsbKNotFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - } - - // A kdim not fullload + B fullload or A kdim not fullload + B kdim fullload(7/8) - // loadsize = N*K*(mdim+M/n0) - const bool aKNotfulloadPlsbFullLoad = (blockDimRes.bmatSize * blockDimRes.kBytes + - m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; - const bool aKNotfulloadPlsbKFullLoad = (n0 * blockDimRes.kBytes * dbBuffer + - m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; - if (aKNotfulloadPlsbFullLoad || aKNotfulloadPlsbKFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriBmatSize * blockDims.m; - blockDimRes.tmpBmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - } -} - -int32_t MatmulTilingAlgorithm::LoopNumFromSingleCoreToL0(const CoreStatusPack& coreStatus, - const DimFactor& blockDimsFactor) const -{ - if (!blockDimsFactor.IsValid()) { - return 0; - } - constexpr int32_t minTotalSize = 128; - constexpr int32_t minSize = 64; - constexpr int32_t minN0Size = 16; - int32_t n0 = min(min(minN0Size, coreStatus.n), minSize); - int32_t m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); - n0 = (m0 == 0) ? 0 : min(min(coreStatus.n, minTotalSize / m0), minSize); - m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); - const int32_t k0 = (m0 != 0 && n0 != 0) ? 
- min(min(minSize / m0, minSize / n0), coreStatus.k) : coreStatus.k; - const int32_t loopNum = MathUtil::CeilDivision(coreStatus.m, m0) * MathUtil::CeilDivision(coreStatus.n, n0) * - MathUtil::CeilDivision(coreStatus.k, k0); - return loopNum; -} - -int32_t MatmulTilingAlgorithm::GetBigPackageCondition(const CoreStatusPack &coreStatus, - const BlockDimCalculator &blockDimRes, const MatmulRunParas ¶ms) const -{ - if (tilingIns_->bType_.isTrans == true && tilingIns_->aType_.isTrans == false) { - return ATTACH_FLAG_ZERO; - } - const int minSize = 16; - bool flag = true; - if (tilingIns_->bType_.isTrans == false) { - if (params.n32 >= minSize && coreStatus.n < minSize) { - flag = false; - } - } - if (tilingIns_->aType_.isTrans) { - if (params.m32 >= minSize && coreStatus.m < minSize) { - flag = false; - } - } - - if (!blockDimRes.bigPackage && !flag) { - return ATTACH_FLAG_ZERO; - } else if (!blockDimRes.bigPackage && flag) { - return ATTACH_FLAG_TWO; - } else if (blockDimRes.bigPackage && !flag) { - return ATTACH_FLAG_ONE; - } else { - return ATTACH_FLAG_ZERO; - } -} - -void MatmulTilingAlgorithm::GetBlockDimHelper(const DimFactor& blockDim, CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes, const MatmulRunParas& params) -{ - blockDimRes.kNum = (blockDim.k == 0) ? 0 : (params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE); // contain k * 16 - blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 - coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDim.batch); - coreStatus.m = MathUtil::CeilDivision(params.m32, blockDim.m); - coreStatus.n = MathUtil::CeilDivision(params.n32, blockDim.n); - coreStatus.k = (blockDim.k == 0) ? 0 : (params.k32 / blockDim.k); - if (tilingIns_->enableSplitK_) { - if (params.kMapped != params.k32) { // need check--splitK - blockDimRes.kNum = params.kMapped / blockDim.k * NUM_TWO * C0_SIZE * REDUCE_BLOCK_SIZE; - coreStatus.k = params.kMapped / blockDim.k * NUM_TWO; - } - } - - // load size of A matrix is batch * m - // load size of B matrix is n - blockDimRes.oriAmatSize = params.batch32 * params.m32; - blockDimRes.oriBmatSize = params.oriShapeBbatch > 1 ? params.batch32 * params.n32 : params.n32; - blockDimRes.amatSize = coreStatus.batch * coreStatus.m; - blockDimRes.bmatSize = params.oriShapeBbatch > 1 ? coreStatus.batch * coreStatus.n : coreStatus.n; - blockDimRes.tmpValue = 0; - CalcLoadSize(blockDim, coreStatus, blockDimRes, params); - if (tilingIns_->enableSplitK_) { - blockDimRes.totalLoadSize *= coreStatus.k; - } - - // updateSolution: bool whether update to a new block factor solution - // has smaller LoadSize or the same LoadSize but batch - const int bigpackageFlag = GetBigPackageCondition(coreStatus, blockDimRes, params); - const bool updateConditionBp = bigpackageFlag == 0 ? false : true; - bool updateConditionBp2 = bigpackageFlag == 2 ? true : false; - bool updateConditionBp3 = bigpackageFlag == 1 ? false : true; - - const int32_t loopNum = LoopNumFromSingleCoreToL0(coreStatus, blockDim); - const bool updateConditionCoreUsed = (!updateConditionBp) && ((loopNum < blockDimRes.loopNumToL0) || - (blockDim.ReduceMul() > blockDimRes.coreUse && loopNum == blockDimRes.loopNumToL0)); - const bool updateConditionLoadsize = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse) && - blockDimRes.totalLoadSize < blockDimRes.minLoadSize; - const int32_t orgBatchM = params.oriShapeAbatch > 1 ? blockDimRes.batchDimFactor : blockDimRes.mDimFactor; - const int32_t curBatchM = params.oriShapeAbatch > 1 ? 
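LoopNumFromSingleCoreToL0 above estimates how many L0 base blocks a candidate core split implies, and GetBlockDimHelper later prefers splits with fewer inner loops. A condensed sketch of that estimate; the 16/64/128 caps follow the constants used above and CeilDiv is a local helper:

    #include <algorithm>
    #include <cstdint>

    inline int32_t CeilDiv(int32_t a, int32_t b) { return b == 0 ? 0 : (a + b - 1) / b; }

    // m, n, k are per-core extents in 16-element fractal units.
    inline int32_t EstimateL0LoopNum(int32_t m, int32_t n, int32_t k)
    {
        constexpr int32_t kMaxMn0 = 64, kMinN0 = 16, kMnBudget = 128, kKBudget = 64;
        int32_t n0 = std::min({kMinN0, n, kMaxMn0});
        int32_t m0 = (n0 == 0) ? 0 : std::min({m, kMnBudget / n0, kMaxMn0});
        n0 = (m0 == 0) ? 0 : std::min({n, kMnBudget / m0, kMaxMn0});
        m0 = (n0 == 0) ? 0 : std::min({m, kMnBudget / n0, kMaxMn0});
        const int32_t k0 = (m0 != 0 && n0 != 0) ? std::min({kKBudget / m0, kKBudget / n0, k}) : k;
        return CeilDiv(m, m0) * CeilDiv(n, n0) * CeilDiv(k, k0);
    }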
blockDim.batch : blockDim.m; - const bool updateConditionBatchNDim = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse && - blockDimRes.totalLoadSize == blockDimRes.minLoadSize) && - ((blockDimRes.nDimFactor * orgBatchM < curBatchM * blockDim.n) || - (blockDimRes.nDimFactor * orgBatchM == curBatchM * blockDim.n && - blockDimRes.batchDimFactor < blockDim.batch)); - - const bool policyCondition = - UserPolicy(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, - coreStatus, blockDimRes); - if ((updateConditionBp2 || updateConditionCoreUsed || updateConditionLoadsize || updateConditionBatchNDim) && - policyCondition && updateConditionBp3) { - blockDimRes.minLoadSize = blockDimRes.totalLoadSize; - blockDimRes.nDimFactor = blockDim.n; - blockDimRes.batchDimFactor = blockDim.batch; - blockDimRes.mDimFactor = blockDim.m; - blockDimRes.kDimFactor = blockDim.k; - blockDimRes.coreUse = blockDim.ReduceMul(); - blockDimRes.loopNumToL0 = loopNum; - blockDimRes.finalValue = blockDimRes.tmpValue; - const int32_t minSize = 16; - blockDimRes.bigPackage = (!tilingIns_->bType_.isTrans ? coreStatus.n >= minSize : true) && - (tilingIns_->aType_.isTrans ? coreStatus.m >= minSize : true) && (blockDim.n * blockDim.m * blockDim.k > 1); - splitCoreFlag_ = true; - } -} - -bool MatmulTilingAlgorithm::UserPolicy(const TilingPolicy policy, const CoreStatusPack& coreStatus, - const BlockDimCalculator& blockDimRes) const -{ - constexpr int32_t minMNSize = 16; - constexpr int32_t minKSize = 64; - constexpr int32_t minTotalSize = 128; - const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 - const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); - const int32_t k0 = (m0 != 0 && n0 != 0) ? 
min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; - - if (policy == TilingPolicy::FIXED_B_TSCM) { - const int32_t alignFactor = MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE); - if (coreStatus.n < alignFactor) { - return false; - } - const int32_t alignNLength = MathUtil::Align(coreStatus.n, alignFactor); - const int32_t bMatrixSize = alignNLength * blockDimRes.kBytes * 2; - int32_t aMatrixSize = m0 * k0 * C0_SIZE * C0_BYTE_SIZE; - int32_t biasSize = 0; - if (tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - biasSize = alignNLength * C0_SIZE * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); - } - if (bMatrixSize + aMatrixSize + biasSize <= tilingIns_->bufferPool_.l1Size) { - return true; - } else { - return false; - } - } else if (policy == TilingPolicy::FIXED_A_TSCM) { - return false; - } else if (policy == TilingPolicy::FIXED_A_B_TSCM) { - return false; - } else { - return true; - } -} - -bool MatmulTilingAlgorithm::PreProcessMiniShape(const std::string& opType, CoreStatusPack& coreStatus, - MatmulRunParas& params, const int32_t& coreNum, bool splitKFlag) const -{ - (void)(opType); - // experience value for mini shape - const int32_t miniL0cThreshold = tilingIns_->bufferPool_.l0CSize / MIN_FRACTAL_SIZE / FP32_BYTES; - const int32_t miniL0abThreshold = tilingIns_->bufferPool_.l0ASize / (C0_SIZE * C0_BYTE_SIZE); - // tend to use less cores for shapes with batch less than coreNum and m/k/n can full load in - // aicore buffers split_k is conflict with m/n shift_inwards - bool specialScenario = false; - if (params.n32 > MIN_MTE1_LOAD) { - specialScenario = specialScenario || - (splitKFlag && ((static_cast(params.nMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); - } - if (params.m32 > MIN_MTE1_LOAD) { - specialScenario = specialScenario || - (splitKFlag && ((static_cast(params.mMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); - } - - if (params.batch32 * params.n32 * params.m32 <= coreNum && params.m32 * params.k32 <= miniL0abThreshold && - params.n32 * params.k32 <= miniL0abThreshold && params.m32 * params.n32 <= miniL0cThreshold && - !specialScenario) { - coreStatus.batchDim = params.batch32; - coreStatus.nDim = params.n32 <= MIN_MTE1_LOAD ? 1 : params.nMapped / MIN_MTE1_LOAD; - coreStatus.mDim = params.m32 <= MIN_MTE1_LOAD ? 1 : params.mMapped / MIN_MTE1_LOAD; - int32_t kDimCandidate[2] = {0}; // storage 2 factors of k around kDim - GetTwoFactors(kDimCandidate, coreStatus.kDim, params.k32, coreNum); - coreStatus.kDim = (params.k32 <= MIN_MTE1_LOAD || !splitKFlag) ? - 1 : - (kDimCandidate[1] > 1 ? kDimCandidate[1] : kDimCandidate[0]); - coreStatus.batch = 1; - coreStatus.n = coreStatus.nDim == 1 ? params.n32 : MathUtil::CeilDivision(params.nMapped, coreStatus.nDim); - coreStatus.m = coreStatus.mDim == 1 ? params.m32 : MathUtil::CeilDivision(params.mMapped, coreStatus.mDim); - coreStatus.k = coreStatus.kDim == 1 ? params.k32 : MathUtil::CeilDivision(params.kMapped, coreStatus.kDim); - params.nonFactorK = (coreStatus.kDim == 0) ? false : (params.k32 % coreStatus.kDim == 0 ? 
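The FIXED_B_TSCM branch of UserPolicy above only accepts a core split if the whole aligned B panel, one minimal A tile and the bias row still fit in L1. A simplified standalone version of that feasibility test; the parameter names are illustrative and the factor of 2 on the B panel is kept from the original check:

    #include <cstdint>

    // nAligned : single-core N aligned up for TSCM, in 16-element units
    // kBytes   : bytes of one 16-row slice of the K extent kept in L1
    // m0, k0   : minimal streaming A tile, in fractal units
    // biasBytes: bias row bytes when the bias table is used, else 0
    inline bool TscmBFitsL1(int32_t nAligned, int32_t kBytes, int32_t m0, int32_t k0,
                            int32_t biasBytes, int32_t l1SizeBytes)
    {
        constexpr int32_t kC0Size = 16;
        constexpr int32_t kC0ByteSize = 32;
        const int32_t bPanelBytes = nAligned * kBytes * 2;           // factor 2 as in the check above
        const int32_t aTileBytes = m0 * k0 * kC0Size * kC0ByteSize;  // one A tile streamed past B
        return bPanelBytes + aTileBytes + biasBytes <= l1SizeBytes;
    }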
false : true); - return true; - } - return false; -} -float MatmulTilingAlgorithm::CalculateBlockCycles(int32_t baseM, int32_t baseN, int32_t baseK) const -{ - const int32_t reduceBlockSize = C0_BYTE_SIZE * BITS_PER_BYTE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - return static_cast(baseM * baseN * baseK) / (C0_SIZE * C0_SIZE * reduceBlockSize); -} - -int32_t MatmulTilingAlgorithm::CalculateMemoryTraffic(int32_t baseM, int32_t baseN, int32_t baseK) const -{ - int32_t aMatrixSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - int32_t bMatrixSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - return aMatrixSize + bMatrixSize; -} - -bool MatmulTilingAlgorithm::AlignSingleShape(bool needAlign, int32_t orgShape, int32_t factor, int32_t alignSize, - int32_t &singleShape) const -{ - singleShape = MathUtil::CeilDivision(orgShape, factor); - if (!needAlign || alignSize == 0 || orgShape % alignSize != 0) { - return true; // orgShape not align, don't need to adjust - } - if (factor <= 1) { - return true; - } - int32_t maxSingleShape = MathUtil::CeilDivision(orgShape, factor - 1); - int32_t alignSingleShape = MathUtil::Align(singleShape, alignSize); - if (alignSingleShape >= maxSingleShape) { - return false; - } - singleShape = alignSingleShape; - return true; -} - -ComputeBaseBlock MatmulTilingAlgorithm::GetMultiCoreBasicBlock(const MatmulRunParas& params) const -{ - (void)params; - constexpr static int32_t l0c256KB = 262144; - constexpr static int32_t basicSize128 = 128; - constexpr static int32_t basicSize256 = 256; - int32_t basicM = basicSize128; - if (tilingIns_->bufferPool_.l0CSize == l0c256KB) { - basicM = basicSize256; - } - int32_t basicN = basicSize256; - int32_t aDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) != 0 ? - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) : 1; - int32_t basicK = basicSize128 * BITS_PER_BYTE / aDtypeSize; - ComputeBaseBlock basicBlock {basicM, basicN, basicK}; - // SetFixSplit - if (tilingIns_->baseM != -1) { - basicBlock.baseM = tilingIns_->baseM; - } - if (tilingIns_->baseN != -1) { - basicBlock.baseN = tilingIns_->baseN; - } - if (!tilingIns_->aType_.isTrans && !tilingIns_->bType_.isTrans) { - return basicBlock; - } - if (tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { - basicBlock.baseM = tilingIns_->baseM != -1 ? basicBlock.baseM : basicSize256; - basicBlock.baseN = tilingIns_->baseN != -1 ? 
basicBlock.baseN : basicSize128; - return basicBlock; - } - - return basicBlock; -} - -float MatmulTilingAlgorithm::CalcBaseBlockBandRatio(int32_t mDim, int32_t nDim, const ComputeBaseBlock &baseBlock) const -{ - float bandRatio = static_cast((numOfBlock_ - mDim) * baseBlock.baseM + (numOfBlock_ - nDim) * baseBlock.baseN) / - static_cast((baseBlock.baseM + baseBlock.baseN) * numOfBlock_); - return bandRatio; -} - -ComputeIntensity MatmulTilingAlgorithm::CalcComputeIntensity(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, - const std::pair &factor) const -{ - auto mFactor = factor.first; - auto nFactor = factor.second; - int32_t sm = 0; - int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - bool alignSuccA = AlignSingleShape(tilingIns_->aType_.isTrans, params.oriShapeM, mFactor, aAlignSize, sm); - int32_t sn = 0; - int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; - bool alignSuccB = AlignSingleShape(!tilingIns_->bType_.isTrans, params.oriShapeN, nFactor, bAlignSize, sn); - auto shapeM = MathUtil::DivideIntoMainAndTail(sm, baseBlock.baseM); - auto shapeN = MathUtil::DivideIntoMainAndTail(sn, baseBlock.baseN); - auto mainM = shapeM.first; - auto tailM = shapeM.second; - auto mainN = shapeN.first; - auto tailN = shapeN.second; - int32_t memoryRatio = (alignSuccA && alignSuccB) ? 1 : 2; - std::vector blocks; - float bandRatio = CalcBaseBlockBandRatio(mFactor, nFactor, baseBlock); - // Main Chunk - if (mainM > 0 && mainN > 0) { - int count = mainM * mainN; - float cycles = CalculateBlockCycles(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; - int32_t memory = memoryRatio * - CalculateMemoryTraffic(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; - blocks.push_back({count, cycles, memory}); - } - // N Tail Chunk - if (mainM > 0 && tailN > 0) { - float cycles = CalculateBlockCycles(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; - int32_t memory = memoryRatio * CalculateMemoryTraffic(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; - blocks.push_back({mainM, cycles, memory}); - } - // M Tail Chunk - if (tailM > 0 && mainN > 0) { - float cycles = CalculateBlockCycles(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; - int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; - blocks.push_back({mainN, cycles, memory}); - } - // M and N Tail Chunk - if (tailM > 0 && tailN > 0) { - float cycles = CalculateBlockCycles(tailM, tailN, baseBlock.baseK); - int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, tailN, baseBlock.baseK); - blocks.push_back({1, cycles, memory}); - } - float totalCycles = 0; - int32_t totalMemory = 0; - for (const auto& v : blocks) { - totalCycles += v.computeCycle; - totalMemory += v.memoryTraffic; - } - return { - {mFactor, nFactor}, totalCycles, (totalMemory != 0) ? 
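CalcComputeIntensity above weighs each chunk of base blocks by estimated cube cycles per byte moved, so splits that stay compute bound score higher. A minimal version of the per-block arithmetic for fp16 inputs, mirroring CalculateBlockCycles and CalculateMemoryTraffic:

    #include <cstdint>

    // One 16 x 16 x reduceBlock sub-block counts as one cycle (reduceBlock = 16 for fp16).
    inline float BlockCycles(int32_t baseM, int32_t baseN, int32_t baseK)
    {
        constexpr int32_t kC0 = 16;
        constexpr int32_t kReduceBlock = 16;
        return static_cast<float>(baseM) * baseN * baseK / (kC0 * kC0 * kReduceBlock);
    }

    // Bytes of A and B fetched for the same block (fp16 = 2 bytes per element).
    inline int32_t BlockTrafficBytes(int32_t baseM, int32_t baseN, int32_t baseK)
    {
        constexpr int32_t kBytesPerElem = 2;
        return (baseM * baseK + baseN * baseK) * kBytesPerElem;
    }

    // Compute intensity: cycles per byte; larger values favour the candidate split.
    inline float ComputeIntensity(int32_t baseM, int32_t baseN, int32_t baseK)
    {
        const int32_t traffic = BlockTrafficBytes(baseM, baseN, baseK);
        return traffic == 0 ? 0.0f : BlockCycles(baseM, baseN, baseK) / traffic;
    }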
totalCycles / totalMemory : 0, bandRatio}; -} - -MultiCoreScenario MatmulTilingAlgorithm::GetMultiCoreScenario(const MatmulRunParas& params) const -{ - if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { - return MultiCoreScenario::OTHERS; - } - if (tilingIns_->enableSplitK_ || tilingIns_->singleM != -1 || tilingIns_->singleN != -1) { - return MultiCoreScenario::OTHERS; - } - constexpr int64_t mnLimit = 26214; - constexpr int64_t mLimit = 128; - if (params.oriShapeM >= mLimit && params.oriShapeM * params.oriShapeN > mnLimit * numOfBlock_) { - return MultiCoreScenario::SPLIT_MN; - } - return MultiCoreScenario::OTHERS; -} - -void MatmulTilingAlgorithm::UpdateStepK(const ComputeBaseBlock &baseBlock, int32_t &stepK) const -{ - if (stepK * baseBlock.baseK >= GetSingleK()) { - return; - } - constexpr static int32_t baseBlockSize512 = 512; - constexpr static int32_t baseBlockSize256 = 256; - int32_t aTypeBitSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize512) { - if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0) && - (baseBlockSize512 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { - while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0 && stepK > 1) { - stepK--; - } - } - } else if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize256) { - if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0) && - (baseBlockSize256 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { - while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0 && stepK > 1) { - stepK--; - } - } - } -} - -void MatmulTilingAlgorithm::CalcL1Tiling(const ComputeBaseBlock &baseBlock, int32_t &depthA1, int32_t &depthB1, - int32_t &stepKa, int32_t &stepKb) const -{ - int32_t l1Size = tilingIns_->bufferPool_.l1Size; - constexpr static int32_t reservedL1Size = 256; // l1 reserved 256B - int32_t depthA1Size = (l1Size / DB_ON / baseBlock.baseM / baseBlock.baseK) * BITS_PER_BYTE / - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - int32_t depthB1Size = ((l1Size + reservedL1Size) / DB_ON / baseBlock.baseN / baseBlock.baseK) * BITS_PER_BYTE / - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); - int32_t btSize = tilingIns_->isBias ? tilingIns_->bufferPool_.btSize / BITS_PER_BYTE : 0; - if (depthA1Size + depthB1Size > l1Size - btSize) { - if (baseBlock.baseM <= baseBlock.baseN) { - depthA1Size = depthA1Size / DB_ON; - } else { - depthB1Size = depthB1Size / DB_ON; - } - } - int32_t l1Db = g_tempCfg.l1DB == DB_OFF ? 
DB_OFF : DB_ON; - stepKa = depthA1Size / l1Db; - stepKb = depthB1Size / l1Db; - UpdateStepK(baseBlock, stepKa); - UpdateStepK(baseBlock, stepKb); - if (stepKa >= stepKb && stepKb != 0) { - stepKa = stepKa / stepKb * stepKb; - } else if (stepKa != 0) { - stepKb = stepKb / stepKa * stepKa; - } - depthA1 = stepKa * l1Db; - depthB1 = stepKb * l1Db; -} - -L0StatusPack MatmulTilingAlgorithm::GetL0CoreStatus(const ComputeBaseBlock &baseBlock) const -{ - L0StatusPack l0Status; - const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - l0Status.dbL0C = g_tempCfg.l0cDB; - if (baseBlock.baseM * baseBlock.baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { - l0Status.dbL0C = DB_OFF; - } - l0Status.dbL0A = DB_ON; - l0Status.dbL0B = DB_ON; - l0Status.mL0 = baseBlock.baseM / C0_SIZE; - l0Status.kL0 = baseBlock.baseK / reduceSize; - l0Status.nL0 = baseBlock.baseN / C0_SIZE; - return l0Status; -} - -L1StatusPack MatmulTilingAlgorithm::GetL1CoreStatus(const ComputeBaseBlock &baseBlock, int32_t depthA1, int32_t depthB1, - int32_t stepKa, int32_t stepKb) const -{ - L1StatusPack l1Status; - l1Status.mAL1 = 1; - l1Status.nBL1 = 1; - const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - l1Status.kAL1 = baseBlock.baseK / reduceSize * stepKa; - l1Status.kBL1 = baseBlock.baseK / reduceSize * stepKb; - l1Status.dbAL1 = depthA1 >= stepKa * DB_ON ? DB_ON : DB_OFF; - l1Status.dbBL1 = depthB1 >= stepKb * DB_ON ? DB_ON : DB_OFF; - return l1Status; -} - -void MatmulTilingAlgorithm::UpdateShapeAndLayout() const -{ - tilingIns_->tiling_.set_M(tilingIns_->orgM); - tilingIns_->tiling_.set_N(tilingIns_->orgN); - tilingIns_->tiling_.set_Ka(tilingIns_->orgKa); - tilingIns_->tiling_.set_Kb(tilingIns_->orgKb); - tilingIns_->tiling_.set_batchM(tilingIns_->batchM); - tilingIns_->tiling_.set_batchN(tilingIns_->batchN); - tilingIns_->tiling_.set_singleBatchM(tilingIns_->singleBatchM); - tilingIns_->tiling_.set_singleBatchN(tilingIns_->singleBatchN); - - tilingIns_->tiling_.set_ALayoutInfoB(tilingIns_->aLayoutInfoB); - tilingIns_->tiling_.set_ALayoutInfoS(tilingIns_->aLayoutInfoS); - tilingIns_->tiling_.set_ALayoutInfoN(tilingIns_->aLayoutInfoN); - tilingIns_->tiling_.set_ALayoutInfoG(tilingIns_->aLayoutInfoG); - tilingIns_->tiling_.set_ALayoutInfoD(tilingIns_->aLayoutInfoD); - - tilingIns_->tiling_.set_BLayoutInfoB(tilingIns_->bLayoutInfoB); - tilingIns_->tiling_.set_BLayoutInfoS(tilingIns_->bLayoutInfoS); - tilingIns_->tiling_.set_BLayoutInfoN(tilingIns_->bLayoutInfoN); - tilingIns_->tiling_.set_BLayoutInfoG(tilingIns_->bLayoutInfoG); - tilingIns_->tiling_.set_BLayoutInfoD(tilingIns_->bLayoutInfoD); - - tilingIns_->tiling_.set_CLayoutInfoB(tilingIns_->cLayoutInfoB); - tilingIns_->tiling_.set_CLayoutInfoS1(tilingIns_->cLayoutInfoS1); - tilingIns_->tiling_.set_CLayoutInfoN(tilingIns_->cLayoutInfoN); - tilingIns_->tiling_.set_CLayoutInfoG(tilingIns_->cLayoutInfoG); - tilingIns_->tiling_.set_CLayoutInfoS2(tilingIns_->cLayoutInfoS2); - tilingIns_->tiling_.set_BatchNum(tilingIns_->batchNum); - return; -} - -void MatmulTilingAlgorithm::UpdateUsedSize() const -{ - int32_t transLength = 0; - GetTransLength(transLength); - int32_t a1LengthCache = 0; - int32_t b1LengthCache = 0; - SetDepthL1CacheUBParams(a1LengthCache, b1LengthCache); - tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse in ub - tilingIns_->tiling_.set_shareMode(0); - int32_t l1Size = 0; - int32_t l0cSize = 0; - int32_t ubSize = 0; - GetUsedSize(l1Size, l0cSize, 
ubSize, a1LengthCache, b1LengthCache); - tilingIns_->tiling_.set_shareL1Size(l1Size); - tilingIns_->tiling_.set_shareL0CSize(l0cSize); - tilingIns_->tiling_.set_shareUbSize(ubSize); -} - -int64_t MatmulTilingAlgorithm::AdjustOuterProductL0Factor(const SingleCoreStatus& singleCoreStatus) const -{ - if (tilingIns_->scheduleType != ScheduleType::OUTER_PRODUCT) { - return 0; - } - // check whether OUTER_PRODUCT is supported - if ((tilingIns_->tiling_.get_baseK() < tilingIns_->tiling_.get_singleCoreK()) && - ((tilingIns_->mmConfigType == 1) || ((tilingIns_->mmConfigType == 0) && - (tilingIns_->batchNum != 0)))) { - TILING_LOG_WARNING("Unsupported scheduleType is OUTER_PRODUCT"); - return -1L; - } - int32_t newBaseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; - int32_t newBaseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; - // when scheduleType is OUTER_PRODUCT, each iteration computes 2 * basicBlock size of data - bool isL0CFullUsed = (newBaseM * newBaseN * NUM_TWO * - static_cast(DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType))) > - static_cast(tilingIns_->bufferPool_.l0CSize) ? true : false; - if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 0)) { - // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_M, N db in L0 - newBaseN = MathUtil::Align(newBaseN / NUM_TWO, C0_SIZE); - } else if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 1)) { - // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_N, M db in L0 - newBaseM = MathUtil::Align(newBaseM / NUM_TWO, C0_SIZE); - } - tilingIns_->tiling_.set_baseM(newBaseM); - tilingIns_->tiling_.set_baseN(newBaseN); - return 0; -} - -void MatmulTilingAlgorithm::AdjustFloatL1Factor(const SingleCoreStatus& singleCoreStatus) const -{ - if (DTYPE_BYTE_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BYTE_TAB.at(DataType::DT_FLOAT)) { - if (tilingIns_->tiling_.get_baseK() == DT_FLOAT_INVALID_BASEK) { - tilingIns_->tiling_.set_stepKb(1); - tilingIns_->tiling_.set_depthB1(singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1); - } - } -} - -int64_t MatmulTilingAlgorithm::UpdateTiling(const MatmulRunParas& param, const CoreStatusPack &coreStatus, SingleCoreStatus& singleCoreStatus) const -{ - int32_t coreUse = singelBlockDim_ ? 
tilingIns_->blockDim : - coreStatus.batchDim * coreStatus.mDim * coreStatus.kDim * coreStatus.nDim; - int32_t singleCoreM; - int32_t singleCoreN; - int32_t singleCoreK; - GetSingleShape(coreStatus, param, singleCoreM, singleCoreN, singleCoreK); - if (!CheckSingleShape(singleCoreM, singleCoreN, singleCoreK)) { - return -1L; - } - tilingIns_->tiling_.set_usedCoreNum(coreUse); - tilingIns_->tiling_.set_singleCoreM(singleCoreM); - tilingIns_->tiling_.set_singleCoreN(singleCoreN); - tilingIns_->tiling_.set_singleCoreK(singleCoreK); - UpdateShapeAndLayout(); - tilingIns_->tiling_.set_baseM(singleCoreStatus.l0Status.mL0 * C0_SIZE); - tilingIns_->tiling_.set_baseN(singleCoreStatus.l0Status.nL0 * C0_SIZE); - const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); - tilingIns_->tiling_.set_iterateOrder(GetIteratorOrder(singleCoreStatus, singleCoreM, singleCoreN, singleCoreK)); - // check whether OUTER_PRODUCT is supported - if (AdjustOuterProductL0Factor(singleCoreStatus) != 0) { - return -1L; - } - tilingIns_->baseM = tilingIns_->tiling_.get_baseM(); - tilingIns_->baseN = tilingIns_->tiling_.get_baseN(); - tilingIns_->baseK = tilingIns_->tiling_.get_baseK(); - tilingIns_->tiling_.set_depthA1( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); - tilingIns_->tiling_.set_depthB1(UpdateDepthB1(singleCoreStatus)); - // if decrease depthB1, nBL1 must decrease to ensure nBL1 is less then depthB1 - singleCoreStatus.l1Status.nBL1 = min(singleCoreStatus.l1Status.nBL1, tilingIns_->tiling_.get_depthB1()); - tilingIns_->tiling_.set_stepM(singleCoreStatus.l1Status.mAL1); - tilingIns_->tiling_.set_stepN(singleCoreStatus.l1Status.nBL1); - tilingIns_->tiling_.set_stepKa( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0)); - tilingIns_->tiling_.set_stepKb( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0)); - AdjustFloatL1Factor(singleCoreStatus); - tilingIns_->tiling_.set_isBias(tilingIns_->isBias ? 
1 : 0); - tilingIns_->tiling_.set_dbL0A(singleCoreStatus.l0Status.dbL0A); - tilingIns_->tiling_.set_dbL0B(singleCoreStatus.l0Status.dbL0B); - tilingIns_->tiling_.set_dbL0C(singleCoreStatus.l0Status.dbL0C); - UpdateUsedSize(); - return 0; -} - -bool MatmulTilingAlgorithm::DoMultiCoreSplitMNTiling(const MatmulRunParas& params, CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes) -{ - if (GetMultiCoreScenario(params) != MultiCoreScenario::SPLIT_MN) { - return false; - } - ComputeBaseBlock baseBlock = GetMultiCoreBasicBlock(params); // calc basic block - CalcMultiCoreBlockDims(params, baseBlock, coreStatus, blockDimRes); - SingleCoreStatus singleCoreStatus; - singleCoreStatus.l0Status = GetL0CoreStatus(baseBlock); - AdjustSparseL0Factors(singleCoreStatus); - int32_t depthA1; - int32_t depthB1; - int32_t stepKa; - int32_t stepKb; - CalcL1Tiling(baseBlock, depthA1, depthB1, stepKa, stepKb); - singleCoreStatus.l1Status = GetL1CoreStatus(baseBlock, depthA1, depthB1, stepKa, stepKb); - (void)UpdateTiling(params, coreStatus, singleCoreStatus); - return true; -} - -bool MatmulTilingAlgorithm::NeedOutputAlign(int32_t m, int32_t n, int32_t k) const -{ - int32_t aTypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - int32_t bTypeSize = DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); - int32_t cTypeSize = DTYPE_BIT_TAB.at(tilingIns_->cType_.dataType); - constexpr static int32_t outputRatio = 2; - bool needAlign = static_cast(n * m) * static_cast(outputRatio * cTypeSize) > - static_cast(n * k* aTypeSize) + static_cast(m * k * bTypeSize); - return needAlign; -} - -void MatmulTilingAlgorithm::CalcMultiCoreBlockDims(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, - CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes) -{ - auto factors = MathUtil::GetFactorPairs(numOfBlock_); - std::vector results; - for (const auto& factor : factors) { - results.push_back(CalcComputeIntensity(params, baseBlock, factor)); - } - // 排序结果 - std::sort(results.begin(), results.end()); - for (auto v : results) { - TILING_LOG_DEBUG("intent:%f, cycle: %f, band: %f, mDim: %d, nDim: %d\n", - v.avgIntensity, v.computeCycle, v.bandRatio, v.dimFactor.first, v.dimFactor.second); - } - coreStatus.batchDim = 1; - blockDimRes.nDimFactor = results[0].dimFactor.second; - blockDimRes.mDimFactor = results[0].dimFactor.first; - blockDimRes.kDimFactor = 1; - coreStatus.mDim = results[0].dimFactor.first; - coreStatus.nDim = results[0].dimFactor.second; - coreStatus.kDim = 1; - const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped, blockDimRes.nDimFactor, false); - const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false); - int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; - bool needOutputAlign = NeedOutputAlign(m, n, GetSingleK()); - (void)AlignSingleShape((!tilingIns_->bType_.isTrans || needOutputAlign), n, coreStatus.nDim, bAlignSize, coreStatus.n); - (void)AlignSingleShape(tilingIns_->aType_.isTrans, m, coreStatus.mDim, aAlignSize, coreStatus.m); - blockDimRes.kNum = params.k32 / coreStatus.kDim * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16 - blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 - coreStatus.batch = params.batch32; - coreStatus.k = params.k32 / coreStatus.kDim; - TILING_LOG_DEBUG("CalcMultiCoreBlockDims, coreStatus m: %d n: %d k: 
%d", coreStatus.m, coreStatus.n, coreStatus.k); - // load size of A matrix is batch * m - // load size of B matrix is n - DimFactor blockDim(1, blockDimRes.mDimFactor, blockDimRes.kDimFactor, blockDimRes.nDimFactor); - GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); - return; -} - -void MatmulTilingAlgorithm::UpdateMultiCore(const std::string& opType, const MatmulRunParas& params, - CoreStatusPack& coreStatus, const BlockDimCalculator& blockDimRes) const -{ - (void)(opType); - // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. - coreStatus.batchDim = min(MathUtil::CeilDivision(params.batch32, coreStatus.batch), numOfBlock_); - coreStatus.nDim = min(MathUtil::CeilDivision(params.n32, coreStatus.n), numOfBlock_); - coreStatus.mDim = min(MathUtil::CeilDivision(params.m32, coreStatus.m), numOfBlock_); - - if (tilingIns_->enableSplitK_) { - coreStatus.kDim = min(MathUtil::CeilDivision(params.k32, coreStatus.k), numOfBlock_); - } else { - coreStatus.kDim = blockDimRes.kDimFactor; - } - UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, - coreStatus); -} - -void MatmulTilingAlgorithm::UpdateBufferSize(const TilingPolicy policy, const CoreStatusPack& coreStatus) const -{ - if (policy == TilingPolicy::NO_POLICY) { - return; - } else if (policy == TilingPolicy::FIXED_B_TSCM) { - const int32_t bMatrixSize = - MathUtil::Align(coreStatus.n, MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE)) * coreStatus.k * - C0_SIZE * C0_BYTE_SIZE * 2; - tilingIns_->bufferPool_.l1Size -= bMatrixSize; - } else if (policy == TilingPolicy::FIXED_A_TSCM) { - const int32_t aMatrixSize = coreStatus.m * coreStatus.k * C0_SIZE * C0_BYTE_SIZE * 2; - tilingIns_->bufferPool_.l1Size -= aMatrixSize; - } else { - return; - } -} - -bool MatmulTilingAlgorithm::IsInvalidFactor(int32_t factor) const -{ - return factor > numOfBlock_ || factor <= 0; -} - -void MatmulTilingAlgorithm::AddOptimalFactors(const std::string& opType, const MatmulRunParas& params, - BlockDimCalculator& blockDimRes) const -{ - (void)(opType); - const int32_t coreNum = numOfBlock_; - // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) - const int32_t mnCore = MathUtil::CeilDivision(coreNum, params.batch32); - if (mnCore > 1) { - const float optPoint = static_cast(sqrt((params.m32 + 0.0f) / params.n32 * mnCore)); - const int32_t mdim = static_cast(ceil(optPoint)); - const int32_t ndim = static_cast(ceil(mnCore / optPoint)); - MathUtil::AddFactor(blockDimRes.mDimFactors, mdim); - MathUtil::AddFactor(blockDimRes.mDimFactors, ndim == 0 ? 1 : mnCore / ndim); - MathUtil::AddFactor(blockDimRes.nDimFactors, ndim); - MathUtil::AddFactor(blockDimRes.nDimFactors, mdim == 0 ? 
1 : mnCore / mdim); - } -} - -void MatmulTilingAlgorithm::GenBlockDimsMapFactors(const std::string& opType, MatmulRunParas& params, - BlockDimCalculator& blockDimRes) const -{ - const int32_t coreNum = numOfBlock_; - blockDimRes.batchDimFactors.reserve(coreNum); - blockDimRes.mDimFactors.reserve(coreNum); - blockDimRes.nDimFactors.reserve(coreNum); - blockDimRes.kDimFactors.reserve(coreNum); - MathUtil::GetBlockFactors(blockDimRes.batchDimFactors, params.batch32, params.batchMapped, coreNum, - min(coreNum, params.batch32)); - MathUtil::GetBlockFactors(blockDimRes.mDimFactors, params.m32, params.mMapped, coreNum, min(coreNum, params.m32)); - MathUtil::GetBlockFactors(blockDimRes.nDimFactors, params.n32, params.nMapped, coreNum, min(coreNum, params.n32)); - // first get kDim candidate - if (!tilingIns_->enableSplitK_) { - blockDimRes.kDimFactors.push_back(1); - params.kMapped = params.k32; - } else { - MathUtil::GetBlockFactors(blockDimRes.kDimFactors, params.k32, params.kMapped, coreNum, coreNum); - } - AddOptimalFactors(opType, params, blockDimRes); -} - -void MatmulTilingAlgorithm::GetBlockDim(const std::string& opType, MatmulRunParas& params, CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes) -{ - // get batchDim, kDim, mDim and nDim for single core - // support multi cores slicing along kDim - // single core batchDim, mDim, nDim, kDim is a factor of input batch, m, n, k - // multi-core strategy for mini shape's is different from other situations and requires preprocess - if (PreProcessMiniShape(opType, coreStatus, params, numOfBlock_, tilingIns_->enableSplitK_)) { - // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. - coreStatus.batchDim = MathUtil::CeilDivision(params.batch32, coreStatus.batch); - coreStatus.nDim = MathUtil::CeilDivision(params.n32, coreStatus.n); - coreStatus.mDim = MathUtil::CeilDivision(params.m32, coreStatus.m); - coreStatus.kDim = MathUtil::CeilDivision(params.k32, coreStatus.k); - UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? 
TilingPolicy::FIXED_B_TSCM : - TilingPolicy::NO_POLICY, - coreStatus); - splitCoreFlag_ = true; - return; - } - GenBlockDimsMapFactors(opType, params, blockDimRes); - for (const int32_t bFactor : blockDimRes.batchDimFactors) { - for (const int32_t nFactor : blockDimRes.nDimFactors) { - if (IsInvalidFactor(bFactor * nFactor)) { - continue; - } - for (const int32_t mFactor : blockDimRes.mDimFactors) { - if (IsInvalidFactor(bFactor * nFactor * mFactor)) { - continue; - } - for (const int32_t kFactor : blockDimRes.kDimFactors) { - if (IsInvalidFactor(bFactor * nFactor * mFactor * kFactor)) { - continue; - } - DimFactor blockDim(bFactor, mFactor, kFactor, nFactor); - GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); - } - } - } - } - - coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDimRes.batchDimFactor); - coreStatus.n = MathUtil::CeilDivision(params.n32, blockDimRes.nDimFactor); - coreStatus.m = MathUtil::CeilDivision(params.m32, blockDimRes.mDimFactor); - coreStatus.k = MathUtil::CeilDivision(params.k32, blockDimRes.kDimFactor); - if (g_tempCfg.factorSplit) { - const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped, blockDimRes.nDimFactor, false); - const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false); - const int32_t k = MathUtil::FindBestSingleCore(params.k32, params.kMapped, blockDimRes.kDimFactor, true); - const int32_t needCoreNum = static_cast(MathUtil::CeilDivision(params.batch32, coreStatus.batch) * - MathUtil::CeilDivision(params.n32, n) * - MathUtil::CeilDivision(params.m32, m) * - MathUtil::CeilDivision(params.k32, k)); - if (IsInvalidFactor(needCoreNum) == false) { - coreStatus.n = n; - coreStatus.m = m; - coreStatus.k = k; - } - } - - params.nonFactorK = params.k32 == params.kMapped ? false : true; - UpdateMultiCore(opType, params, coreStatus, blockDimRes); -} - -void MatmulTilingAlgorithm::NonFactorMap(const std::string& opType, MatmulRunParas& param, - BlockDimCalculator& blockDimRes) const -{ - (void)(opType); - param.batchMapped = param.batch32; - param.mMapped = param.m32; - param.kMapped = param.k32; - param.nMapped = param.n32; - // Split k will introduce atomic_add which can't be used with shift_inwards. - // Thus in split k mode, batch/m/n/ can't use non-factorial segmentation. 
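-    // Illustrative sketch of the mapping below (hypothetical sizes, not a measured case):
-    // with split-K enabled, only kMapped may be relaxed to a non-factor value, e.g. k32 = 100
-    // could be down-aligned to the power of two 64 if the factor-count check triggers, while
-    // batch/m/n keep factor-only splitting; with split-K disabled, batch/m/n are the ones
-    // remapped and k32 is left unchanged.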
- if (tilingIns_->enableSplitK_) { - // it is only necessary to consider the non-factor splitting of k when splitKFlag is true - int32_t kFactorLess64Cnt = 0; - int32_t kFactorLess1024Cnt = 0; - MathUtil::GetFactorCnt(param.k32, kFactorLess64Cnt, 1, L0_FACTOR_LIMIT); - MathUtil::GetFactorCnt(param.k32, kFactorLess1024Cnt, L0_FACTOR_LIMIT + 1, L1_FACTOR_LIMIT); - if ((param.k32 > L0_FACTOR_LIMIT && kFactorLess64Cnt <= L0_FACTOR_NUM_LIMIT) || - (param.k32 > L1_FACTOR_LIMIT && kFactorLess64Cnt + kFactorLess1024Cnt <= L1_FACTOR_NUM_LIMIT)) { - // Non-factors of the k dimension use a down-aligned number of powers of 2 - param.kMapped = MathUtil::MapShape(param.k32, false); - } - } else { - MathUtil::GetFactorCnt(param.batch32, blockDimRes.batchFactorCnt, 1, numOfBlock_); - if (param.batch32 > 1 && blockDimRes.batchFactorCnt <= L0_FACTOR_NUM_LIMIT) { - param.batchMapped = MathUtil::MapShape(param.batch32); - } - param.mMapped = MathUtil::MapShape(param.m32); - param.nMapped = MathUtil::MapShape(param.n32); - } -} - -void MatmulTilingAlgorithm::FillParam(MatmulRunParas& param) -{ - param.oriShapeM = tilingIns_->orgM; - param.oriShapeN = tilingIns_->orgN; - param.oriShapeKa = tilingIns_->orgKa; - param.oriShapeKb = tilingIns_->orgKb; - int32_t realM = 1; - int32_t realN = 1; - int32_t realK = 1; - - if (tilingIns_->singleCoreM != -1 || tilingIns_->singleCoreK != -1 || tilingIns_->singleCoreN != -1) { - realM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM; - realK = tilingIns_->singleCoreK != -1 ? tilingIns_->singleCoreK : tilingIns_->singleK; - realN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN; - singelBlockDim_ = true; - numOfBlock_ = 1; - } else { - realM = GetSingleM(); - realK = GetSingleK(); - realN = GetSingleN(); - singelBlockDim_ = false; - numOfBlock_ = tilingIns_->blockDim; - } - - const int32_t reduceBlockSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - param.k32 = MathUtil::CeilDivision(realK, reduceBlockSize); - param.m32 = MathUtil::CeilDivision(realM, C0_SIZE); - param.n32 = MathUtil::CeilDivision(realN, C0_SIZE); - param.mMapped = MathUtil::MapShape(param.m32, true); - param.kMapped = MathUtil::MapShape(param.k32, true); - param.nMapped = MathUtil::MapShape(param.n32, true); -} - -bool MatmulTilingAlgorithm::CheckFinaleParams(const CoreStatusPack& coreStatus) const -{ - (void)coreStatus; - const int32_t stepM = tilingIns_->tiling_.get_stepM(); - const int32_t stepN = tilingIns_->tiling_.get_stepN(); - const int32_t depthA1 = tilingIns_->tiling_.get_depthA1(); - const int32_t depthB1 = tilingIns_->tiling_.get_depthB1(); - - const int32_t l1Size = tilingIns_->tiling_.get_shareL1Size(); - const int32_t l0CSize = tilingIns_->tiling_.get_shareL0CSize(); - const int32_t uBSize = tilingIns_->tiling_.get_shareUbSize(); - - if (stepM == 0 || stepN == 0 || depthA1 == 0 || depthB1 == 0) { - TILING_LOG_WARNING("stepM/N depthA1/B1 should greate then zeros"); - return false; - } - - if (stepM > depthA1 || stepN > depthB1) { - TILING_LOG_WARNING("stepM/N should less then depthA1/B1"); - return false; - } - - if (l1Size > tilingIns_->bufferPool_.l1Size || l0CSize > tilingIns_->bufferPool_.l0CSize || - uBSize > tilingIns_->bufferPool_.ubSize) { - TILING_LOG_WARNING("L1/L0C/UB used size should less then L1Size/L0CSize/UbSize"); - return false; - } - - int dateDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - if (tilingIns_->tiling_.get_BatchNum() > 0 && - 
((tilingIns_->tiling_.get_singleCoreM() * tilingIns_->tiling_.get_singleCoreK() + - tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK()) * - tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE + - tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE > - tilingIns_->bufferPool_.l1Size)) { - TILING_LOG_WARNING("a/b matrix size of batch mm should less then L1Size"); - return false; - } - - return true; -} - -void MatmulTilingAlgorithm::AdjustSparseL0Factors(SingleCoreStatus& singleCoreStatus) const -{ - // determine whether the scenario is sparse - if (!tilingIns_->isSparse_) { - TILING_LOG_DEBUG("Not sparse scenario does not need to adjust L0Factors."); - return; - } - - int32_t baseK = - singleCoreStatus.l0Status.kL0 * (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - constexpr int32_t sparseBaseKFac = 64; // baseK need to align to 64 on Sparse - if (baseK <= sparseBaseKFac) { - baseK = sparseBaseKFac; - } else { - baseK = MathUtil::AlignDown(baseK, sparseBaseKFac); - } - singleCoreStatus.l0Status.kL0 = - baseK / (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - - // check L0A/L0B/L0Csize for L0 DB - int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; - int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; - if (baseM * baseK > tilingIns_->bufferPool_.l0ASize / DB_ON) { - singleCoreStatus.l0Status.dbL0A = DB_OFF; - } - - if (baseN * baseK > tilingIns_->bufferPool_.l0BSize / DB_ON) { - singleCoreStatus.l0Status.dbL0B = DB_OFF; - } - - if (baseM * baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { - singleCoreStatus.l0Status.dbL0C = DB_OFF; - } -} - -void MatmulTilingAlgorithm::PreprocessL0DB() -{ - dbL0A_ = g_tempCfg.l0aDB; - dbL0B_ = g_tempCfg.l0bDB; - dbL0C_ = g_tempCfg.l0cDB; - if (tilingIns_->baseM != -1) { - const int32_t baseLeftSize = tilingIns_->baseM * C0_BYTE_SIZE; - if (baseLeftSize > tilingIns_->bufferPool_.l0ASize / DB_ON) { - dbL0A_ = DB_OFF; - } - } - if (tilingIns_->baseN != -1) { - const int32_t baseRightSize = tilingIns_->baseN * C0_BYTE_SIZE; - if (baseRightSize > tilingIns_->bufferPool_.l0BSize / DB_ON) { - dbL0B_ = DB_OFF; - } - } - if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) { - const int32_t baseMatrixSize = tilingIns_->baseM * tilingIns_->baseN * C0_BYTE_SIZE; - if (baseMatrixSize > tilingIns_->bufferPool_.l0CSize / DB_ON) { - dbL0C_ = DB_OFF; - } - } - return; -} - -void MatmulTilingAlgorithm::SetDepthL1CacheUBParams(int32_t &a1LengthCache, int32_t &b1LengthCache) const -{ - if (!tilingIns_->enableL1CacheUB || - tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND310P) { - return; - } - int32_t a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - int32_t b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - a1LengthCache = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); - b1LengthCache = b1Length * tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); - int32_t freeL1Size = tilingIns_->bufferPool_.l1Size - tilingIns_->tiling_.get_depthA1() * a1Length - - tilingIns_->tiling_.get_depthB1() * b1Length; - if (freeL1Size <= 0) { - return; - } - const int32_t splitNum = 2; - int32_t aOrgShapeSize = tilingIns_->tiling_.get_singleCoreM() * 
tilingIns_->tiling_.get_singleCoreK(); - int32_t bOrgShapeSize = tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK(); - - if ((tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) && - (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM)) { - bool aFullLoad = false; - bool bFullLoad = false; - aFullLoad = aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size / splitNum; - bFullLoad = bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size / splitNum; - if (aFullLoad && bFullLoad) { - tilingIns_->tiling_.set_depthAL1CacheUB(1); - tilingIns_->tiling_.set_depthBL1CacheUB(1); - a1LengthCache = aOrgShapeSize; // update - b1LengthCache = bOrgShapeSize; - } else if (aFullLoad) { - tilingIns_->tiling_.set_depthAL1CacheUB(1); - a1LengthCache = aOrgShapeSize; - int32_t depthL1CacheUB = b1LengthCache > 0 ? (freeL1Size - aOrgShapeSize) / b1LengthCache : 0; - tilingIns_->tiling_.set_depthBL1CacheUB(depthL1CacheUB); - } else if (bFullLoad) { - tilingIns_->tiling_.set_depthBL1CacheUB(1); - b1LengthCache = bOrgShapeSize; - int32_t depthL1CacheUB = a1LengthCache > 0 ? (freeL1Size - bOrgShapeSize) / a1LengthCache : 0; - tilingIns_->tiling_.set_depthAL1CacheUB(depthL1CacheUB); - } else { - if (a1LengthCache > freeL1Size) { - int32_t depthBL1CacheUB = b1LengthCache > 0 ? freeL1Size / b1LengthCache : 0; - tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); - } else if (b1LengthCache > freeL1Size) { - int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / a1LengthCache : 0; - tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); - } else if (a1LengthCache <= freeL1Size / splitNum && b1LengthCache <= freeL1Size / splitNum) { - int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / splitNum / a1LengthCache : 0; - int32_t depthBL1CacheUB = b1LengthCache > 0 ? 
freeL1Size / splitNum / b1LengthCache : 0; - tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); - tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); - } else { - // can only cache one matrix - if (a1LengthCache <= b1LengthCache) { - tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); - } else { - tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); - } - } - } - } else if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) { - if (aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size) { - tilingIns_->tiling_.set_depthAL1CacheUB(1); - a1LengthCache = aOrgShapeSize; - } else if (a1LengthCache > 0) { - tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); - } - } else if (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM) { - if (bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size) { - tilingIns_->tiling_.set_depthBL1CacheUB(1); - b1LengthCache = bOrgShapeSize; - } else if (b1LengthCache > 0) { - tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); - } - } else { - return; - } -} - -int MatmulTilingAlgorithm::UpdateDepthB1(const SingleCoreStatus& singleCoreStatus) const -{ - int depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; - // only bType is f32 need update - if (tilingIns_->bType_.dataType != DataType::DT_FLOAT - || tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { - return depthB1; - } - uint16_t alignedBaseK = MathUtil::CeilDivision(tilingIns_->baseK, FP32_ALIGN_SIZE) * FP32_ALIGN_SIZE; - uint16_t alignedBaseKN = alignedBaseK * tilingIns_->baseN; - - uint16_t alignedBaseKM = tilingIns_->baseK * tilingIns_->baseM; - if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) { - alignedBaseKM = alignedBaseK * tilingIns_->baseM; - } - // if L1 size is overflow, decrease depthB1 - if ((tilingIns_->tiling_.get_depthA1() *alignedBaseKM + alignedBaseKN * depthB1) * sizeof(float) - > static_cast(tilingIns_->bufferPool_.l1Size)) { - depthB1 = tilingIns_->baseN * tilingIns_->baseK * depthB1 / alignedBaseKN; - depthB1 = depthB1 < 1 ? 1 : depthB1; - } - return depthB1; -} -int32_t MatmulTilingAlgorithm::GetSingleM() const -{ - return tilingIns_->singleM != -1 ? tilingIns_->singleM : tilingIns_->orgM; -} -int32_t MatmulTilingAlgorithm::GetSingleN() const -{ - return tilingIns_->singleN != -1 ? tilingIns_->singleN : tilingIns_->orgN; -} -int32_t MatmulTilingAlgorithm::GetSingleK() const -{ - return tilingIns_->singleK != -1 ? tilingIns_->singleK : tilingIns_->orgKa; -} -void MatmulTilingAlgorithm::GetSingleShape(const CoreStatusPack &coreStatus, const MatmulRunParas ¶m, - int32_t &singleCoreM, int32_t &singleCoreN, int32_t &singleCoreK) const -{ - singleCoreM = GetSingleM(); - singleCoreM = MathUtil::CeilDivision(singleCoreM, coreStatus.mDim); - singleCoreN = GetSingleN(); - singleCoreN = MathUtil::CeilDivision(singleCoreN, coreStatus.nDim); - singleCoreK = GetSingleK(); - singleCoreK = MathUtil::CeilDivision(singleCoreK, coreStatus.kDim); - if (singelBlockDim_) { - singleCoreM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM; - singleCoreN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN; - singleCoreK = tilingIns_->singleCoreK != -1 ? 
tilingIns_->singleCoreK : tilingIns_->singleK; - } - if (numOfBlock_ > 1) { - int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; - bool needAlign = GetMultiCoreScenario(param) == MultiCoreScenario::SPLIT_MN; - bool needOutputAlign = NeedOutputAlign(singleCoreM, singleCoreN, singleCoreK); - (void)AlignSingleShape(needAlign && (!tilingIns_->bType_.isTrans || needOutputAlign), param.n32 * C0_SIZE, coreStatus.nDim, - bAlignSize, singleCoreN); - (void)AlignSingleShape(needAlign && tilingIns_->aType_.isTrans, param.m32 * C0_SIZE, coreStatus.mDim, - aAlignSize, singleCoreM); - if (tilingIns_->enableSplitK_) { - if (tilingIns_->aType_.dataType == DataType::DT_FLOAT || - tilingIns_->bType_.dataType == DataType::DT_FLOAT) { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * FLOAT32_REDUCE_BLOCK_SIZE; - } else if ((tilingIns_->aType_.dataType == DataType::DT_INT8 || - tilingIns_->bType_.dataType == DataType::DT_INT8)) { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT8_REDUCE_BLOCK_SIZE; - } else if ((tilingIns_->aType_.dataType == DataType::DT_INT4 || - tilingIns_->bType_.dataType == DataType::DT_INT4)) { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT4_REDUCE_BLOCK_SIZE; - } else { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * REDUCE_BLOCK_SIZE; - } - } - } -} - -bool MatmulTilingAlgorithm::CheckSingleShape(int32_t singleCoreM, int32_t singleCoreN, int32_t singleCoreK) const -{ - (void)singleCoreM; - (void)singleCoreK; - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - // ub only can process with 32B aligned, if format is ND, and D non-aligned output can't pad - if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type == CubeFormat::ND && - (singleCoreN * DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType)) % C0_BYTE_SIZE != 0) { - TILING_LOG_INFO("for ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " - "aligned, matrix c not support ND format"); - return false; - } - } - return true; -} - -int64_t MatmulTilingAlgorithm::Process() -{ - PreprocessL0DB(); - if (!CheckBaseMN()) { - TILING_LOG_WARNING("check baseM/baseN not pass"); - return -1; - } - singelBlockDim_ = false; - splitCoreFlag_ = false; - CoreStatusPack coreStatus; - SingleCoreStatus singleCoreStatus; - MatmulRunParas param; - BlockDimCalculator blockDimRes; - FillParam(param); - - std::string opType = "MatMul"; - if (numOfBlock_ != 1) { - NonFactorMap(opType, param, blockDimRes); - if (DoMultiCoreSplitMNTiling(param, coreStatus, blockDimRes)) { - return 0; - } - GetBlockDim(opType, param, coreStatus, blockDimRes); - } else { - if (!g_tempCfg.factorSplit) { - coreStatus.m = param.m32; - coreStatus.k = param.k32; - coreStatus.n = param.n32; - } else { - coreStatus.m = MathUtil::FindBestSingleCore(param.m32, param.mMapped, 1, false); - coreStatus.k = MathUtil::FindBestSingleCore(param.k32, param.kMapped, 1, false); - coreStatus.n = MathUtil::FindBestSingleCore(param.n32, param.nMapped, 1, false); - } - coreStatus.batchDim = 1; - coreStatus.mDim = 1; - coreStatus.kDim = 1; - coreStatus.nDim = 1; - } - - if (numOfBlock_ != 1 && tilingIns_->bType_.pos == TPosition::TSCM) { - if (!splitCoreFlag_) { - TILING_LOG_WARNING("Multi core split B TSCM full loaded 
is not sucess."); - return 1; - } - } - // single-core logic - GetL0Factors(opType, param, coreStatus, singleCoreStatus); - AdjustSparseL0Factors(singleCoreStatus); - if (singleCoreStatus.l0Status.mL0 == 0 || singleCoreStatus.l0Status.nL0 == 0 || - singleCoreStatus.l0Status.kL0 == 0) { - TILING_LOG_WARNING("ml0/nl0/kl0 is zero"); - return -1; - } - GetL1Factors(opType, param, coreStatus, singleCoreStatus.l0Status, singleCoreStatus.l1Status); - - if (UpdateTiling(param, coreStatus, singleCoreStatus) == -1L) { - return -1L; - } - - const bool ans = CheckFinaleParams(coreStatus); - return ans ? 0 : -1; -} -} // namespace matmul_tiling -- Gitee From ae700ade1ab94a790169f1a4a634c1c94b6977c1 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 02:18:55 +0000 Subject: [PATCH 49/56] =?UTF-8?q?=E9=87=8D=E5=91=BD=E5=90=8D=20impl/matmul?= =?UTF-8?q?/tiling/matmul=5Ftiling=5Falgorithm=5Fnew.cpp=20=E7=82=BA=20imp?= =?UTF-8?q?l/matmul/tiling/matmul=5Ftiling=5Falgorithm.cpp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...atmul_tiling_algorithm_new.cpp => matmul_tiling_algorithm.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename impl/matmul/tiling/{matmul_tiling_algorithm_new.cpp => matmul_tiling_algorithm.cpp} (100%) diff --git a/impl/matmul/tiling/matmul_tiling_algorithm_new.cpp b/impl/matmul/tiling/matmul_tiling_algorithm.cpp similarity index 100% rename from impl/matmul/tiling/matmul_tiling_algorithm_new.cpp rename to impl/matmul/tiling/matmul_tiling_algorithm.cpp -- Gitee From 10a2f92f0377a2ab5adc19dd0bc566bc6193e5a3 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 08:36:48 +0000 Subject: [PATCH 50/56] add Signed-off-by: jiangchengcheng-on --- .../matmul/tiling/matmul_tiling_algorithm.cpp | 1654 ++++++++++++++++- 1 file changed, 1653 insertions(+), 1 deletion(-) diff --git a/impl/matmul/tiling/matmul_tiling_algorithm.cpp b/impl/matmul/tiling/matmul_tiling_algorithm.cpp index 70db7cfc..378b5d9d 100644 --- a/impl/matmul/tiling/matmul_tiling_algorithm.cpp +++ b/impl/matmul/tiling/matmul_tiling_algorithm.cpp @@ -1553,4 +1553,1656 @@ void MatmulTilingAlgorithm::GetBankConflictSize(const L1StatusPack& l1Status, co if (isBankConflict) { length = length + bankConflictSize; } -} \ No newline at end of file +} + +void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) const +{ + constexpr int blockSize = 32; + constexpr int bankLen = 512; + bool isBankConflict = false; + int bankConflictSize = 0; + if (isAMatrix) { + if (tilingIns_->aType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepM() * tilingIns_->tiling_.get_baseM(), C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKa() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseM() * C0_SIZE * tilingIns_->tiling_.get_stepM() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } + } else { + if (tilingIns_->bType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * + blockSize % bankLen == + 0 ? 
+ true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseN() * C0_SIZE * tilingIns_->tiling_.get_stepN() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(tilingIns_->tiling_.get_stepN() * tilingIns_->tiling_.get_baseN(), C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKb() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } + } + if (isBankConflict) { + length = length + bankConflictSize; + } +} + +int32_t MatmulTilingAlgorithm::GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const +{ + int32_t a1Length = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (IsUbNd2Nz()) { + // A matrix ND2NZ + if (tilingIns_->aType_.type == CubeFormat::ND) { + a1Length = l0Status.mL0 * C0_SIZE * l0Status.kL0 * reduceSize * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + a1Length = a1Length * MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l1Status.mAL1; + } + // bank conflict + GetBankConflictSize(l1Status, l0Status, a1Length, true); + } + } + return a1Length; +} + +int32_t MatmulTilingAlgorithm::GetBL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const +{ + int32_t b1Length = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (IsUbNd2Nz()) { + // B matrix ND2NZ + if (tilingIns_->bType_.type == CubeFormat::ND) { + b1Length = l0Status.nL0 * C0_SIZE * l0Status.kL0 * reduceSize * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + b1Length = b1Length * MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l1Status.nBL1; + } + // bank conflict + GetBankConflictSize(l1Status, l0Status, b1Length, false); + } + } + return b1Length; +} + +bool MatmulTilingAlgorithm::IsUbNd2Nz() const +{ + if (tilingIns_->enVecND2NZ && tilingIns_->mmConfigType == 1 && + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { + return true; + } + return false; +} + +void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const +{ + int32_t a1Length = 0; + int32_t b1Length = 0; + int32_t c1Length = 0; + int32_t biasLength = 0; + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { + // A matrix ND2NZ + if (tilingIns_->aType_.type == CubeFormat::ND) { + a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + a1Length = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); + } + // bank conflict + GetBankConflictSize(a1Length, true); + } + // B matrix ND2NZ + if (tilingIns_->bType_.type == CubeFormat::ND + || (DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) && + tilingIns_->bType_.type == CubeFormat::NZ && tilingIns_->bType_.isTrans == false)) { + b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + b1Length = b1Length * 
tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); + } + // bank conflict + GetBankConflictSize(b1Length, false); + } + // C matrix NZ2ND + if (tilingIns_->cType_.type == CubeFormat::ND || tilingIns_->cType_.pos == TPosition::GM) { + c1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseM() * + DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType); + } + // Bias + if (tilingIns_->isBias && tilingIns_->biasType_.pos != TPosition::VECCALC) { + biasLength = tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); + } + // quant tensor + if (DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8)) { + int32_t quantLength = tilingIns_->tiling_.get_baseN() * sizeof(uint64_t); + biasLength = max(quantLength, biasLength); + } + } + + transLength = max(max(a1Length, b1Length), max(c1Length, biasLength)); +} + +bool MatmulTilingAlgorithm::CheckBaseMN() const +{ + // check bias table + if ((tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) && + tilingIns_->isBias && (tilingIns_->baseN > MAX_BIAS_N * C0_SIZE) && tilingIns_->isSupportL0c2Out) { + return false; + } + if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) { + return (tilingIns_->baseM * tilingIns_->baseN * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && + tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize && + tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize); + } + if (tilingIns_->baseM != -1) { + return (tilingIns_->baseM * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && + tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize); + } + if (tilingIns_->baseN != -1) { + return (tilingIns_->baseN * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && + tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize); + } + return true; +} + +int32_t MatmulTilingAlgorithm::GetIteratorOrder(const SingleCoreStatus& singleCoreStatus, const int32_t singleCoreM, + const int32_t singleCoreN, const int32_t singleCoreK) const +{ + if (tilingIns_->traverse_ != MatrixTraverse::NOSET) { + return static_cast(tilingIns_->traverse_) - 1; + } + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + const bool fullkAL1Load = + (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kAL1 * reduceSize)) > 1.0 ? false : true; + bool fullkBL1Load = + (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kBL1 * reduceSize)) > 1.0 ? false : true; + + // if KAL1 and KBL1 both can not be full loaded, then select m or n which is no matter + if (!fullkAL1Load && !fullkBL1Load) { + return static_cast(MatrixTraverse::FIRSTM) - 1; + } else if (fullkAL1Load && !fullkBL1Load) { // if KAL1 is full loaded, then select the order N fist + return static_cast(MatrixTraverse::FIRSTN) - 1; + } else if (!fullkAL1Load && fullkBL1Load) { // if KBL1 is full loaded, then select the order M fist + return static_cast(MatrixTraverse::FIRSTM) - 1; + } else { + // if AL1LoadSize less then BL1LoadSize, then select order N first, vice versa. 
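+        // Illustrative arithmetic only (hypothetical values, not taken from a real tiling):
+        // with singleCoreM = 512, singleCoreN = 1024, mAL1 * mL0 * C0_SIZE = 128 and
+        // nBL1 * nL0 * C0_SIZE = 256, mLoop = ceil(512 / 128) = 4 and nLoop = ceil(1024 / 256) = 4,
+        // so aL1LoadSize = 512 + 1024 * 4 = 4608 and bL1LoadSize = 1024 + 512 * 4 = 3072;
+        // since aL1LoadSize >= bL1LoadSize, the order-M-first path (return 0) is chosen.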
+ const int32_t mLoop = MathUtil::CeilDivision(singleCoreM, + singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l0Status.mL0 * C0_SIZE); + const int32_t nLoop = MathUtil::CeilDivision(singleCoreN, + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l0Status.nL0 * C0_SIZE); + const int32_t aL1LoadSize = singleCoreM + singleCoreN * mLoop; + const int32_t bL1LoadSize = singleCoreN + singleCoreM * nLoop; + return aL1LoadSize < bL1LoadSize ? 1 : 0; + } +} + +void MatmulTilingAlgorithm::UpdateBlockDimCalculator(BlockDimCalculator& blockDimRes) const +{ + if (blockDimRes.totalLoadSize > blockDimRes.tmpLoadSize) { + blockDimRes.bmatSize = blockDimRes.tmpBmatSize; + blockDimRes.amatSize = blockDimRes.tmpAmatSize; + blockDimRes.totalLoadSize = blockDimRes.tmpLoadSize; + blockDimRes.tmpValue = 0; + } +} + +void MatmulTilingAlgorithm::CalcLoadSize(const DimFactor& blockDims, const CoreStatusPack& coreStatus, + BlockDimCalculator& blockDimRes, const MatmulRunParas& params) const +{ + blockDimRes.totalLoadSize = INT_MAX; + // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) + const int32_t totalSize = blockDimRes.amatSize + blockDimRes.bmatSize; // batch==1 + constexpr int32_t minMNSize = 16; + constexpr int32_t minKSize = 64; + constexpr int32_t minTotalSize = 128; + const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 + const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); + const int32_t k0 = (m0 != 0 && n0 != 0) ? + min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; + const int32_t dbBuffer = 2; + + // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) + // loadsize = K*(N*mdim+M*ndim) + const bool bothFullLoad = static_cast(totalSize) * static_cast(blockDimRes.kBytes) <= + static_cast(tilingIns_->bufferPool_.l1Size); + const bool afulloadPlsBKFullLoad = + static_cast(blockDimRes.amatSize + n0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= + static_cast(tilingIns_->bufferPool_.l1Size); + const bool bfulloadPlsaKFullLoad = + static_cast(blockDimRes.bmatSize + m0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= + static_cast(tilingIns_->bufferPool_.l1Size); + if (afulloadPlsBKFullLoad || bfulloadPlsaKFullLoad || bothFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + return; + } + + // A kdim not fullload + B kdim not fullload(9) + // loadsize = M*K*N*(1/m0+1/n0) + const bool aKNotfulloadPlsbKNotFullLoad = + (n0 * blockDimRes.kBytes + m0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > + tilingIns_->bufferPool_.l1Size && + (m0 * blockDimRes.kBytes + n0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > + tilingIns_->bufferPool_.l1Size; + if (aKNotfulloadPlsbKNotFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + return; + } + + // A kdim fullload + B kdim fullload(5) + // M*K*(ndim+N/m1) or N*K*(mdim+M/n1) + const bool aKfulloadPlsbKFullLoad = (m0 + n0) * blockDimRes.kBytes * dbBuffer <= tilingIns_->bufferPool_.l1Size; + if (aKfulloadPlsbKFullLoad) { + const int32_t m1 = 
MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - n0 * + blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * m0)) * m0; + const int32_t n1 = MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - m0 * + blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * n0)) * n0; + const int32_t mfirstLoad = + blockDimRes.oriAmatSize * blockDims.n + blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); + int32_t nfirstLoad = + blockDimRes.oriBmatSize * blockDims.m + blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); + if (mfirstLoad < nfirstLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); + } else { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; + } + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + return; + } + + // A fullload + B Kdim not fullload or A K fullload + B Kdim not fullload(3/6) + // mdim = coreNum; ndim = 1; + // loadsize = M*K*(ndim+N/m0) + const bool afulloadPlsbKNotFullLoad = (blockDimRes.amatSize * blockDimRes.kBytes + + n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + const bool aKfulloadPlsbKNotFullLoad = (m0 * blockDimRes.kBytes * dbBuffer + + n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + if (afulloadPlsbKNotFullLoad || aKfulloadPlsbKNotFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; + blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + } + + // A kdim not fullload + B fullload or A kdim not fullload + B kdim fullload(7/8) + // loadsize = N*K*(mdim+M/n0) + const bool aKNotfulloadPlsbFullLoad = (blockDimRes.bmatSize * blockDimRes.kBytes + + m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + const bool aKNotfulloadPlsbKFullLoad = (n0 * blockDimRes.kBytes * dbBuffer + + m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; + if (aKNotfulloadPlsbFullLoad || aKNotfulloadPlsbKFullLoad) { + blockDimRes.tmpAmatSize = blockDimRes.oriBmatSize * blockDims.m; + blockDimRes.tmpBmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); + blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; + UpdateBlockDimCalculator(blockDimRes); + } +} + +int32_t MatmulTilingAlgorithm::LoopNumFromSingleCoreToL0(const CoreStatusPack& coreStatus, + const DimFactor& blockDimsFactor) const +{ + if (!blockDimsFactor.IsValid()) { + return 0; + } + constexpr int32_t minTotalSize = 128; + constexpr int32_t minSize = 64; + constexpr int32_t minN0Size = 16; + int32_t n0 = min(min(minN0Size, coreStatus.n), minSize); + int32_t m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); + n0 = (m0 == 0) ? 0 : min(min(coreStatus.n, minTotalSize / m0), minSize); + m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); + const int32_t k0 = (m0 != 0 && n0 != 0) ? 
+ min(min(minSize / m0, minSize / n0), coreStatus.k) : coreStatus.k; + const int32_t loopNum = MathUtil::CeilDivision(coreStatus.m, m0) * MathUtil::CeilDivision(coreStatus.n, n0) * + MathUtil::CeilDivision(coreStatus.k, k0); + return loopNum; +} + +int32_t MatmulTilingAlgorithm::GetBigPackageCondition(const CoreStatusPack &coreStatus, + const BlockDimCalculator &blockDimRes, const MatmulRunParas ¶ms) const +{ + if (tilingIns_->bType_.isTrans == true && tilingIns_->aType_.isTrans == false) { + return ATTACH_FLAG_ZERO; + } + const int minSize = 16; + bool flag = true; + if (tilingIns_->bType_.isTrans == false) { + if (params.n32 >= minSize && coreStatus.n < minSize) { + flag = false; + } + } + if (tilingIns_->aType_.isTrans) { + if (params.m32 >= minSize && coreStatus.m < minSize) { + flag = false; + } + } + + if (!blockDimRes.bigPackage && !flag) { + return ATTACH_FLAG_ZERO; + } else if (!blockDimRes.bigPackage && flag) { + return ATTACH_FLAG_TWO; + } else if (blockDimRes.bigPackage && !flag) { + return ATTACH_FLAG_ONE; + } else { + return ATTACH_FLAG_ZERO; + } +} + +void MatmulTilingAlgorithm::GetBlockDimHelper(const DimFactor& blockDim, CoreStatusPack& coreStatus, + BlockDimCalculator& blockDimRes, const MatmulRunParas& params) +{ + blockDimRes.kNum = (blockDim.k == 0) ? 0 : (params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE); // contain k * 16 + blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 + coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDim.batch); + coreStatus.m = MathUtil::CeilDivision(params.m32, blockDim.m); + coreStatus.n = MathUtil::CeilDivision(params.n32, blockDim.n); + coreStatus.k = (blockDim.k == 0) ? 0 : (params.k32 / blockDim.k); + if (tilingIns_->enableSplitK_) { + if (params.kMapped != params.k32) { // need check--splitK + blockDimRes.kNum = params.kMapped / blockDim.k * NUM_TWO * C0_SIZE * REDUCE_BLOCK_SIZE; + coreStatus.k = params.kMapped / blockDim.k * NUM_TWO; + } + } + + // load size of A matrix is batch * m + // load size of B matrix is n + blockDimRes.oriAmatSize = params.batch32 * params.m32; + blockDimRes.oriBmatSize = params.oriShapeBbatch > 1 ? params.batch32 * params.n32 : params.n32; + blockDimRes.amatSize = coreStatus.batch * coreStatus.m; + blockDimRes.bmatSize = params.oriShapeBbatch > 1 ? coreStatus.batch * coreStatus.n : coreStatus.n; + blockDimRes.tmpValue = 0; + CalcLoadSize(blockDim, coreStatus, blockDimRes, params); + if (tilingIns_->enableSplitK_) { + blockDimRes.totalLoadSize *= coreStatus.k; + } + + // updateSolution: bool whether update to a new block factor solution + // has smaller LoadSize or the same LoadSize but batch + const int bigpackageFlag = GetBigPackageCondition(coreStatus, blockDimRes, params); + const bool updateConditionBp = bigpackageFlag == 0 ? false : true; + bool updateConditionBp2 = bigpackageFlag == 2 ? true : false; + bool updateConditionBp3 = bigpackageFlag == 1 ? false : true; + + const int32_t loopNum = LoopNumFromSingleCoreToL0(coreStatus, blockDim); + const bool updateConditionCoreUsed = (!updateConditionBp) && ((loopNum < blockDimRes.loopNumToL0) || + (blockDim.ReduceMul() > blockDimRes.coreUse && loopNum == blockDimRes.loopNumToL0)); + const bool updateConditionLoadsize = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse) && + blockDimRes.totalLoadSize < blockDimRes.minLoadSize; + const int32_t orgBatchM = params.oriShapeAbatch > 1 ? blockDimRes.batchDimFactor : blockDimRes.mDimFactor; + const int32_t curBatchM = params.oriShapeAbatch > 1 ? 
blockDim.batch : blockDim.m; + const bool updateConditionBatchNDim = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse && + blockDimRes.totalLoadSize == blockDimRes.minLoadSize) && + ((blockDimRes.nDimFactor * orgBatchM < curBatchM * blockDim.n) || + (blockDimRes.nDimFactor * orgBatchM == curBatchM * blockDim.n && + blockDimRes.batchDimFactor < blockDim.batch)); + + const bool policyCondition = + UserPolicy(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, + coreStatus, blockDimRes); + if ((updateConditionBp2 || updateConditionCoreUsed || updateConditionLoadsize || updateConditionBatchNDim) && + policyCondition && updateConditionBp3) { + blockDimRes.minLoadSize = blockDimRes.totalLoadSize; + blockDimRes.nDimFactor = blockDim.n; + blockDimRes.batchDimFactor = blockDim.batch; + blockDimRes.mDimFactor = blockDim.m; + blockDimRes.kDimFactor = blockDim.k; + blockDimRes.coreUse = blockDim.ReduceMul(); + blockDimRes.loopNumToL0 = loopNum; + blockDimRes.finalValue = blockDimRes.tmpValue; + const int32_t minSize = 16; + blockDimRes.bigPackage = (!tilingIns_->bType_.isTrans ? coreStatus.n >= minSize : true) && + (tilingIns_->aType_.isTrans ? coreStatus.m >= minSize : true) && (blockDim.n * blockDim.m * blockDim.k > 1); + splitCoreFlag_ = true; + } +} + +bool MatmulTilingAlgorithm::UserPolicy(const TilingPolicy policy, const CoreStatusPack& coreStatus, + const BlockDimCalculator& blockDimRes) const +{ + constexpr int32_t minMNSize = 16; + constexpr int32_t minKSize = 64; + constexpr int32_t minTotalSize = 128; + const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 + const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); + const int32_t k0 = (m0 != 0 && n0 != 0) ? 
min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; + + if (policy == TilingPolicy::FIXED_B_TSCM) { + const int32_t alignFactor = MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE); + if (coreStatus.n < alignFactor) { + return false; + } + const int32_t alignNLength = MathUtil::Align(coreStatus.n, alignFactor); + const int32_t bMatrixSize = alignNLength * blockDimRes.kBytes * 2; + int32_t aMatrixSize = m0 * k0 * C0_SIZE * C0_BYTE_SIZE; + int32_t biasSize = 0; + if (tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { + biasSize = alignNLength * C0_SIZE * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); + } + if (bMatrixSize + aMatrixSize + biasSize <= tilingIns_->bufferPool_.l1Size) { + return true; + } else { + return false; + } + } else if (policy == TilingPolicy::FIXED_A_TSCM) { + return false; + } else if (policy == TilingPolicy::FIXED_A_B_TSCM) { + return false; + } else { + return true; + } +} + +bool MatmulTilingAlgorithm::PreProcessMiniShape(const std::string& opType, CoreStatusPack& coreStatus, + MatmulRunParas& params, const int32_t& coreNum, bool splitKFlag) const +{ + (void)(opType); + // experience value for mini shape + const int32_t miniL0cThreshold = tilingIns_->bufferPool_.l0CSize / MIN_FRACTAL_SIZE / FP32_BYTES; + const int32_t miniL0abThreshold = tilingIns_->bufferPool_.l0ASize / (C0_SIZE * C0_BYTE_SIZE); + // tend to use less cores for shapes with batch less than coreNum and m/k/n can full load in + // aicore buffers split_k is conflict with m/n shift_inwards + bool specialScenario = false; + if (params.n32 > MIN_MTE1_LOAD) { + specialScenario = specialScenario || + (splitKFlag && ((static_cast(params.nMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); + } + if (params.m32 > MIN_MTE1_LOAD) { + specialScenario = specialScenario || + (splitKFlag && ((static_cast(params.mMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); + } + + if (params.batch32 * params.n32 * params.m32 <= coreNum && params.m32 * params.k32 <= miniL0abThreshold && + params.n32 * params.k32 <= miniL0abThreshold && params.m32 * params.n32 <= miniL0cThreshold && + !specialScenario) { + coreStatus.batchDim = params.batch32; + coreStatus.nDim = params.n32 <= MIN_MTE1_LOAD ? 1 : params.nMapped / MIN_MTE1_LOAD; + coreStatus.mDim = params.m32 <= MIN_MTE1_LOAD ? 1 : params.mMapped / MIN_MTE1_LOAD; + int32_t kDimCandidate[2] = {0}; // storage 2 factors of k around kDim + GetTwoFactors(kDimCandidate, coreStatus.kDim, params.k32, coreNum); + coreStatus.kDim = (params.k32 <= MIN_MTE1_LOAD || !splitKFlag) ? + 1 : + (kDimCandidate[1] > 1 ? kDimCandidate[1] : kDimCandidate[0]); + coreStatus.batch = 1; + coreStatus.n = coreStatus.nDim == 1 ? params.n32 : MathUtil::CeilDivision(params.nMapped, coreStatus.nDim); + coreStatus.m = coreStatus.mDim == 1 ? params.m32 : MathUtil::CeilDivision(params.mMapped, coreStatus.mDim); + coreStatus.k = coreStatus.kDim == 1 ? params.k32 : MathUtil::CeilDivision(params.kMapped, coreStatus.kDim); + params.nonFactorK = (coreStatus.kDim == 0) ? false : (params.k32 % coreStatus.kDim == 0 ? 
false : true); + return true; + } + return false; +} +float MatmulTilingAlgorithm::CalculateBlockCycles(int32_t baseM, int32_t baseN, int32_t baseK) const +{ + const int32_t reduceBlockSize = C0_BYTE_SIZE * BITS_PER_BYTE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + return static_cast(baseM * baseN * baseK) / (C0_SIZE * C0_SIZE * reduceBlockSize); +} + +int32_t MatmulTilingAlgorithm::CalculateMemoryTraffic(int32_t baseM, int32_t baseN, int32_t baseK) const +{ + int32_t aMatrixSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + int32_t bMatrixSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + return aMatrixSize + bMatrixSize; +} + +bool MatmulTilingAlgorithm::AlignSingleShape(bool needAlign, int32_t orgShape, int32_t factor, int32_t alignSize, + int32_t &singleShape) const +{ + singleShape = MathUtil::CeilDivision(orgShape, factor); + if (!needAlign || alignSize == 0 || orgShape % alignSize != 0) { + return true; // orgShape not align, don't need to adjust + } + if (factor <= 1) { + return true; + } + int32_t maxSingleShape = MathUtil::CeilDivision(orgShape, factor - 1); + int32_t alignSingleShape = MathUtil::Align(singleShape, alignSize); + if (alignSingleShape >= maxSingleShape) { + return false; + } + singleShape = alignSingleShape; + return true; +} + +ComputeBaseBlock MatmulTilingAlgorithm::GetMultiCoreBasicBlock(const MatmulRunParas& params) const +{ + (void)params; + constexpr static int32_t l0c256KB = 262144; + constexpr static int32_t basicSize128 = 128; + constexpr static int32_t basicSize256 = 256; + int32_t basicM = basicSize128; + if (tilingIns_->bufferPool_.l0CSize == l0c256KB) { + basicM = basicSize256; + } + int32_t basicN = basicSize256; + int32_t aDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) != 0 ? + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) : 1; + int32_t basicK = basicSize128 * BITS_PER_BYTE / aDtypeSize; + ComputeBaseBlock basicBlock {basicM, basicN, basicK}; + // SetFixSplit + if (tilingIns_->baseM != -1) { + basicBlock.baseM = tilingIns_->baseM; + } + if (tilingIns_->baseN != -1) { + basicBlock.baseN = tilingIns_->baseN; + } + if (!tilingIns_->aType_.isTrans && !tilingIns_->bType_.isTrans) { + return basicBlock; + } + if (tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { + basicBlock.baseM = tilingIns_->baseM != -1 ? basicBlock.baseM : basicSize256; + basicBlock.baseN = tilingIns_->baseN != -1 ? 
basicBlock.baseN : basicSize128; + return basicBlock; + } + + return basicBlock; +} + +float MatmulTilingAlgorithm::CalcBaseBlockBandRatio(int32_t mDim, int32_t nDim, const ComputeBaseBlock &baseBlock) const +{ + float bandRatio = static_cast((numOfBlock_ - mDim) * baseBlock.baseM + (numOfBlock_ - nDim) * baseBlock.baseN) / + static_cast((baseBlock.baseM + baseBlock.baseN) * numOfBlock_); + return bandRatio; +} + +ComputeIntensity MatmulTilingAlgorithm::CalcComputeIntensity(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, + const std::pair &factor) const +{ + auto mFactor = factor.first; + auto nFactor = factor.second; + int32_t sm = 0; + int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + bool alignSuccA = AlignSingleShape(tilingIns_->aType_.isTrans, params.oriShapeM, mFactor, aAlignSize, sm); + int32_t sn = 0; + int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; + bool alignSuccB = AlignSingleShape(!tilingIns_->bType_.isTrans, params.oriShapeN, nFactor, bAlignSize, sn); + auto shapeM = MathUtil::DivideIntoMainAndTail(sm, baseBlock.baseM); + auto shapeN = MathUtil::DivideIntoMainAndTail(sn, baseBlock.baseN); + auto mainM = shapeM.first; + auto tailM = shapeM.second; + auto mainN = shapeN.first; + auto tailN = shapeN.second; + int32_t memoryRatio = (alignSuccA && alignSuccB) ? 1 : 2; + float bandRatio = CalcBaseBlockBandRatio(mFactor, nFactor, baseBlock); + std::vector blocks; + // Main Chunk + if (mainM > 0 && mainN > 0) { + int count = mainM * mainN; + float cycles = CalculateBlockCycles(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; + int32_t memory = memoryRatio * + CalculateMemoryTraffic(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; + blocks.push_back({count, cycles, memory}); + } + // N Tail Chunk + if (mainM > 0 && tailN > 0) { + float cycles = CalculateBlockCycles(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; + int32_t memory = memoryRatio * CalculateMemoryTraffic(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; + blocks.push_back({mainM, cycles, memory}); + } + // M Tail Chunk + if (tailM > 0 && mainN > 0) { + float cycles = CalculateBlockCycles(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; + int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; + blocks.push_back({mainN, cycles, memory}); + } + // M and N Tail Chunk + if (tailM > 0 && tailN > 0) { + float cycles = CalculateBlockCycles(tailM, tailN, baseBlock.baseK); + int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, tailN, baseBlock.baseK); + blocks.push_back({1, cycles, memory}); + } + float totalCycles = 0; + int32_t totalMemory = 0; + for (const auto& v : blocks) { + totalCycles += v.computeCycle; + totalMemory += v.memoryTraffic; + } + return { + {mFactor, nFactor}, totalCycles, (totalMemory != 0) ? 
totalCycles / totalMemory : 0, bandRatio}; +} + +MultiCoreScenario MatmulTilingAlgorithm::GetMultiCoreScenario(const MatmulRunParas& params) const +{ + if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { + return MultiCoreScenario::OTHERS; + } + if (tilingIns_->enableSplitK_ || tilingIns_->singleM != -1 || tilingIns_->singleN != -1) { + return MultiCoreScenario::OTHERS; + } + constexpr int64_t mnLimit = 26214; // 128 * 256 * 0.8 + constexpr int64_t mLimit = 128; + if (params.oriShapeM >= mLimit && params.oriShapeM * params.oriShapeN > mnLimit * numOfBlock_) { + return MultiCoreScenario::SPLIT_MN; + } + return MultiCoreScenario::OTHERS; +} + +void MatmulTilingAlgorithm::UpdateStepK(const ComputeBaseBlock &baseBlock, int32_t &stepK) const +{ + if (stepK * baseBlock.baseK >= GetSingleK()) { + return; + } + constexpr static int32_t baseBlockSize512 = 512; + constexpr static int32_t baseBlockSize256 = 256; + int32_t aTypeBitSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize512) { + if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0) && + (baseBlockSize512 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { + while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0 && stepK > 1) { + stepK--; + } + } + } else if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize256) { + if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0) && + (baseBlockSize256 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { + while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0 && stepK > 1) { + stepK--; + } + } + } +} + +void MatmulTilingAlgorithm::CalcL1Tiling(const ComputeBaseBlock &baseBlock, int32_t &depthA1, int32_t &depthB1, + int32_t &stepKa, int32_t &stepKb) +{ + int32_t l1Size = tilingIns_->bufferPool_.l1Size; + constexpr static int32_t reservedL1Size = 256; // l1 reserved 256B + int32_t depthA1Size = (l1Size / DB_ON / baseBlock.baseM / baseBlock.baseK) * BITS_PER_BYTE / + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); + int32_t depthB1Size = ((l1Size + reservedL1Size) / DB_ON / baseBlock.baseN / baseBlock.baseK) * BITS_PER_BYTE / + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); + int32_t btSize = tilingIns_->isBias ? tilingIns_->bufferPool_.btSize / BITS_PER_BYTE : 0; + if (depthA1Size + depthB1Size > l1Size - btSize) { + if (baseBlock.baseM <= baseBlock.baseN) { + depthA1Size = depthA1Size / DB_ON; + } else { + depthB1Size = depthB1Size / DB_ON; + } + } + int32_t l1Db = g_tempCfg.l1DB == DB_OFF ? 
DB_OFF : DB_ON; + stepKa = depthA1Size / l1Db; + stepKb = depthB1Size / l1Db; + UpdateStepK(baseBlock, stepKa); + UpdateStepK(baseBlock, stepKb); + if (stepKa >= stepKb && stepKb != 0) { + stepKa = stepKa / stepKb * stepKb; + } else if (stepKa != 0) { + stepKb = stepKb / stepKa * stepKa; + } + depthA1 = stepKa * l1Db; + depthB1 = stepKb * l1Db; +} + +L0StatusPack MatmulTilingAlgorithm::GetL0CoreStatus(const ComputeBaseBlock &baseBlock) const +{ + L0StatusPack l0Status; + const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + l0Status.dbL0C = g_tempCfg.l0cDB; + if (baseBlock.baseM * baseBlock.baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { + l0Status.dbL0C = DB_OFF; + } + l0Status.dbL0A = DB_ON; + l0Status.dbL0B = DB_ON; + l0Status.mL0 = baseBlock.baseM / C0_SIZE; + l0Status.kL0 = baseBlock.baseK / reduceSize; + l0Status.nL0 = baseBlock.baseN / C0_SIZE; + return l0Status; +} + +L1StatusPack MatmulTilingAlgorithm::GetL1CoreStatus(const ComputeBaseBlock &baseBlock, int32_t depthA1, int32_t depthB1, + int32_t stepKa, int32_t stepKb) const +{ + L1StatusPack l1Status; + l1Status.mAL1 = 1; + l1Status.nBL1 = 1; + const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + l1Status.kAL1 = baseBlock.baseK / reduceSize * stepKa; + l1Status.kBL1 = baseBlock.baseK / reduceSize * stepKb; + l1Status.dbAL1 = depthA1 >= stepKa * DB_ON ? DB_ON : DB_OFF; + l1Status.dbBL1 = depthB1 >= stepKb * DB_ON ? DB_ON : DB_OFF; + return l1Status; +} + +void MatmulTilingAlgorithm::UpdateShapeAndLayout() const +{ + tilingIns_->tiling_.set_M(tilingIns_->orgM); + tilingIns_->tiling_.set_N(tilingIns_->orgN); + tilingIns_->tiling_.set_Ka(tilingIns_->orgKa); + tilingIns_->tiling_.set_Kb(tilingIns_->orgKb); + tilingIns_->tiling_.set_batchM(tilingIns_->batchM); + tilingIns_->tiling_.set_batchN(tilingIns_->batchN); + tilingIns_->tiling_.set_singleBatchM(tilingIns_->singleBatchM); + tilingIns_->tiling_.set_singleBatchN(tilingIns_->singleBatchN); + + tilingIns_->tiling_.set_ALayoutInfoB(tilingIns_->aLayoutInfoB); + tilingIns_->tiling_.set_ALayoutInfoS(tilingIns_->aLayoutInfoS); + tilingIns_->tiling_.set_ALayoutInfoN(tilingIns_->aLayoutInfoN); + tilingIns_->tiling_.set_ALayoutInfoG(tilingIns_->aLayoutInfoG); + tilingIns_->tiling_.set_ALayoutInfoD(tilingIns_->aLayoutInfoD); + + tilingIns_->tiling_.set_BLayoutInfoB(tilingIns_->bLayoutInfoB); + tilingIns_->tiling_.set_BLayoutInfoS(tilingIns_->bLayoutInfoS); + tilingIns_->tiling_.set_BLayoutInfoN(tilingIns_->bLayoutInfoN); + tilingIns_->tiling_.set_BLayoutInfoG(tilingIns_->bLayoutInfoG); + tilingIns_->tiling_.set_BLayoutInfoD(tilingIns_->bLayoutInfoD); + + tilingIns_->tiling_.set_CLayoutInfoB(tilingIns_->cLayoutInfoB); + tilingIns_->tiling_.set_CLayoutInfoS1(tilingIns_->cLayoutInfoS1); + tilingIns_->tiling_.set_CLayoutInfoN(tilingIns_->cLayoutInfoN); + tilingIns_->tiling_.set_CLayoutInfoG(tilingIns_->cLayoutInfoG); + tilingIns_->tiling_.set_CLayoutInfoS2(tilingIns_->cLayoutInfoS2); + tilingIns_->tiling_.set_BatchNum(tilingIns_->batchNum); + return; +} + +void MatmulTilingAlgorithm::UpdateUsedSize() const +{ + int32_t transLength = 0; + GetTransLength(transLength); + int32_t a1LengthCache = 0; + int32_t b1LengthCache = 0; + SetDepthL1CacheUBParams(a1LengthCache, b1LengthCache); + tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse in ub + tilingIns_->tiling_.set_shareMode(0); + int32_t l1Size = 0; + int32_t l0cSize = 0; + int32_t ubSize = 0; + GetUsedSize(l1Size, l0cSize, 
ubSize, a1LengthCache, b1LengthCache); + tilingIns_->tiling_.set_shareL1Size(l1Size); + tilingIns_->tiling_.set_shareL0CSize(l0cSize); + tilingIns_->tiling_.set_shareUbSize(ubSize); +} + +int64_t MatmulTilingAlgorithm::AdjustOuterProductL0Factor(const SingleCoreStatus& singleCoreStatus) const +{ + if (tilingIns_->scheduleType != ScheduleType::OUTER_PRODUCT) { + return 0; + } + // check whether OUTER_PRODUCT is supported + if ((tilingIns_->tiling_.get_baseK() < tilingIns_->tiling_.get_singleCoreK()) && + ((tilingIns_->mmConfigType == 1) || ((tilingIns_->mmConfigType == 0) && + (tilingIns_->batchNum != 0)))) { + TILING_LOG_WARNING("Unsupported scheduleType is OUTER_PRODUCT"); + return -1L; + } + int32_t newBaseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; + int32_t newBaseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; + // when scheduleType is OUTER_PRODUCT, each iteration computes 2 * basicBlock size of data + bool isL0CFullUsed = (newBaseM * newBaseN * NUM_TWO * + static_cast(DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType))) > + static_cast(tilingIns_->bufferPool_.l0CSize) ? true : false; + if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 0)) { + // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_M, N db in L0 + newBaseN = MathUtil::Align(newBaseN / NUM_TWO, C0_SIZE); + } else if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 1)) { + // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_N, M db in L0 + newBaseM = MathUtil::Align(newBaseM / NUM_TWO, C0_SIZE); + } + tilingIns_->tiling_.set_baseM(newBaseM); + tilingIns_->tiling_.set_baseN(newBaseN); + return 0; +} + +void MatmulTilingAlgorithm::AdjustFloatL1Factor(const SingleCoreStatus& singleCoreStatus) const +{ + if (DTYPE_BYTE_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BYTE_TAB.at(DataType::DT_FLOAT)) { + if (tilingIns_->tiling_.get_baseK() == DT_FLOAT_INVALID_BASEK) { + tilingIns_->tiling_.set_stepKb(1); + tilingIns_->tiling_.set_depthB1(singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1); + } + } +} + +int64_t MatmulTilingAlgorithm::UpdateTiling(const MatmulRunParas& param, const CoreStatusPack &coreStatus, SingleCoreStatus& singleCoreStatus) +{ + int32_t coreUse = singelBlockDim_ ? 
tilingIns_->blockDim : + coreStatus.batchDim * coreStatus.mDim * coreStatus.kDim * coreStatus.nDim; + int32_t singleCoreM; + int32_t singleCoreN; + int32_t singleCoreK; + GetSingleShape(coreStatus, param, singleCoreM, singleCoreN, singleCoreK); + if (!CheckSingleShape(singleCoreM, singleCoreN, singleCoreK)) { + return -1L; + } + tilingIns_->tiling_.set_usedCoreNum(coreUse); + tilingIns_->tiling_.set_singleCoreM(singleCoreM); + tilingIns_->tiling_.set_singleCoreN(singleCoreN); + tilingIns_->tiling_.set_singleCoreK(singleCoreK); + UpdateShapeAndLayout(); + tilingIns_->tiling_.set_baseM(singleCoreStatus.l0Status.mL0 * C0_SIZE); + tilingIns_->tiling_.set_baseN(singleCoreStatus.l0Status.nL0 * C0_SIZE); + const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); + tilingIns_->tiling_.set_iterateOrder(GetIteratorOrder(singleCoreStatus, singleCoreM, singleCoreN, singleCoreK)); + // check whether OUTER_PRODUCT is supported + if (AdjustOuterProductL0Factor(singleCoreStatus) != 0) { + return -1L; + } + tilingIns_->baseM = tilingIns_->tiling_.get_baseM(); + tilingIns_->baseN = tilingIns_->tiling_.get_baseN(); + tilingIns_->baseK = tilingIns_->tiling_.get_baseK(); + AdjustMxL1Factors(singleCoreStatus, reduceSize); + int32_t mxTypePara = 0; + GetMxScaleFactor(singleCoreStatus, reduceSize, mxTypePara); + tilingIns_->tiling_.set_mxTypePara(mxTypePara); + tilingIns_->tiling_.set_depthA1( + MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); + tilingIns_->tiling_.set_depthB1(UpdateDepthB1(singleCoreStatus)); + // if decrease depthB1, nBL1 must decrease to ensure nBL1 is less then depthB1 + singleCoreStatus.l1Status.nBL1 = min(singleCoreStatus.l1Status.nBL1, tilingIns_->tiling_.get_depthB1()); + tilingIns_->tiling_.set_stepM(singleCoreStatus.l1Status.mAL1); + tilingIns_->tiling_.set_stepN(singleCoreStatus.l1Status.nBL1); + tilingIns_->tiling_.set_stepKa( + MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0)); + tilingIns_->tiling_.set_stepKb( + MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0)); + AdjustFloatL1Factor(singleCoreStatus); + tilingIns_->tiling_.set_isBias(tilingIns_->isBias ? 
1 : 0);
+ tilingIns_->tiling_.set_dbL0A(singleCoreStatus.l0Status.dbL0A);
+ tilingIns_->tiling_.set_dbL0B(singleCoreStatus.l0Status.dbL0B);
+ tilingIns_->tiling_.set_dbL0C(singleCoreStatus.l0Status.dbL0C);
+ UpdateUsedSize();
+ return 0;
+}
+
+bool MatmulTilingAlgorithm::DoMultiCoreSplitMNTiling(const MatmulRunParas& params, CoreStatusPack& coreStatus,
+ BlockDimCalculator& blockDimRes)
+{
+ if (GetMultiCoreScenario(params) != MultiCoreScenario::SPLIT_MN) {
+ return false;
+ }
+ ComputeBaseBlock baseBlock = GetMultiCoreBasicBlock(params); // calc basic block
+ CalcMultiCoreBlockDims(params, baseBlock, coreStatus, blockDimRes);
+ SingleCoreStatus singleCoreStatus;
+ singleCoreStatus.l0Status = GetL0CoreStatus(baseBlock);
+ AdjustSparseL0Factors(singleCoreStatus);
+ AdjustMxL0Factors(singleCoreStatus);
+ int32_t depthA1;
+ int32_t depthB1;
+ int32_t stepKa;
+ int32_t stepKb;
+ CalcL1Tiling(baseBlock, depthA1, depthB1, stepKa, stepKb);
+ singleCoreStatus.l1Status = GetL1CoreStatus(baseBlock, depthA1, depthB1, stepKa, stepKb);
+ (void)UpdateTiling(params, coreStatus, singleCoreStatus);
+ return true;
+}
+
+bool MatmulTilingAlgorithm::NeedOutputAlign(int32_t m, int32_t n, int32_t k) const
+{
+ int32_t aTypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType);
+ int32_t bTypeSize = DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType);
+ int32_t cTypeSize = DTYPE_BIT_TAB.at(tilingIns_->cType_.dataType);
+ constexpr static int32_t outputRatio = 2;
+ bool needAlign = static_cast(n * m) * static_cast(outputRatio * cTypeSize) >
+ static_cast(n * k * aTypeSize) + static_cast(m * k * bTypeSize);
+ return needAlign;
+}
+
+void MatmulTilingAlgorithm::CalcMultiCoreBlockDims(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock,
+ CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes)
+{
+ auto factors = MathUtil::GetFactorPairs(numOfBlock_);
+ std::vector results;
+ for (const auto& factor : factors) {
+ results.push_back(CalcComputeIntensity(params, baseBlock, factor));
+ }
+ // Sort the candidate results
+ std::sort(results.begin(), results.end());
+ for (auto v : results) {
+ TILING_LOG_DEBUG("intent:%f, cycle: %f, band: %f, mDim: %d, nDim: %d\n",
+ v.avgIntensity, v.computeCycle, v.bandRatio, v.dimFactor.first, v.dimFactor.second);
+ }
+ coreStatus.batchDim = 1;
+ blockDimRes.nDimFactor = results[0].dimFactor.second;
+ blockDimRes.mDimFactor = results[0].dimFactor.first;
+ blockDimRes.kDimFactor = 1;
+ coreStatus.mDim = results[0].dimFactor.first;
+ coreStatus.nDim = results[0].dimFactor.second;
+ coreStatus.kDim = 1;
+ const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped, blockDimRes.nDimFactor, false);
+ const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false);
+ int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE;
+ int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE;
+ bool needOutputAlign = NeedOutputAlign(m, n, GetSingleK());
+ (void)AlignSingleShape((!tilingIns_->bType_.isTrans || needOutputAlign), n, coreStatus.nDim, bAlignSize, coreStatus.n);
+ (void)AlignSingleShape(tilingIns_->aType_.isTrans, m, coreStatus.mDim, aAlignSize, coreStatus.m);
+ blockDimRes.kNum = params.k32 / coreStatus.kDim * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16
+ blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2
+ coreStatus.batch = params.batch32;
+ coreStatus.k = params.k32 / coreStatus.kDim;
+ 
TILING_LOG_DEBUG("CalcMultiCoreBlockDims, coreStatus m: %d n: %d k: %d", coreStatus.m, coreStatus.n, coreStatus.k); + // load size of A matrix is batch * m + // load size of B matrix is n + DimFactor blockDim(1, blockDimRes.mDimFactor, blockDimRes.kDimFactor, blockDimRes.nDimFactor); + GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); + return; +} + +void MatmulTilingAlgorithm::UpdateMultiCore(const std::string& opType, const MatmulRunParas& params, + CoreStatusPack& coreStatus, const BlockDimCalculator& blockDimRes) const +{ + (void)(opType); + // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. + coreStatus.batchDim = min(MathUtil::CeilDivision(params.batch32, coreStatus.batch), numOfBlock_); + coreStatus.nDim = min(MathUtil::CeilDivision(params.n32, coreStatus.n), numOfBlock_); + coreStatus.mDim = min(MathUtil::CeilDivision(params.m32, coreStatus.m), numOfBlock_); + + if (tilingIns_->enableSplitK_) { + coreStatus.kDim = min(MathUtil::CeilDivision(params.k32, coreStatus.k), numOfBlock_); + } else { + coreStatus.kDim = blockDimRes.kDimFactor; + } + UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, + coreStatus); +} + +void MatmulTilingAlgorithm::UpdateBufferSize(const TilingPolicy policy, const CoreStatusPack& coreStatus) const +{ + if (policy == TilingPolicy::NO_POLICY) { + return; + } else if (policy == TilingPolicy::FIXED_B_TSCM) { + const int32_t bMatrixSize = + MathUtil::Align(coreStatus.n, MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE)) * coreStatus.k * + C0_SIZE * C0_BYTE_SIZE * 2; + tilingIns_->bufferPool_.l1Size -= bMatrixSize; + } else if (policy == TilingPolicy::FIXED_A_TSCM) { + const int32_t aMatrixSize = coreStatus.m * coreStatus.k * C0_SIZE * C0_BYTE_SIZE * 2; + tilingIns_->bufferPool_.l1Size -= aMatrixSize; + } else { + return; + } +} + +bool MatmulTilingAlgorithm::IsInvalidFactor(int32_t factor) const +{ + return factor > numOfBlock_ || factor <= 0; +} + +void MatmulTilingAlgorithm::AddOptimalFactors(const std::string& opType, const MatmulRunParas& params, + BlockDimCalculator& blockDimRes) const +{ + (void)(opType); + const int32_t coreNum = numOfBlock_; + // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) + const int32_t mnCore = MathUtil::CeilDivision(coreNum, params.batch32); + if (mnCore > 1) { + const float optPoint = static_cast(sqrt((params.m32 + 0.0f) / params.n32 * mnCore)); + const int32_t mdim = static_cast(ceil(optPoint)); + const int32_t ndim = static_cast(ceil(mnCore / optPoint)); + MathUtil::AddFactor(blockDimRes.mDimFactors, mdim); + MathUtil::AddFactor(blockDimRes.mDimFactors, ndim == 0 ? 1 : mnCore / ndim); + MathUtil::AddFactor(blockDimRes.nDimFactors, ndim); + MathUtil::AddFactor(blockDimRes.nDimFactors, mdim == 0 ? 
1 : mnCore / mdim); + } +} + +void MatmulTilingAlgorithm::GenBlockDimsMapFactors(const std::string& opType, MatmulRunParas& params, + BlockDimCalculator& blockDimRes) const +{ + const int32_t coreNum = numOfBlock_; + blockDimRes.batchDimFactors.reserve(coreNum); + blockDimRes.mDimFactors.reserve(coreNum); + blockDimRes.nDimFactors.reserve(coreNum); + blockDimRes.kDimFactors.reserve(coreNum); + MathUtil::GetBlockFactors(blockDimRes.batchDimFactors, params.batch32, params.batchMapped, coreNum, + min(coreNum, params.batch32)); + MathUtil::GetBlockFactors(blockDimRes.mDimFactors, params.m32, params.mMapped, coreNum, min(coreNum, params.m32)); + MathUtil::GetBlockFactors(blockDimRes.nDimFactors, params.n32, params.nMapped, coreNum, min(coreNum, params.n32)); + // first get kDim candidate + if (!tilingIns_->enableSplitK_) { + blockDimRes.kDimFactors.push_back(1); + params.kMapped = params.k32; + } else { + MathUtil::GetBlockFactors(blockDimRes.kDimFactors, params.k32, params.kMapped, coreNum, coreNum); + } + AddOptimalFactors(opType, params, blockDimRes); +} + +void MatmulTilingAlgorithm::GetBlockDim(const std::string& opType, MatmulRunParas& params, CoreStatusPack& coreStatus, + BlockDimCalculator& blockDimRes) +{ + // get batchDim, kDim, mDim and nDim for single core + // support multi cores slicing along kDim + // single core batchDim, mDim, nDim, kDim is a factor of input batch, m, n, k + // multi-core strategy for mini shape's is different from other situations and requires preprocess + if (PreProcessMiniShape(opType, coreStatus, params, numOfBlock_, tilingIns_->enableSplitK_)) { + // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. + coreStatus.batchDim = MathUtil::CeilDivision(params.batch32, coreStatus.batch); + coreStatus.nDim = MathUtil::CeilDivision(params.n32, coreStatus.n); + coreStatus.mDim = MathUtil::CeilDivision(params.m32, coreStatus.m); + coreStatus.kDim = MathUtil::CeilDivision(params.k32, coreStatus.k); + UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? 
TilingPolicy::FIXED_B_TSCM : + TilingPolicy::NO_POLICY, + coreStatus); + splitCoreFlag_ = true; + return; + } + GenBlockDimsMapFactors(opType, params, blockDimRes); + for (const int32_t bFactor : blockDimRes.batchDimFactors) { + for (const int32_t nFactor : blockDimRes.nDimFactors) { + if (IsInvalidFactor(bFactor * nFactor)) { + continue; + } + for (const int32_t mFactor : blockDimRes.mDimFactors) { + if (IsInvalidFactor(bFactor * nFactor * mFactor)) { + continue; + } + for (const int32_t kFactor : blockDimRes.kDimFactors) { + if (IsInvalidFactor(bFactor * nFactor * mFactor * kFactor)) { + continue; + } + DimFactor blockDim(bFactor, mFactor, kFactor, nFactor); + GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); + } + } + } + } + + coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDimRes.batchDimFactor); + coreStatus.n = MathUtil::CeilDivision(params.n32, blockDimRes.nDimFactor); + coreStatus.m = MathUtil::CeilDivision(params.m32, blockDimRes.mDimFactor); + coreStatus.k = MathUtil::CeilDivision(params.k32, blockDimRes.kDimFactor); + if (g_tempCfg.factorSplit) { + const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped, blockDimRes.nDimFactor, false); + const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false); + const int32_t k = MathUtil::FindBestSingleCore(params.k32, params.kMapped, blockDimRes.kDimFactor, true); + const int32_t needCoreNum = static_cast(MathUtil::CeilDivision(params.batch32, coreStatus.batch) * + MathUtil::CeilDivision(params.n32, n) * + MathUtil::CeilDivision(params.m32, m) * + MathUtil::CeilDivision(params.k32, k)); + if (IsInvalidFactor(needCoreNum) == false) { + coreStatus.n = n; + coreStatus.m = m; + coreStatus.k = k; + } + } + + params.nonFactorK = params.k32 == params.kMapped ? false : true; + UpdateMultiCore(opType, params, coreStatus, blockDimRes); +} + +void MatmulTilingAlgorithm::NonFactorMap(const std::string& opType, MatmulRunParas& param, + BlockDimCalculator& blockDimRes) const +{ + (void)(opType); + param.batchMapped = param.batch32; + param.mMapped = param.m32; + param.kMapped = param.k32; + param.nMapped = param.n32; + // Split k will introduce atomic_add which can't be used with shift_inwards. + // Thus in split k mode, batch/m/n/ can't use non-factorial segmentation. 
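+ // Note: the split-K branch below remaps a k dimension with too few usable factors via
+ // MathUtil::MapShape(param.k32, false); per its comment, this is assumed to floor the value
+ // to a power of two. A minimal standalone helper with that assumed behaviour, illustrative
+ // only and not part of MathUtil, would be:
+ //
+ //     // Floor x (x >= 1) to the nearest power of two, e.g. 100 -> 64, 64 -> 64, 1 -> 1.
+ //     static int32_t FloorToPowerOfTwo(int32_t x)
+ //     {
+ //         int32_t p = 1;
+ //         while (p * 2 <= x) {
+ //             p *= 2;
+ //         }
+ //         return p;
+ //     }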
+ if (tilingIns_->enableSplitK_) {
+ // Only the non-factor splitting of k needs to be considered when splitKFlag is true
+ int32_t kFactorLess64Cnt = 0;
+ int32_t kFactorLess1024Cnt = 0;
+ MathUtil::GetFactorCnt(param.k32, kFactorLess64Cnt, 1, L0_FACTOR_LIMIT);
+ MathUtil::GetFactorCnt(param.k32, kFactorLess1024Cnt, L0_FACTOR_LIMIT + 1, L1_FACTOR_LIMIT);
+ if ((param.k32 > L0_FACTOR_LIMIT && kFactorLess64Cnt <= L0_FACTOR_NUM_LIMIT) ||
+ (param.k32 > L1_FACTOR_LIMIT && kFactorLess64Cnt + kFactorLess1024Cnt <= L1_FACTOR_NUM_LIMIT)) {
+ // A non-factor k dimension is mapped to a down-aligned power of 2
+ param.kMapped = MathUtil::MapShape(param.k32, false);
+ }
+ } else {
+ MathUtil::GetFactorCnt(param.batch32, blockDimRes.batchFactorCnt, 1, numOfBlock_);
+ if (param.batch32 > 1 && blockDimRes.batchFactorCnt <= L0_FACTOR_NUM_LIMIT) {
+ param.batchMapped = MathUtil::MapShape(param.batch32);
+ }
+ param.mMapped = MathUtil::MapShape(param.m32);
+ param.nMapped = MathUtil::MapShape(param.n32);
+ }
+}
+
+void MatmulTilingAlgorithm::FillParam(MatmulRunParas& param)
+{
+ param.oriShapeM = tilingIns_->orgM;
+ param.oriShapeN = tilingIns_->orgN;
+ param.oriShapeKa = tilingIns_->orgKa;
+ param.oriShapeKb = tilingIns_->orgKb;
+ int32_t realM = 1;
+ int32_t realN = 1;
+ int32_t realK = 1;
+
+ if (tilingIns_->singleCoreM != -1 || tilingIns_->singleCoreK != -1 || tilingIns_->singleCoreN != -1) {
+ realM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM;
+ realK = tilingIns_->singleCoreK != -1 ? tilingIns_->singleCoreK : tilingIns_->singleK;
+ realN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN;
+ singelBlockDim_ = true;
+ numOfBlock_ = 1;
+ } else {
+ realM = GetSingleM();
+ realK = GetSingleK();
+ realN = GetSingleN();
+ singelBlockDim_ = false;
+ numOfBlock_ = tilingIns_->blockDim;
+ }
+
+ const int32_t reduceBlockSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE;
+ param.k32 = MathUtil::CeilDivision(realK, reduceBlockSize);
+ param.m32 = MathUtil::CeilDivision(realM, C0_SIZE);
+ param.n32 = MathUtil::CeilDivision(realN, C0_SIZE);
+ param.mMapped = MathUtil::MapShape(param.m32, true);
+ param.kMapped = MathUtil::MapShape(param.k32, true);
+ param.nMapped = MathUtil::MapShape(param.n32, true);
+}
+
+bool MatmulTilingAlgorithm::CheckFinaleParams(const CoreStatusPack& coreStatus) const
+{
+ (void)coreStatus;
+ const int32_t stepM = tilingIns_->tiling_.get_stepM();
+ const int32_t stepN = tilingIns_->tiling_.get_stepN();
+ const int32_t depthA1 = tilingIns_->tiling_.get_depthA1();
+ const int32_t depthB1 = tilingIns_->tiling_.get_depthB1();
+
+ const int32_t l1Size = tilingIns_->tiling_.get_shareL1Size();
+ const int32_t l0CSize = tilingIns_->tiling_.get_shareL0CSize();
+ const int32_t uBSize = tilingIns_->tiling_.get_shareUbSize();
+
+ if (stepM == 0 || stepN == 0 || depthA1 == 0 || depthB1 == 0) {
+ TILING_LOG_WARNING("stepM/N and depthA1/B1 should be greater than zero");
+ return false;
+ }
+
+ if (stepM > depthA1 || stepN > depthB1) {
+ TILING_LOG_WARNING("stepM/N should not be greater than depthA1/B1");
+ return false;
+ }
+
+ if (l1Size > tilingIns_->bufferPool_.l1Size || l0CSize > tilingIns_->bufferPool_.l0CSize ||
+ uBSize > tilingIns_->bufferPool_.ubSize) {
+ TILING_LOG_WARNING("L1/L0C/UB used size should not exceed L1Size/L0CSize/UbSize");
+ return false;
+ }
+
+ int dateDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType);
+ int32_t biasL1Size = tilingIns_->isBias ?
+ tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE : 0; + if (!tilingIns_->isBMNKBmm && tilingIns_->tiling_.get_BatchNum() > 0 && + ((tilingIns_->tiling_.get_singleCoreM() * tilingIns_->tiling_.get_singleCoreK() + + tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK()) * + tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE + biasL1Size > + tilingIns_->bufferPool_.l1Size)) { + TILING_LOG_WARNING("a/b matrix size of batch mm should less then L1Size"); + return false; + } + + return true; +} + +void MatmulTilingAlgorithm::CheckL0DB(SingleCoreStatus& singleCoreStatus, const int32_t baseK) const +{ + int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; + int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; + if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.isTrans && + tilingIns_->aType_.scalePos == TPosition::TSCM) { + baseM = MathUtil::Align(singleCoreStatus.l0Status.mL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; + } + if (tilingIns_->bType_.type == CubeFormat::ND && !tilingIns_->bType_.isTrans && + tilingIns_->bType_.scalePos == TPosition::TSCM) { + baseN = MathUtil::Align(singleCoreStatus.l0Status.nL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; + } + if (baseM * baseK > tilingIns_->bufferPool_.l0ASize / DB_ON) { + singleCoreStatus.l0Status.dbL0A = DB_OFF; + } + if (baseN * baseK > tilingIns_->bufferPool_.l0BSize / DB_ON) { + singleCoreStatus.l0Status.dbL0B = DB_OFF; + } + if (baseM * baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { + singleCoreStatus.l0Status.dbL0C = DB_OFF; + } +} + +void MatmulTilingAlgorithm::GetMxUsedL1Size(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, + int32_t& dataUsedL1Size, int32_t& scaleUsedL1Size, int32_t& biasUsedL1Size) const +{ + int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; + int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; + int32_t baseK = singleCoreStatus.l0Status.kL0 * k0Size; + + int32_t depthA1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1; + int32_t depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; + dataUsedL1Size = depthA1 * baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE + + depthB1 * baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + // scale is fp8e8m0 + scaleUsedL1Size = depthA1 * baseM * baseK / SCALE_K_SIZE + + depthB1 * baseN * baseK / SCALE_K_SIZE; + // bias is fp32 + int32_t bias = tilingIns_->isBias ? 
1 : 0;
+ biasUsedL1Size = bias * baseN * DTYPE_BIT_TAB.at(tilingIns_->biasType_.dataType) / BITS_PER_BYTE;
+}
+
+void MatmulTilingAlgorithm::AdjustSparseL0Factors(SingleCoreStatus& singleCoreStatus) const
+{
+ // determine whether the scenario is sparse
+ if (!tilingIns_->isSparse_) {
+ TILING_LOG_DEBUG("Non-sparse scenario does not need to adjust L0Factors.");
+ return;
+ }
+
+ int32_t baseK =
+ singleCoreStatus.l0Status.kL0 * (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE);
+ constexpr int32_t sparseBaseKFac = 64; // baseK needs to be aligned to 64 for sparse
+ if (baseK <= sparseBaseKFac) {
+ baseK = sparseBaseKFac;
+ } else {
+ baseK = MathUtil::AlignDown(baseK, sparseBaseKFac);
+ }
+ singleCoreStatus.l0Status.kL0 =
+ baseK / (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE);
+
+ // check L0A/L0B/L0C size for L0 DB
+ CheckL0DB(singleCoreStatus, baseK);
+}
+
+void MatmulTilingAlgorithm::AdjustMxL0Factors(SingleCoreStatus& singleCoreStatus) const
+{
+ // Determine whether the scenario is MX.
+ if (tilingIns_->madType_ != MatrixMadType::MXMODE) {
+ return;
+ }
+ if (!tilingIns_->aType_.hasSetScaleType) {
+ tilingIns_->aType_.scalePos = tilingIns_->aType_.pos;
+ tilingIns_->aType_.scaleType = tilingIns_->aType_.type;
+ tilingIns_->aType_.isScaleTrans = tilingIns_->aType_.isTrans;
+ }
+ if (!tilingIns_->bType_.hasSetScaleType) {
+ tilingIns_->bType_.scalePos = tilingIns_->bType_.pos;
+ tilingIns_->bType_.scaleType = tilingIns_->bType_.type;
+ tilingIns_->bType_.isScaleTrans = tilingIns_->bType_.isTrans;
+ }
+ // In the NZ scenario, ensure that the base size of the inner axis is 64-aligned downwards.
+ constexpr int32_t l0Factor = INT4_ALIGN_SIZE / C0_SIZE;
+ if (tilingIns_->aType_.type == CubeFormat::NZ && tilingIns_->aType_.isTrans) {
+ if (singleCoreStatus.l0Status.mL0 > l0Factor) {
+ singleCoreStatus.l0Status.mL0 = singleCoreStatus.l0Status.mL0 / l0Factor * l0Factor;
+ }
+ }
+ if (tilingIns_->bType_.type == CubeFormat::NZ && !tilingIns_->bType_.isTrans) {
+ if (singleCoreStatus.l0Status.nL0 > l0Factor) {
+ singleCoreStatus.l0Status.nL0 = singleCoreStatus.l0Status.nL0 / l0Factor * l0Factor;
+ }
+ }
+ // For FP8, baseK must be 64-element aligned
+ int32_t baseK =
+ singleCoreStatus.l0Status.kL0 * (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE);
+ if ((tilingIns_->aType_.dataType == DataType::DT_FLOAT8_E5M2 ||
+ tilingIns_->aType_.dataType == DataType::DT_FLOAT8_E4M3FN) &&
+ (tilingIns_->bType_.dataType == DataType::DT_FLOAT8_E5M2 ||
+ tilingIns_->bType_.dataType == DataType::DT_FLOAT8_E4M3FN)) {
+ baseK = baseK <= MX_BASEK_FACTOR ?
MX_BASEK_FACTOR : MathUtil::AlignDown(baseK, MX_BASEK_FACTOR);
+ singleCoreStatus.l0Status.kL0 =
+ baseK / (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE);
+ }
+ bool mL0NeedAlign = tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.isTrans &&
+ tilingIns_->aType_.scalePos == TPosition::TSCM;
+ if (mL0NeedAlign) {
+ singleCoreStatus.l0Status.mL0 = MathUtil::Align(singleCoreStatus.l0Status.mL0, L0_FACTOR_NUM_LIMIT);
+ }
+ bool nL0NeedAlign = tilingIns_->bType_.type == CubeFormat::ND && !tilingIns_->bType_.isTrans &&
+ tilingIns_->bType_.scalePos == TPosition::TSCM;
+ if (nL0NeedAlign) {
+ singleCoreStatus.l0Status.nL0 = MathUtil::Align(singleCoreStatus.l0Status.nL0, L0_FACTOR_NUM_LIMIT);
+ }
+ // check L0A/L0B/L0C size for L0 DB
+ CheckL0DB(singleCoreStatus, baseK);
+}
+
+void MatmulTilingAlgorithm::AdjustMxL1Factors(SingleCoreStatus& singleCoreStatus, const int32_t k0Size) const
+{
+ // determine whether the scenario is MX
+ if (tilingIns_->madType_ != MatrixMadType::MXMODE) {
+ return;
+ }
+ int32_t dataUsedL1Size = 0;
+ int32_t scaleUsedL1Size = 0;
+ int32_t biasUsedL1Size = 0;
+ GetMxUsedL1Size(singleCoreStatus, k0Size, dataUsedL1Size, scaleUsedL1Size, biasUsedL1Size);
+ // The existing tiling policy may cause the L1 usage to exceed the L1 size.
+ // Adjust the tiling policy to the basic one. That is, only baseM * baseK + baseN * baseK is cached in L1.
+ if (dataUsedL1Size + scaleUsedL1Size + biasUsedL1Size > tilingIns_->bufferPool_.l1Size) {
+ // Check whether the tiling is valid.
+ // If the tiling is invalid, the system uses the minimum tiling policy.
+ singleCoreStatus.l1Status.kAL1 = singleCoreStatus.l0Status.kL0;
+ singleCoreStatus.l1Status.kBL1 = singleCoreStatus.l0Status.kL0;
+ singleCoreStatus.l1Status.mAL1 = 1;
+ singleCoreStatus.l1Status.nBL1 = 1;
+ }
+}
+
+void MatmulTilingAlgorithm::GetMxScaleFactor(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, int32_t& mxTypePara) const
+{
+ // determine whether the scenario is MX
+ if (tilingIns_->madType_ != MatrixMadType::MXMODE) {
+ return;
+ }
+ int32_t dataUsedL1Size = 0;
+ int32_t scaleUsedL1Size = 0;
+ int32_t biasUsedL1Size = 0;
+ GetMxUsedL1Size(singleCoreStatus, k0Size, dataUsedL1Size, scaleUsedL1Size, biasUsedL1Size);
+
+ uint8_t scaleFactorA = 1;
+ uint8_t scaleFactorB = 1;
+ int32_t remainedL1Size = tilingIns_->bufferPool_.l1Size - (dataUsedL1Size + biasUsedL1Size);
+ int32_t singleCoreK = tilingIns_->tiling_.get_singleCoreK();
+ int32_t stepKa = MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0);
+ int32_t stepKb = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0);
+ int32_t baseK = singleCoreStatus.l0Status.kL0 * k0Size;
+ int32_t kStep = MathUtil::CeilDivision(singleCoreK, baseK);
+ uint8_t maxScaleFactorA = static_cast(MathUtil::CeilDivision(kStep, stepKa));
+ uint8_t maxScaleFactorB = static_cast(MathUtil::CeilDivision(kStep, stepKb));
+ int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE;
+ int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE;
+
+ // Only supported in the K direction; scale double buffering is the same as the data.
+ scaleFactorA = static_cast(remainedL1Size / MX_L1_BUFFER_NUM / (stepKa * baseM * baseK / SCALE_K_SIZE));
+ scaleFactorB = static_cast(remainedL1Size / MX_L1_BUFFER_NUM / (stepKb * baseN * baseK / SCALE_K_SIZE));
+ scaleFactorA = scaleFactorA > maxScaleFactorA ? maxScaleFactorA : scaleFactorA;
+ scaleFactorB = scaleFactorB > maxScaleFactorB ?
maxScaleFactorB : scaleFactorB; + + // scaleFactor is in range of [1, 127] + scaleFactorA = scaleFactorA >= static_cast(1) ? scaleFactorA : static_cast(1); + scaleFactorB = scaleFactorB >= static_cast(1) ? scaleFactorB : static_cast(1); + scaleFactorA = scaleFactorA <= SCALE_FACTOR_MAX_VALUE ? scaleFactorA : SCALE_FACTOR_MAX_VALUE; + scaleFactorB = scaleFactorB <= SCALE_FACTOR_MAX_VALUE ? scaleFactorB : SCALE_FACTOR_MAX_VALUE; + + // 8bit: 0~6bit:scaleFactor, 7bit(reserved):double buffer flag + scaleFactorA = scaleFactorA & static_cast(0x7f); + scaleFactorB = scaleFactorB & static_cast(0x7F); + mxTypePara = static_cast(static_cast(mxTypePara) | scaleFactorA); + mxTypePara = static_cast(static_cast(mxTypePara) | static_cast(scaleFactorB << 8U)); +} + +void MatmulTilingAlgorithm::PreprocessL0DB() +{ + dbL0A_ = g_tempCfg.l0aDB; + dbL0B_ = g_tempCfg.l0bDB; + dbL0C_ = g_tempCfg.l0cDB; + if (tilingIns_->baseM != -1) { + const int32_t baseLeftSize = tilingIns_->baseM * C0_BYTE_SIZE; + if (baseLeftSize > tilingIns_->bufferPool_.l0ASize / DB_ON) { + dbL0A_ = DB_OFF; + } + } + if (tilingIns_->baseN != -1) { + const int32_t baseRightSize = tilingIns_->baseN * C0_BYTE_SIZE; + if (baseRightSize > tilingIns_->bufferPool_.l0BSize / DB_ON) { + dbL0B_ = DB_OFF; + } + } + if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) { + const int32_t baseMatrixSize = tilingIns_->baseM * tilingIns_->baseN * C0_BYTE_SIZE; + if (baseMatrixSize > tilingIns_->bufferPool_.l0CSize / DB_ON) { + dbL0C_ = DB_OFF; + } + } + return; +} + +void MatmulTilingAlgorithm::SetDepthL1CacheUBParams(int32_t &a1LengthCache, int32_t &b1LengthCache) const +{ + if (!tilingIns_->enableL1CacheUB || + tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND310P) { + return; + } + int32_t a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + int32_t b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + a1LengthCache = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); + b1LengthCache = b1Length * tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); + int32_t freeL1Size = tilingIns_->bufferPool_.l1Size - tilingIns_->tiling_.get_depthA1() * a1Length - + tilingIns_->tiling_.get_depthB1() * b1Length; + if (freeL1Size <= 0) { + return; + } + const int32_t splitNum = 2; + int32_t aOrgShapeSize = tilingIns_->tiling_.get_singleCoreM() * tilingIns_->tiling_.get_singleCoreK(); + int32_t bOrgShapeSize = tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK(); + + if ((tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) && + (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM)) { + bool aFullLoad = false; + bool bFullLoad = false; + aFullLoad = aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size / splitNum; + bFullLoad = bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size / splitNum; + if (aFullLoad && bFullLoad) { + tilingIns_->tiling_.set_depthAL1CacheUB(1); + tilingIns_->tiling_.set_depthBL1CacheUB(1); + a1LengthCache = aOrgShapeSize; // update + b1LengthCache = bOrgShapeSize; + } else if (aFullLoad) { + tilingIns_->tiling_.set_depthAL1CacheUB(1); + a1LengthCache = aOrgShapeSize; + int32_t depthL1CacheUB = b1LengthCache > 0 ? 
(freeL1Size - aOrgShapeSize) / b1LengthCache : 0; + tilingIns_->tiling_.set_depthBL1CacheUB(depthL1CacheUB); + } else if (bFullLoad) { + tilingIns_->tiling_.set_depthBL1CacheUB(1); + b1LengthCache = bOrgShapeSize; + int32_t depthL1CacheUB = a1LengthCache > 0 ? (freeL1Size - bOrgShapeSize) / a1LengthCache : 0; + tilingIns_->tiling_.set_depthAL1CacheUB(depthL1CacheUB); + } else { + if (a1LengthCache > freeL1Size) { + int32_t depthBL1CacheUB = b1LengthCache > 0 ? freeL1Size / b1LengthCache : 0; + tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); + } else if (b1LengthCache > freeL1Size) { + int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / a1LengthCache : 0; + tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); + } else if (a1LengthCache <= freeL1Size / splitNum && b1LengthCache <= freeL1Size / splitNum) { + int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / splitNum / a1LengthCache : 0; + int32_t depthBL1CacheUB = b1LengthCache > 0 ? freeL1Size / splitNum / b1LengthCache : 0; + tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); + tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); + } else { + // can only cache one matrix + if (a1LengthCache <= b1LengthCache) { + tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); + } else { + tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); + } + } + } + } else if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) { + if (aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size) { + tilingIns_->tiling_.set_depthAL1CacheUB(1); + a1LengthCache = aOrgShapeSize; + } else if (a1LengthCache > 0) { + tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); + } + } else if (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM) { + if (bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size) { + tilingIns_->tiling_.set_depthBL1CacheUB(1); + b1LengthCache = bOrgShapeSize; + } else if (b1LengthCache > 0) { + tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); + } + } else { + return; + } +} + +int MatmulTilingAlgorithm::UpdateDepthB1(const SingleCoreStatus& singleCoreStatus) const +{ + int depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; + // only bType is f32 need update + if (tilingIns_->bType_.dataType != DataType::DT_FLOAT + || tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { + return depthB1; + } + uint16_t alignedBaseK = MathUtil::CeilDivision(tilingIns_->baseK, FP32_ALIGN_SIZE) * FP32_ALIGN_SIZE; + uint16_t alignedBaseKN = alignedBaseK * tilingIns_->baseN; + + uint16_t alignedBaseKM = tilingIns_->baseK * tilingIns_->baseM; + if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) { + alignedBaseKM = alignedBaseK * tilingIns_->baseM; + } + // if L1 size is overflow, decrease depthB1 + if ((tilingIns_->tiling_.get_depthA1() *alignedBaseKM + alignedBaseKN * depthB1) * sizeof(float) + > static_cast(tilingIns_->bufferPool_.l1Size)) { + depthB1 = tilingIns_->baseN * tilingIns_->baseK * depthB1 / alignedBaseKN; + depthB1 = depthB1 < 1 ? 1 : depthB1; + } + return depthB1; +} +int32_t MatmulTilingAlgorithm::GetSingleM() const +{ + return tilingIns_->singleM != -1 ? tilingIns_->singleM : tilingIns_->orgM; +} +int32_t MatmulTilingAlgorithm::GetSingleN() const +{ + return tilingIns_->singleN != -1 ? 
tilingIns_->singleN : tilingIns_->orgN; +} +int32_t MatmulTilingAlgorithm::GetSingleK() const +{ + return tilingIns_->singleK != -1 ? tilingIns_->singleK : tilingIns_->orgKa; +} +void MatmulTilingAlgorithm::GetSingleShape(const CoreStatusPack &coreStatus, const MatmulRunParas ¶m, + int32_t &singleCoreM, int32_t &singleCoreN, int32_t &singleCoreK) const +{ + singleCoreM = GetSingleM(); + singleCoreM = MathUtil::CeilDivision(singleCoreM, coreStatus.mDim); + singleCoreN = GetSingleN(); + singleCoreN = MathUtil::CeilDivision(singleCoreN, coreStatus.nDim); + singleCoreK = GetSingleK(); + singleCoreK = MathUtil::CeilDivision(singleCoreK, coreStatus.kDim); + if (singelBlockDim_) { + singleCoreM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM; + singleCoreN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN; + singleCoreK = tilingIns_->singleCoreK != -1 ? tilingIns_->singleCoreK : tilingIns_->singleK; + } + if (numOfBlock_ > 1) { + int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; + int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; + bool needAlign = GetMultiCoreScenario(param) == MultiCoreScenario::SPLIT_MN; + bool needOutputAlign = NeedOutputAlign(singleCoreM, singleCoreN, singleCoreK); + (void)AlignSingleShape(needAlign && (!tilingIns_->bType_.isTrans || needOutputAlign), param.n32 * C0_SIZE, coreStatus.nDim, + bAlignSize, singleCoreN); + (void)AlignSingleShape(needAlign && tilingIns_->aType_.isTrans, param.m32 * C0_SIZE, coreStatus.mDim, + aAlignSize, singleCoreM); + if (tilingIns_->enableSplitK_) { + if (tilingIns_->aType_.dataType == DataType::DT_FLOAT || + tilingIns_->bType_.dataType == DataType::DT_FLOAT) { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * FLOAT32_REDUCE_BLOCK_SIZE; + } else if ((tilingIns_->aType_.dataType == DataType::DT_INT8 || + tilingIns_->bType_.dataType == DataType::DT_INT8)) { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT8_REDUCE_BLOCK_SIZE; + } else if ((tilingIns_->aType_.dataType == DataType::DT_INT4 || + tilingIns_->bType_.dataType == DataType::DT_INT4)) { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT4_REDUCE_BLOCK_SIZE; + } else { + singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * REDUCE_BLOCK_SIZE; + } + } + } +} + +bool MatmulTilingAlgorithm::CheckSingleShape(int32_t singleCoreM, int32_t singleCoreN, int32_t singleCoreK) const +{ + (void)singleCoreM; + (void)singleCoreK; + if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { + // ub only can process with 32B aligned, if format is ND, and D non-aligned output can't pad + if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type == CubeFormat::ND && + (singleCoreN * DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType)) % C0_BYTE_SIZE != 0) { + TILING_LOG_INFO("for ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " + "aligned, matrix c not support ND format"); + return false; + } + } + return true; +} + +int64_t MatmulTilingAlgorithm::Process() +{ + PreprocessL0DB(); + if (!CheckBaseMN()) { + TILING_LOG_WARNING("check baseM/baseN not pass"); + return -1; + } + singelBlockDim_ = false; + splitCoreFlag_ = false; + CoreStatusPack coreStatus; + SingleCoreStatus singleCoreStatus; + MatmulRunParas param; + BlockDimCalculator 
blockDimRes; + FillParam(param); + + std::string opType = "MatMul"; + if (numOfBlock_ != 1) { + NonFactorMap(opType, param, blockDimRes); + if (DoMultiCoreSplitMNTiling(param, coreStatus, blockDimRes)) { + return 0; + } + GetBlockDim(opType, param, coreStatus, blockDimRes); + } else { + if (!g_tempCfg.factorSplit) { + coreStatus.m = param.m32; + coreStatus.k = param.k32; + coreStatus.n = param.n32; + } else { + coreStatus.m = MathUtil::FindBestSingleCore(param.m32, param.mMapped, 1, false); + coreStatus.k = MathUtil::FindBestSingleCore(param.k32, param.kMapped, 1, false); + coreStatus.n = MathUtil::FindBestSingleCore(param.n32, param.nMapped, 1, false); + } + coreStatus.batchDim = 1; + coreStatus.mDim = 1; + coreStatus.kDim = 1; + coreStatus.nDim = 1; + } + + if (numOfBlock_ != 1 && tilingIns_->bType_.pos == TPosition::TSCM) { + if (!splitCoreFlag_) { + TILING_LOG_WARNING("Multi core split B TSCM full loaded is not sucess."); + return 1; + } + } + // single-core logic + GetL0Factors(opType, param, coreStatus, singleCoreStatus); + AdjustSparseL0Factors(singleCoreStatus); + AdjustMxL0Factors(singleCoreStatus); + if (singleCoreStatus.l0Status.mL0 == 0 || singleCoreStatus.l0Status.nL0 == 0 || + singleCoreStatus.l0Status.kL0 == 0) { + TILING_LOG_WARNING("ml0/nl0/kl0 is zero"); + return -1; + } + GetL1Factors(opType, param, coreStatus, singleCoreStatus.l0Status, singleCoreStatus.l1Status); + if (UpdateTiling(param, coreStatus, singleCoreStatus) == -1L) { + return -1L; + } + const bool ans = CheckFinaleParams(coreStatus); + return ans ? 0 : -1; +} +} // namespace matmul_tiling \ No newline at end of file -- Gitee From c3869f1f1471ccfc760a7b39ef6b5dd0a27ee704 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 08:37:35 +0000 Subject: [PATCH 51/56] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20im?= =?UTF-8?q?pl/matmul/tiling/matmul=5Ftiling=5Falgorithm=5Fnew2.cpp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tiling/matmul_tiling_algorithm_new2.cpp | 1651 ----------------- 1 file changed, 1651 deletions(-) delete mode 100644 impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp diff --git a/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp b/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp deleted file mode 100644 index 8a81abcc..00000000 --- a/impl/matmul/tiling/matmul_tiling_algorithm_new2.cpp +++ /dev/null @@ -1,1651 +0,0 @@ -void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) const -{ - constexpr int blockSize = 32; - constexpr int bankLen = 512; - bool isBankConflict = false; - int bankConflictSize = 0; - if (isAMatrix) { - if (tilingIns_->aType_.isTrans) { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepM() * tilingIns_->tiling_.get_baseM(), C0_SIZE) * - blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKa() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - } else { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * - blockSize % bankLen == - 0 ? 
- true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseM() * C0_SIZE * tilingIns_->tiling_.get_stepM() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - } - } else { - if (tilingIns_->bType_.isTrans) { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_baseK(), C0_SIZE) * - blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseN() * C0_SIZE * tilingIns_->tiling_.get_stepN() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - } else { - isBankConflict = - MathUtil::CeilDivision(tilingIns_->tiling_.get_stepN() * tilingIns_->tiling_.get_baseN(), C0_SIZE) * - blockSize % bankLen == - 0 ? - true : - false; - bankConflictSize = tilingIns_->tiling_.get_baseK() * C0_SIZE * tilingIns_->tiling_.get_stepKb() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - } - } - if (isBankConflict) { - length = length + bankConflictSize; - } -} - -int32_t MatmulTilingAlgorithm::GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const -{ - int32_t a1Length = 0; - const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - if (IsUbNd2Nz()) { - // A matrix ND2NZ - if (tilingIns_->aType_.type == CubeFormat::ND) { - a1Length = l0Status.mL0 * C0_SIZE * l0Status.kL0 * reduceSize * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - a1Length = a1Length * MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l1Status.mAL1; - } - // bank conflict - GetBankConflictSize(l1Status, l0Status, a1Length, true); - } - } - return a1Length; -} - -int32_t MatmulTilingAlgorithm::GetBL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const -{ - int32_t b1Length = 0; - const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - if (IsUbNd2Nz()) { - // B matrix ND2NZ - if (tilingIns_->bType_.type == CubeFormat::ND) { - b1Length = l0Status.nL0 * C0_SIZE * l0Status.kL0 * reduceSize * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - b1Length = b1Length * MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l1Status.nBL1; - } - // bank conflict - GetBankConflictSize(l1Status, l0Status, b1Length, false); - } - } - return b1Length; -} - -bool MatmulTilingAlgorithm::IsUbNd2Nz() const -{ - if (tilingIns_->enVecND2NZ && tilingIns_->mmConfigType == 1 && - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - return true; - } - return false; -} - -void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const -{ - int32_t a1Length = 0; - int32_t b1Length = 0; - int32_t c1Length = 0; - int32_t biasLength = 0; - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - // A matrix ND2NZ - if (tilingIns_->aType_.type == CubeFormat::ND) { - a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - a1Length = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); - } - // bank conflict - GetBankConflictSize(a1Length, true); - } - // B matrix ND2NZ - if (tilingIns_->bType_.type == CubeFormat::ND - 
|| (DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8) && - tilingIns_->bType_.type == CubeFormat::NZ && tilingIns_->bType_.isTrans == false)) { - b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - if (tilingIns_->mmConfigType == 1) { - b1Length = b1Length * tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); - } - // bank conflict - GetBankConflictSize(b1Length, false); - } - // C matrix NZ2ND - if (tilingIns_->cType_.type == CubeFormat::ND || tilingIns_->cType_.pos == TPosition::GM) { - c1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseM() * - DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType); - } - // Bias - if (tilingIns_->isBias && tilingIns_->biasType_.pos != TPosition::VECCALC) { - biasLength = tilingIns_->tiling_.get_baseN() * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); - } - // quant tensor - if (DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) == DTYPE_BIT_TAB.at(DataType::DT_INT8)) { - int32_t quantLength = tilingIns_->tiling_.get_baseN() * sizeof(uint64_t); - biasLength = max(quantLength, biasLength); - } - } - - transLength = max(max(a1Length, b1Length), max(c1Length, biasLength)); -} - -bool MatmulTilingAlgorithm::CheckBaseMN() const -{ - // check bias table - if ((tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910B || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) && - tilingIns_->isBias && (tilingIns_->baseN > MAX_BIAS_N * C0_SIZE) && tilingIns_->isSupportL0c2Out) { - return false; - } - if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) { - return (tilingIns_->baseM * tilingIns_->baseN * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && - tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize && - tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize); - } - if (tilingIns_->baseM != -1) { - return (tilingIns_->baseM * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && - tilingIns_->baseM * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0ASize); - } - if (tilingIns_->baseN != -1) { - return (tilingIns_->baseN * C0_SIZE * FP32_BYTES <= tilingIns_->bufferPool_.l0CSize && - tilingIns_->baseN * C0_BYTE_SIZE <= tilingIns_->bufferPool_.l0BSize); - } - return true; -} - -int32_t MatmulTilingAlgorithm::GetIteratorOrder(const SingleCoreStatus& singleCoreStatus, const int32_t singleCoreM, - const int32_t singleCoreN, const int32_t singleCoreK) const -{ - if (tilingIns_->traverse_ != MatrixTraverse::NOSET) { - return static_cast(tilingIns_->traverse_) - 1; - } - const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - const bool fullkAL1Load = - (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kAL1 * reduceSize)) > 1.0 ? false : true; - bool fullkBL1Load = - (static_cast(singleCoreK) / (singleCoreStatus.l1Status.kBL1 * reduceSize)) > 1.0 ? 
false : true; - - // if KAL1 and KBL1 both can not be full loaded, then select m or n which is no matter - if (!fullkAL1Load && !fullkBL1Load) { - return static_cast(MatrixTraverse::FIRSTM) - 1; - } else if (fullkAL1Load && !fullkBL1Load) { // if KAL1 is full loaded, then select the order N fist - return static_cast(MatrixTraverse::FIRSTN) - 1; - } else if (!fullkAL1Load && fullkBL1Load) { // if KBL1 is full loaded, then select the order M fist - return static_cast(MatrixTraverse::FIRSTM) - 1; - } else { - // if AL1LoadSize less then BL1LoadSize, then select order N first, vice versa. - const int32_t mLoop = MathUtil::CeilDivision(singleCoreM, - singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l0Status.mL0 * C0_SIZE); - const int32_t nLoop = MathUtil::CeilDivision(singleCoreN, - singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l0Status.nL0 * C0_SIZE); - const int32_t aL1LoadSize = singleCoreM + singleCoreN * mLoop; - const int32_t bL1LoadSize = singleCoreN + singleCoreM * nLoop; - return aL1LoadSize < bL1LoadSize ? 1 : 0; - } -} - -void MatmulTilingAlgorithm::UpdateBlockDimCalculator(BlockDimCalculator& blockDimRes) const -{ - if (blockDimRes.totalLoadSize > blockDimRes.tmpLoadSize) { - blockDimRes.bmatSize = blockDimRes.tmpBmatSize; - blockDimRes.amatSize = blockDimRes.tmpAmatSize; - blockDimRes.totalLoadSize = blockDimRes.tmpLoadSize; - blockDimRes.tmpValue = 0; - } -} - -void MatmulTilingAlgorithm::CalcLoadSize(const DimFactor& blockDims, const CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes, const MatmulRunParas& params) const -{ - blockDimRes.totalLoadSize = INT_MAX; - // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) - const int32_t totalSize = blockDimRes.amatSize + blockDimRes.bmatSize; // batch==1 - constexpr int32_t minMNSize = 16; - constexpr int32_t minKSize = 64; - constexpr int32_t minTotalSize = 128; - const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 - const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); - const int32_t k0 = (m0 != 0 && n0 != 0) ? 
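    // m0/n0/k0 form a minimal probe tile (bounded so m0 * n0 <= minTotalSize and
    // m0 * k0, n0 * k0 <= minKSize); the branches below only use it to decide which of the
    // numbered full/partial L1-load cases still fits in bufferPool_.l1Size. They do not fix
    // the final L0 tiling, which GetL0Factors() settles later.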
- min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; - const int32_t dbBuffer = 2; - - // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) - // loadsize = K*(N*mdim+M*ndim) - const bool bothFullLoad = static_cast(totalSize) * static_cast(blockDimRes.kBytes) <= - static_cast(tilingIns_->bufferPool_.l1Size); - const bool afulloadPlsBKFullLoad = - static_cast(blockDimRes.amatSize + n0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= - static_cast(tilingIns_->bufferPool_.l1Size); - const bool bfulloadPlsaKFullLoad = - static_cast(blockDimRes.bmatSize + m0 * dbBuffer) * static_cast(blockDimRes.kBytes) <= - static_cast(tilingIns_->bufferPool_.l1Size); - if (afulloadPlsBKFullLoad || bfulloadPlsaKFullLoad || bothFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - return; - } - - // A kdim not fullload + B kdim not fullload(9) - // loadsize = M*K*N*(1/m0+1/n0) - const bool aKNotfulloadPlsbKNotFullLoad = - (n0 * blockDimRes.kBytes + m0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > - tilingIns_->bufferPool_.l1Size && - (m0 * blockDimRes.kBytes + n0 * k0 * C0_SIZE * C0_BYTE_SIZE) * dbBuffer > - tilingIns_->bufferPool_.l1Size; - if (aKNotfulloadPlsbKNotFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - return; - } - - // A kdim fullload + B kdim fullload(5) - // M*K*(ndim+N/m1) or N*K*(mdim+M/n1) - const bool aKfulloadPlsbKFullLoad = (m0 + n0) * blockDimRes.kBytes * dbBuffer <= tilingIns_->bufferPool_.l1Size; - if (aKfulloadPlsbKFullLoad) { - const int32_t m1 = MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - n0 * - blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * m0)) * m0; - const int32_t n1 = MathUtil::CeilDivision((tilingIns_->bufferPool_.l1Size - m0 * - blockDimRes.kBytes * dbBuffer), (blockDimRes.kBytes * dbBuffer * n0)) * n0; - const int32_t mfirstLoad = - blockDimRes.oriAmatSize * blockDims.n + blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); - int32_t nfirstLoad = - blockDimRes.oriBmatSize * blockDims.m + blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); - if (mfirstLoad < nfirstLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m1); - } else { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n1); - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * blockDims.m; - } - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - return; - } - - // A fullload + B Kdim not fullload or A K fullload + B Kdim not fullload(3/6) - // mdim = coreNum; ndim = 1; - // loadsize = M*K*(ndim+N/m0) - const bool afulloadPlsbKNotFullLoad = (blockDimRes.amatSize * blockDimRes.kBytes + - n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; - const bool aKfulloadPlsbKNotFullLoad = (m0 * blockDimRes.kBytes * dbBuffer + - n0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= 
tilingIns_->bufferPool_.l1Size; - if (afulloadPlsbKNotFullLoad || aKfulloadPlsbKNotFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriAmatSize * blockDims.n; - blockDimRes.tmpBmatSize = blockDimRes.oriBmatSize * MathUtil::CeilDivision(params.m32, m0); - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - } - - // A kdim not fullload + B fullload or A kdim not fullload + B kdim fullload(7/8) - // loadsize = N*K*(mdim+M/n0) - const bool aKNotfulloadPlsbFullLoad = (blockDimRes.bmatSize * blockDimRes.kBytes + - m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; - const bool aKNotfulloadPlsbKFullLoad = (n0 * blockDimRes.kBytes * dbBuffer + - m0 * k0 * C0_SIZE * C0_BYTE_SIZE * dbBuffer) <= tilingIns_->bufferPool_.l1Size; - if (aKNotfulloadPlsbFullLoad || aKNotfulloadPlsbKFullLoad) { - blockDimRes.tmpAmatSize = blockDimRes.oriBmatSize * blockDims.m; - blockDimRes.tmpBmatSize = blockDimRes.oriAmatSize * MathUtil::CeilDivision(params.n32, n0); - blockDimRes.tmpLoadSize = blockDimRes.tmpAmatSize + blockDimRes.tmpBmatSize; - UpdateBlockDimCalculator(blockDimRes); - } -} - -int32_t MatmulTilingAlgorithm::LoopNumFromSingleCoreToL0(const CoreStatusPack& coreStatus, - const DimFactor& blockDimsFactor) const -{ - if (!blockDimsFactor.IsValid()) { - return 0; - } - constexpr int32_t minTotalSize = 128; - constexpr int32_t minSize = 64; - constexpr int32_t minN0Size = 16; - int32_t n0 = min(min(minN0Size, coreStatus.n), minSize); - int32_t m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); - n0 = (m0 == 0) ? 0 : min(min(coreStatus.n, minTotalSize / m0), minSize); - m0 = (n0 == 0) ? 0 : min(min(coreStatus.m, minTotalSize / n0), minSize); - const int32_t k0 = (m0 != 0 && n0 != 0) ? - min(min(minSize / m0, minSize / n0), coreStatus.k) : coreStatus.k; - const int32_t loopNum = MathUtil::CeilDivision(coreStatus.m, m0) * MathUtil::CeilDivision(coreStatus.n, n0) * - MathUtil::CeilDivision(coreStatus.k, k0); - return loopNum; -} - -int32_t MatmulTilingAlgorithm::GetBigPackageCondition(const CoreStatusPack &coreStatus, - const BlockDimCalculator &blockDimRes, const MatmulRunParas ¶ms) const -{ - if (tilingIns_->bType_.isTrans == true && tilingIns_->aType_.isTrans == false) { - return ATTACH_FLAG_ZERO; - } - const int minSize = 16; - bool flag = true; - if (tilingIns_->bType_.isTrans == false) { - if (params.n32 >= minSize && coreStatus.n < minSize) { - flag = false; - } - } - if (tilingIns_->aType_.isTrans) { - if (params.m32 >= minSize && coreStatus.m < minSize) { - flag = false; - } - } - - if (!blockDimRes.bigPackage && !flag) { - return ATTACH_FLAG_ZERO; - } else if (!blockDimRes.bigPackage && flag) { - return ATTACH_FLAG_TWO; - } else if (blockDimRes.bigPackage && !flag) { - return ATTACH_FLAG_ONE; - } else { - return ATTACH_FLAG_ZERO; - } -} - -void MatmulTilingAlgorithm::GetBlockDimHelper(const DimFactor& blockDim, CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes, const MatmulRunParas& params) -{ - blockDimRes.kNum = (blockDim.k == 0) ? 0 : (params.k32 / blockDim.k * C0_SIZE * REDUCE_BLOCK_SIZE); // contain k * 16 - blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 - coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDim.batch); - coreStatus.m = MathUtil::CeilDivision(params.m32, blockDim.m); - coreStatus.n = MathUtil::CeilDivision(params.n32, blockDim.n); - coreStatus.k = (blockDim.k == 0) ? 
0 : (params.k32 / blockDim.k); - if (tilingIns_->enableSplitK_) { - if (params.kMapped != params.k32) { // need check--splitK - blockDimRes.kNum = params.kMapped / blockDim.k * NUM_TWO * C0_SIZE * REDUCE_BLOCK_SIZE; - coreStatus.k = params.kMapped / blockDim.k * NUM_TWO; - } - } - - // load size of A matrix is batch * m - // load size of B matrix is n - blockDimRes.oriAmatSize = params.batch32 * params.m32; - blockDimRes.oriBmatSize = params.oriShapeBbatch > 1 ? params.batch32 * params.n32 : params.n32; - blockDimRes.amatSize = coreStatus.batch * coreStatus.m; - blockDimRes.bmatSize = params.oriShapeBbatch > 1 ? coreStatus.batch * coreStatus.n : coreStatus.n; - blockDimRes.tmpValue = 0; - CalcLoadSize(blockDim, coreStatus, blockDimRes, params); - if (tilingIns_->enableSplitK_) { - blockDimRes.totalLoadSize *= coreStatus.k; - } - - // updateSolution: bool whether update to a new block factor solution - // has smaller LoadSize or the same LoadSize but batch - const int bigpackageFlag = GetBigPackageCondition(coreStatus, blockDimRes, params); - const bool updateConditionBp = bigpackageFlag == 0 ? false : true; - bool updateConditionBp2 = bigpackageFlag == 2 ? true : false; - bool updateConditionBp3 = bigpackageFlag == 1 ? false : true; - - const int32_t loopNum = LoopNumFromSingleCoreToL0(coreStatus, blockDim); - const bool updateConditionCoreUsed = (!updateConditionBp) && ((loopNum < blockDimRes.loopNumToL0) || - (blockDim.ReduceMul() > blockDimRes.coreUse && loopNum == blockDimRes.loopNumToL0)); - const bool updateConditionLoadsize = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse) && - blockDimRes.totalLoadSize < blockDimRes.minLoadSize; - const int32_t orgBatchM = params.oriShapeAbatch > 1 ? blockDimRes.batchDimFactor : blockDimRes.mDimFactor; - const int32_t curBatchM = params.oriShapeAbatch > 1 ? blockDim.batch : blockDim.m; - const bool updateConditionBatchNDim = (!updateConditionCoreUsed && blockDim.ReduceMul() == blockDimRes.coreUse && - blockDimRes.totalLoadSize == blockDimRes.minLoadSize) && - ((blockDimRes.nDimFactor * orgBatchM < curBatchM * blockDim.n) || - (blockDimRes.nDimFactor * orgBatchM == curBatchM * blockDim.n && - blockDimRes.batchDimFactor < blockDim.batch)); - - const bool policyCondition = - UserPolicy(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, - coreStatus, blockDimRes); - if ((updateConditionBp2 || updateConditionCoreUsed || updateConditionLoadsize || updateConditionBatchNDim) && - policyCondition && updateConditionBp3) { - blockDimRes.minLoadSize = blockDimRes.totalLoadSize; - blockDimRes.nDimFactor = blockDim.n; - blockDimRes.batchDimFactor = blockDim.batch; - blockDimRes.mDimFactor = blockDim.m; - blockDimRes.kDimFactor = blockDim.k; - blockDimRes.coreUse = blockDim.ReduceMul(); - blockDimRes.loopNumToL0 = loopNum; - blockDimRes.finalValue = blockDimRes.tmpValue; - const int32_t minSize = 16; - blockDimRes.bigPackage = (!tilingIns_->bType_.isTrans ? coreStatus.n >= minSize : true) && - (tilingIns_->aType_.isTrans ? 
coreStatus.m >= minSize : true) && (blockDim.n * blockDim.m * blockDim.k > 1); - splitCoreFlag_ = true; - } -} - -bool MatmulTilingAlgorithm::UserPolicy(const TilingPolicy policy, const CoreStatusPack& coreStatus, - const BlockDimCalculator& blockDimRes) const -{ - constexpr int32_t minMNSize = 16; - constexpr int32_t minKSize = 64; - constexpr int32_t minTotalSize = 128; - const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 - const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); - const int32_t k0 = (m0 != 0 && n0 != 0) ? min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; - - if (policy == TilingPolicy::FIXED_B_TSCM) { - const int32_t alignFactor = MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE); - if (coreStatus.n < alignFactor) { - return false; - } - const int32_t alignNLength = MathUtil::Align(coreStatus.n, alignFactor); - const int32_t bMatrixSize = alignNLength * blockDimRes.kBytes * 2; - int32_t aMatrixSize = m0 * k0 * C0_SIZE * C0_BYTE_SIZE; - int32_t biasSize = 0; - if (tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { - biasSize = alignNLength * C0_SIZE * DTYPE_BYTE_TAB.at(tilingIns_->biasType_.dataType); - } - if (bMatrixSize + aMatrixSize + biasSize <= tilingIns_->bufferPool_.l1Size) { - return true; - } else { - return false; - } - } else if (policy == TilingPolicy::FIXED_A_TSCM) { - return false; - } else if (policy == TilingPolicy::FIXED_A_B_TSCM) { - return false; - } else { - return true; - } -} - -bool MatmulTilingAlgorithm::PreProcessMiniShape(const std::string& opType, CoreStatusPack& coreStatus, - MatmulRunParas& params, const int32_t& coreNum, bool splitKFlag) const -{ - (void)(opType); - // experience value for mini shape - const int32_t miniL0cThreshold = tilingIns_->bufferPool_.l0CSize / MIN_FRACTAL_SIZE / FP32_BYTES; - const int32_t miniL0abThreshold = tilingIns_->bufferPool_.l0ASize / (C0_SIZE * C0_BYTE_SIZE); - // tend to use less cores for shapes with batch less than coreNum and m/k/n can full load in - // aicore buffers split_k is conflict with m/n shift_inwards - bool specialScenario = false; - if (params.n32 > MIN_MTE1_LOAD) { - specialScenario = specialScenario || - (splitKFlag && ((static_cast(params.nMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); - } - if (params.m32 > MIN_MTE1_LOAD) { - specialScenario = specialScenario || - (splitKFlag && ((static_cast(params.mMapped) & static_cast(MIN_MTE1_LOAD - 1)) != 0)); - } - - if (params.batch32 * params.n32 * params.m32 <= coreNum && params.m32 * params.k32 <= miniL0abThreshold && - params.n32 * params.k32 <= miniL0abThreshold && params.m32 * params.n32 <= miniL0cThreshold && - !specialScenario) { - coreStatus.batchDim = params.batch32; - coreStatus.nDim = params.n32 <= MIN_MTE1_LOAD ? 1 : params.nMapped / MIN_MTE1_LOAD; - coreStatus.mDim = params.m32 <= MIN_MTE1_LOAD ? 1 : params.mMapped / MIN_MTE1_LOAD; - int32_t kDimCandidate[2] = {0}; // storage 2 factors of k around kDim - GetTwoFactors(kDimCandidate, coreStatus.kDim, params.k32, coreNum); - coreStatus.kDim = (params.k32 <= MIN_MTE1_LOAD || !splitKFlag) ? - 1 : - (kDimCandidate[1] > 1 ? kDimCandidate[1] : kDimCandidate[0]); - coreStatus.batch = 1; - coreStatus.n = coreStatus.nDim == 1 ? params.n32 : MathUtil::CeilDivision(params.nMapped, coreStatus.nDim); - coreStatus.m = coreStatus.mDim == 1 ? params.m32 : MathUtil::CeilDivision(params.mMapped, coreStatus.mDim); - coreStatus.k = coreStatus.kDim == 1 ? 
params.k32 : MathUtil::CeilDivision(params.kMapped, coreStatus.kDim); - params.nonFactorK = (coreStatus.kDim == 0) ? false : (params.k32 % coreStatus.kDim == 0 ? false : true); - return true; - } - return false; -} -float MatmulTilingAlgorithm::CalculateBlockCycles(int32_t baseM, int32_t baseN, int32_t baseK) const -{ - const int32_t reduceBlockSize = C0_BYTE_SIZE * BITS_PER_BYTE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - return static_cast(baseM * baseN * baseK) / (C0_SIZE * C0_SIZE * reduceBlockSize); -} - -int32_t MatmulTilingAlgorithm::CalculateMemoryTraffic(int32_t baseM, int32_t baseN, int32_t baseK) const -{ - int32_t aMatrixSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - int32_t bMatrixSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - return aMatrixSize + bMatrixSize; -} - -bool MatmulTilingAlgorithm::AlignSingleShape(bool needAlign, int32_t orgShape, int32_t factor, int32_t alignSize, - int32_t &singleShape) const -{ - singleShape = MathUtil::CeilDivision(orgShape, factor); - if (!needAlign || alignSize == 0 || orgShape % alignSize != 0) { - return true; // orgShape not align, don't need to adjust - } - if (factor <= 1) { - return true; - } - int32_t maxSingleShape = MathUtil::CeilDivision(orgShape, factor - 1); - int32_t alignSingleShape = MathUtil::Align(singleShape, alignSize); - if (alignSingleShape >= maxSingleShape) { - return false; - } - singleShape = alignSingleShape; - return true; -} - -ComputeBaseBlock MatmulTilingAlgorithm::GetMultiCoreBasicBlock(const MatmulRunParas& params) const -{ - (void)params; - constexpr static int32_t l0c256KB = 262144; - constexpr static int32_t basicSize128 = 128; - constexpr static int32_t basicSize256 = 256; - int32_t basicM = basicSize128; - if (tilingIns_->bufferPool_.l0CSize == l0c256KB) { - basicM = basicSize256; - } - int32_t basicN = basicSize256; - int32_t aDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) != 0 ? - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) : 1; - int32_t basicK = basicSize128 * BITS_PER_BYTE / aDtypeSize; - ComputeBaseBlock basicBlock {basicM, basicN, basicK}; - // SetFixSplit - if (tilingIns_->baseM != -1) { - basicBlock.baseM = tilingIns_->baseM; - } - if (tilingIns_->baseN != -1) { - basicBlock.baseN = tilingIns_->baseN; - } - if (!tilingIns_->aType_.isTrans && !tilingIns_->bType_.isTrans) { - return basicBlock; - } - if (tilingIns_->aType_.isTrans && tilingIns_->bType_.isTrans) { - basicBlock.baseM = tilingIns_->baseM != -1 ? basicBlock.baseM : basicSize256; - basicBlock.baseN = tilingIns_->baseN != -1 ? 
basicBlock.baseN : basicSize128; - return basicBlock; - } - - return basicBlock; -} - -float MatmulTilingAlgorithm::CalcBaseBlockBandRatio(int32_t mDim, int32_t nDim, const ComputeBaseBlock &baseBlock) const -{ - float bandRatio = static_cast((numOfBlock_ - mDim) * baseBlock.baseM + (numOfBlock_ - nDim) * baseBlock.baseN) / - static_cast((baseBlock.baseM + baseBlock.baseN) * numOfBlock_); - return bandRatio; -} - -ComputeIntensity MatmulTilingAlgorithm::CalcComputeIntensity(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, - const std::pair &factor) const -{ - auto mFactor = factor.first; - auto nFactor = factor.second; - int32_t sm = 0; - int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - bool alignSuccA = AlignSingleShape(tilingIns_->aType_.isTrans, params.oriShapeM, mFactor, aAlignSize, sm); - int32_t sn = 0; - int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; - bool alignSuccB = AlignSingleShape(!tilingIns_->bType_.isTrans, params.oriShapeN, nFactor, bAlignSize, sn); - auto shapeM = MathUtil::DivideIntoMainAndTail(sm, baseBlock.baseM); - auto shapeN = MathUtil::DivideIntoMainAndTail(sn, baseBlock.baseN); - auto mainM = shapeM.first; - auto tailM = shapeM.second; - auto mainN = shapeN.first; - auto tailN = shapeN.second; - int32_t memoryRatio = (alignSuccA && alignSuccB) ? 1 : 2; - float bandRatio = CalcBaseBlockBandRatio(mFactor, nFactor, baseBlock); - std::vector blocks; - // Main Chunk - if (mainM > 0 && mainN > 0) { - int count = mainM * mainN; - float cycles = CalculateBlockCycles(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; - int32_t memory = memoryRatio * - CalculateMemoryTraffic(baseBlock.baseM, baseBlock.baseN, baseBlock.baseK) * count; - blocks.push_back({count, cycles, memory}); - } - // N Tail Chunk - if (mainM > 0 && tailN > 0) { - float cycles = CalculateBlockCycles(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; - int32_t memory = memoryRatio * CalculateMemoryTraffic(baseBlock.baseM, tailN, baseBlock.baseK) * mainM; - blocks.push_back({mainM, cycles, memory}); - } - // M Tail Chunk - if (tailM > 0 && mainN > 0) { - float cycles = CalculateBlockCycles(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; - int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, baseBlock.baseN, baseBlock.baseK) * mainN; - blocks.push_back({mainN, cycles, memory}); - } - // M and N Tail Chunk - if (tailM > 0 && tailN > 0) { - float cycles = CalculateBlockCycles(tailM, tailN, baseBlock.baseK); - int32_t memory = memoryRatio * CalculateMemoryTraffic(tailM, tailN, baseBlock.baseK); - blocks.push_back({1, cycles, memory}); - } - float totalCycles = 0; - int32_t totalMemory = 0; - for (const auto& v : blocks) { - totalCycles += v.computeCycle; - totalMemory += v.memoryTraffic; - } - return { - {mFactor, nFactor}, totalCycles, (totalMemory != 0) ? 
totalCycles / totalMemory : 0, bandRatio}; -} - -MultiCoreScenario MatmulTilingAlgorithm::GetMultiCoreScenario(const MatmulRunParas& params) const -{ - if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { - return MultiCoreScenario::OTHERS; - } - if (tilingIns_->enableSplitK_ || tilingIns_->singleM != -1 || tilingIns_->singleN != -1) { - return MultiCoreScenario::OTHERS; - } - constexpr int64_t mnLimit = 26214; // 128 * 256 * 0.8 - constexpr int64_t mLimit = 128; - if (params.oriShapeM >= mLimit && params.oriShapeM * params.oriShapeN > mnLimit * numOfBlock_) { - return MultiCoreScenario::SPLIT_MN; - } - return MultiCoreScenario::OTHERS; -} - -void MatmulTilingAlgorithm::UpdateStepK(const ComputeBaseBlock &baseBlock, int32_t &stepK) const -{ - if (stepK * baseBlock.baseK >= GetSingleK()) { - return; - } - constexpr static int32_t baseBlockSize512 = 512; - constexpr static int32_t baseBlockSize256 = 256; - int32_t aTypeBitSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize512) { - if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0) && - (baseBlockSize512 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { - while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize512 != 0 && stepK > 1) { - stepK--; - } - } - } else if (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE > baseBlockSize256) { - if ((stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0) && - (baseBlockSize256 % (baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE) == 0)) { - while (stepK * baseBlock.baseK * aTypeBitSize / BITS_PER_BYTE % baseBlockSize256 != 0 && stepK > 1) { - stepK--; - } - } - } -} - -void MatmulTilingAlgorithm::CalcL1Tiling(const ComputeBaseBlock &baseBlock, int32_t &depthA1, int32_t &depthB1, - int32_t &stepKa, int32_t &stepKb) -{ - int32_t l1Size = tilingIns_->bufferPool_.l1Size; - constexpr static int32_t reservedL1Size = 256; // l1 reserved 256B - int32_t depthA1Size = (l1Size / DB_ON / baseBlock.baseM / baseBlock.baseK) * BITS_PER_BYTE / - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - int32_t depthB1Size = ((l1Size + reservedL1Size) / DB_ON / baseBlock.baseN / baseBlock.baseK) * BITS_PER_BYTE / - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); - int32_t btSize = tilingIns_->isBias ? tilingIns_->bufferPool_.btSize / BITS_PER_BYTE : 0; - if (depthA1Size + depthB1Size > l1Size - btSize) { - if (baseBlock.baseM <= baseBlock.baseN) { - depthA1Size = depthA1Size / DB_ON; - } else { - depthB1Size = depthB1Size / DB_ON; - } - } - int32_t l1Db = g_tempCfg.l1DB == DB_OFF ? 
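    // l1Db selects single or double buffering for L1; each depth is then split across the
    // buffer halves (depthA1 = stepKa * l1Db and depthB1 = stepKb * l1Db below), and the
    // mutual rounding that follows keeps the larger of stepKa/stepKb an exact multiple of the
    // smaller one, presumably so the A-side and B-side K steps stay aligned.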
DB_OFF : DB_ON; - stepKa = depthA1Size / l1Db; - stepKb = depthB1Size / l1Db; - UpdateStepK(baseBlock, stepKa); - UpdateStepK(baseBlock, stepKb); - if (stepKa >= stepKb && stepKb != 0) { - stepKa = stepKa / stepKb * stepKb; - } else if (stepKa != 0) { - stepKb = stepKb / stepKa * stepKa; - } - depthA1 = stepKa * l1Db; - depthB1 = stepKb * l1Db; -} - -L0StatusPack MatmulTilingAlgorithm::GetL0CoreStatus(const ComputeBaseBlock &baseBlock) const -{ - L0StatusPack l0Status; - const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - l0Status.dbL0C = g_tempCfg.l0cDB; - if (baseBlock.baseM * baseBlock.baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { - l0Status.dbL0C = DB_OFF; - } - l0Status.dbL0A = DB_ON; - l0Status.dbL0B = DB_ON; - l0Status.mL0 = baseBlock.baseM / C0_SIZE; - l0Status.kL0 = baseBlock.baseK / reduceSize; - l0Status.nL0 = baseBlock.baseN / C0_SIZE; - return l0Status; -} - -L1StatusPack MatmulTilingAlgorithm::GetL1CoreStatus(const ComputeBaseBlock &baseBlock, int32_t depthA1, int32_t depthB1, - int32_t stepKa, int32_t stepKb) const -{ - L1StatusPack l1Status; - l1Status.mAL1 = 1; - l1Status.nBL1 = 1; - const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - l1Status.kAL1 = baseBlock.baseK / reduceSize * stepKa; - l1Status.kBL1 = baseBlock.baseK / reduceSize * stepKb; - l1Status.dbAL1 = depthA1 >= stepKa * DB_ON ? DB_ON : DB_OFF; - l1Status.dbBL1 = depthB1 >= stepKb * DB_ON ? DB_ON : DB_OFF; - return l1Status; -} - -void MatmulTilingAlgorithm::UpdateShapeAndLayout() const -{ - tilingIns_->tiling_.set_M(tilingIns_->orgM); - tilingIns_->tiling_.set_N(tilingIns_->orgN); - tilingIns_->tiling_.set_Ka(tilingIns_->orgKa); - tilingIns_->tiling_.set_Kb(tilingIns_->orgKb); - tilingIns_->tiling_.set_batchM(tilingIns_->batchM); - tilingIns_->tiling_.set_batchN(tilingIns_->batchN); - tilingIns_->tiling_.set_singleBatchM(tilingIns_->singleBatchM); - tilingIns_->tiling_.set_singleBatchN(tilingIns_->singleBatchN); - - tilingIns_->tiling_.set_ALayoutInfoB(tilingIns_->aLayoutInfoB); - tilingIns_->tiling_.set_ALayoutInfoS(tilingIns_->aLayoutInfoS); - tilingIns_->tiling_.set_ALayoutInfoN(tilingIns_->aLayoutInfoN); - tilingIns_->tiling_.set_ALayoutInfoG(tilingIns_->aLayoutInfoG); - tilingIns_->tiling_.set_ALayoutInfoD(tilingIns_->aLayoutInfoD); - - tilingIns_->tiling_.set_BLayoutInfoB(tilingIns_->bLayoutInfoB); - tilingIns_->tiling_.set_BLayoutInfoS(tilingIns_->bLayoutInfoS); - tilingIns_->tiling_.set_BLayoutInfoN(tilingIns_->bLayoutInfoN); - tilingIns_->tiling_.set_BLayoutInfoG(tilingIns_->bLayoutInfoG); - tilingIns_->tiling_.set_BLayoutInfoD(tilingIns_->bLayoutInfoD); - - tilingIns_->tiling_.set_CLayoutInfoB(tilingIns_->cLayoutInfoB); - tilingIns_->tiling_.set_CLayoutInfoS1(tilingIns_->cLayoutInfoS1); - tilingIns_->tiling_.set_CLayoutInfoN(tilingIns_->cLayoutInfoN); - tilingIns_->tiling_.set_CLayoutInfoG(tilingIns_->cLayoutInfoG); - tilingIns_->tiling_.set_CLayoutInfoS2(tilingIns_->cLayoutInfoS2); - tilingIns_->tiling_.set_BatchNum(tilingIns_->batchNum); - return; -} - -void MatmulTilingAlgorithm::UpdateUsedSize() const -{ - int32_t transLength = 0; - GetTransLength(transLength); - int32_t a1LengthCache = 0; - int32_t b1LengthCache = 0; - SetDepthL1CacheUBParams(a1LengthCache, b1LengthCache); - tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse in ub - tilingIns_->tiling_.set_shareMode(0); - int32_t l1Size = 0; - int32_t l0cSize = 0; - int32_t ubSize = 0; - GetUsedSize(l1Size, l0cSize, 
ubSize, a1LengthCache, b1LengthCache); - tilingIns_->tiling_.set_shareL1Size(l1Size); - tilingIns_->tiling_.set_shareL0CSize(l0cSize); - tilingIns_->tiling_.set_shareUbSize(ubSize); -} - -int64_t MatmulTilingAlgorithm::AdjustOuterProductL0Factor(const SingleCoreStatus& singleCoreStatus) const -{ - if (tilingIns_->scheduleType != ScheduleType::OUTER_PRODUCT) { - return 0; - } - // check whether OUTER_PRODUCT is supported - if ((tilingIns_->tiling_.get_baseK() < tilingIns_->tiling_.get_singleCoreK()) && - ((tilingIns_->mmConfigType == 1) || ((tilingIns_->mmConfigType == 0) && - (tilingIns_->batchNum != 0)))) { - TILING_LOG_WARNING("Unsupported scheduleType is OUTER_PRODUCT"); - return -1L; - } - int32_t newBaseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; - int32_t newBaseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; - // when scheduleType is OUTER_PRODUCT, each iteration computes 2 * basicBlock size of data - bool isL0CFullUsed = (newBaseM * newBaseN * NUM_TWO * - static_cast(DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType))) > - static_cast(tilingIns_->bufferPool_.l0CSize) ? true : false; - if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 0)) { - // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_M, N db in L0 - newBaseN = MathUtil::Align(newBaseN / NUM_TWO, C0_SIZE); - } else if (isL0CFullUsed && (tilingIns_->tiling_.get_iterateOrder() == 1)) { - // when scheduleType is OUTER_PRODUCT and iterateOrder is ORDER_N, M db in L0 - newBaseM = MathUtil::Align(newBaseM / NUM_TWO, C0_SIZE); - } - tilingIns_->tiling_.set_baseM(newBaseM); - tilingIns_->tiling_.set_baseN(newBaseN); - return 0; -} - -void MatmulTilingAlgorithm::AdjustFloatL1Factor(const SingleCoreStatus& singleCoreStatus) const -{ - if (DTYPE_BYTE_TAB.at(tilingIns_->bType_.dataType) == DTYPE_BYTE_TAB.at(DataType::DT_FLOAT)) { - if (tilingIns_->tiling_.get_baseK() == DT_FLOAT_INVALID_BASEK) { - tilingIns_->tiling_.set_stepKb(1); - tilingIns_->tiling_.set_depthB1(singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1); - } - } -} - -int64_t MatmulTilingAlgorithm::UpdateTiling(const MatmulRunParas& param, const CoreStatusPack &coreStatus, SingleCoreStatus& singleCoreStatus) -{ - int32_t coreUse = singelBlockDim_ ? 
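    // singelBlockDim_ is set by FillParam() when the caller pinned singleCoreM/N/K directly;
    // numOfBlock_ is then forced to 1 and the caller-supplied blockDim is reported back as
    // usedCoreNum unchanged, instead of the batchDim * mDim * kDim * nDim product.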
tilingIns_->blockDim : - coreStatus.batchDim * coreStatus.mDim * coreStatus.kDim * coreStatus.nDim; - int32_t singleCoreM; - int32_t singleCoreN; - int32_t singleCoreK; - GetSingleShape(coreStatus, param, singleCoreM, singleCoreN, singleCoreK); - if (!CheckSingleShape(singleCoreM, singleCoreN, singleCoreK)) { - return -1L; - } - tilingIns_->tiling_.set_usedCoreNum(coreUse); - tilingIns_->tiling_.set_singleCoreM(singleCoreM); - tilingIns_->tiling_.set_singleCoreN(singleCoreN); - tilingIns_->tiling_.set_singleCoreK(singleCoreK); - UpdateShapeAndLayout(); - tilingIns_->tiling_.set_baseM(singleCoreStatus.l0Status.mL0 * C0_SIZE); - tilingIns_->tiling_.set_baseN(singleCoreStatus.l0Status.nL0 * C0_SIZE); - const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); - tilingIns_->tiling_.set_iterateOrder(GetIteratorOrder(singleCoreStatus, singleCoreM, singleCoreN, singleCoreK)); - // check whether OUTER_PRODUCT is supported - if (AdjustOuterProductL0Factor(singleCoreStatus) != 0) { - return -1L; - } - tilingIns_->baseM = tilingIns_->tiling_.get_baseM(); - tilingIns_->baseN = tilingIns_->tiling_.get_baseN(); - tilingIns_->baseK = tilingIns_->tiling_.get_baseK(); - AdjustMxL1Factors(singleCoreStatus, reduceSize); - int32_t mxTypePara = 0; - GetMxScaleFactor(singleCoreStatus, reduceSize, mxTypePara); - tilingIns_->tiling_.set_mxTypePara(mxTypePara); - tilingIns_->tiling_.set_depthA1( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); - tilingIns_->tiling_.set_depthB1(UpdateDepthB1(singleCoreStatus)); - // if decrease depthB1, nBL1 must decrease to ensure nBL1 is less then depthB1 - singleCoreStatus.l1Status.nBL1 = min(singleCoreStatus.l1Status.nBL1, tilingIns_->tiling_.get_depthB1()); - tilingIns_->tiling_.set_stepM(singleCoreStatus.l1Status.mAL1); - tilingIns_->tiling_.set_stepN(singleCoreStatus.l1Status.nBL1); - tilingIns_->tiling_.set_stepKa( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0)); - tilingIns_->tiling_.set_stepKb( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0)); - AdjustFloatL1Factor(singleCoreStatus); - tilingIns_->tiling_.set_isBias(tilingIns_->isBias ? 
1 : 0); - tilingIns_->tiling_.set_dbL0A(singleCoreStatus.l0Status.dbL0A); - tilingIns_->tiling_.set_dbL0B(singleCoreStatus.l0Status.dbL0B); - tilingIns_->tiling_.set_dbL0C(singleCoreStatus.l0Status.dbL0C); - UpdateUsedSize(); - return 0; -} - -bool MatmulTilingAlgorithm::DoMultiCoreSplitMNTiling(const MatmulRunParas& params, CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes) -{ - if (GetMultiCoreScenario(params) != MultiCoreScenario::SPLIT_MN) { - return false; - } - ComputeBaseBlock baseBlock = GetMultiCoreBasicBlock(params); // calc basic block - CalcMultiCoreBlockDims(params, baseBlock, coreStatus, blockDimRes); - SingleCoreStatus singleCoreStatus; - singleCoreStatus.l0Status = GetL0CoreStatus(baseBlock); - AdjustSparseL0Factors(singleCoreStatus); - AdjustMxL0Factors(singleCoreStatus); - int32_t depthA1; - int32_t depthB1; - int32_t stepKa; - int32_t stepKb; - CalcL1Tiling(baseBlock, depthA1, depthB1, stepKa, stepKb); - singleCoreStatus.l1Status = GetL1CoreStatus(baseBlock, depthA1, depthB1, stepKa, stepKb); - (void)UpdateTiling(params, coreStatus, singleCoreStatus); - return true; -} - -bool MatmulTilingAlgorithm::NeedOutputAlign(int32_t m, int32_t n, int32_t k) const -{ - int32_t aTypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - int32_t bTypeSize = DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType); - int32_t cTypeSize = DTYPE_BIT_TAB.at(tilingIns_->cType_.dataType); - constexpr static int32_t outputRatio = 2; - bool needAlign = static_cast(n * m) * static_cast(outputRatio * cTypeSize) > - static_cast(n * k* aTypeSize) + static_cast(m * k * bTypeSize); - return needAlign; -} - -void MatmulTilingAlgorithm::CalcMultiCoreBlockDims(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, - CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes) -{ - auto factors = MathUtil::GetFactorPairs(numOfBlock_); - std::vector results; - for (const auto& factor : factors) { - results.push_back(CalcComputeIntensity(params, baseBlock, factor)); - } - // 排序结果 - std::sort(results.begin(), results.end()); - for (auto v : results) { - TILING_LOG_DEBUG("intent:%f, cycle: %f, band: %f, mDim: %d, nDim: %d\n", - v.avgIntensity, v.computeCycle, v.bandRatio, v.dimFactor.first, v.dimFactor.second); - } - coreStatus.batchDim = 1; - blockDimRes.nDimFactor = results[0].dimFactor.second; - blockDimRes.mDimFactor = results[0].dimFactor.first; - blockDimRes.kDimFactor = 1; - coreStatus.mDim = results[0].dimFactor.first; - coreStatus.nDim = results[0].dimFactor.second; - coreStatus.kDim = 1; - const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped, blockDimRes.nDimFactor, false); - const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false); - int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; - bool needOutputAlign = NeedOutputAlign(m, n, GetSingleK()); - (void)AlignSingleShape((!tilingIns_->bType_.isTrans || needOutputAlign), n, coreStatus.nDim, bAlignSize, coreStatus.n); - (void)AlignSingleShape(tilingIns_->aType_.isTrans, m, coreStatus.mDim, aAlignSize, coreStatus.m); - blockDimRes.kNum = params.k32 / coreStatus.kDim * C0_SIZE * REDUCE_BLOCK_SIZE; // contain k * 16 - blockDimRes.kBytes = blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 - coreStatus.batch = params.batch32; - coreStatus.k = params.k32 / coreStatus.kDim; - 
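    // Minimal worked example of the block units used here (assuming the usual C0_SIZE of 16,
    // matching FillParam() further down): M = 1000 gives m32 = CeilDivision(1000, 16) = 63
    // blocks, and mDim = 4 leaves CeilDivision(63, 4) = 16 blocks, i.e. 256 rows per core,
    // before AlignSingleShape() nudges that value up to the DATA_COPY_ALIGN_SIZE boundary.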
TILING_LOG_DEBUG("CalcMultiCoreBlockDims, coreStatus m: %d n: %d k: %d", coreStatus.m, coreStatus.n, coreStatus.k); - // load size of A matrix is batch * m - // load size of B matrix is n - DimFactor blockDim(1, blockDimRes.mDimFactor, blockDimRes.kDimFactor, blockDimRes.nDimFactor); - GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); - return; -} - -void MatmulTilingAlgorithm::UpdateMultiCore(const std::string& opType, const MatmulRunParas& params, - CoreStatusPack& coreStatus, const BlockDimCalculator& blockDimRes) const -{ - (void)(opType); - // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. - coreStatus.batchDim = min(MathUtil::CeilDivision(params.batch32, coreStatus.batch), numOfBlock_); - coreStatus.nDim = min(MathUtil::CeilDivision(params.n32, coreStatus.n), numOfBlock_); - coreStatus.mDim = min(MathUtil::CeilDivision(params.m32, coreStatus.m), numOfBlock_); - - if (tilingIns_->enableSplitK_) { - coreStatus.kDim = min(MathUtil::CeilDivision(params.k32, coreStatus.k), numOfBlock_); - } else { - coreStatus.kDim = blockDimRes.kDimFactor; - } - UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? TilingPolicy::FIXED_B_TSCM : TilingPolicy::NO_POLICY, - coreStatus); -} - -void MatmulTilingAlgorithm::UpdateBufferSize(const TilingPolicy policy, const CoreStatusPack& coreStatus) const -{ - if (policy == TilingPolicy::NO_POLICY) { - return; - } else if (policy == TilingPolicy::FIXED_B_TSCM) { - const int32_t bMatrixSize = - MathUtil::Align(coreStatus.n, MathUtil::CeilDivision(tilingIns_->alignSingleN, C0_SIZE)) * coreStatus.k * - C0_SIZE * C0_BYTE_SIZE * 2; - tilingIns_->bufferPool_.l1Size -= bMatrixSize; - } else if (policy == TilingPolicy::FIXED_A_TSCM) { - const int32_t aMatrixSize = coreStatus.m * coreStatus.k * C0_SIZE * C0_BYTE_SIZE * 2; - tilingIns_->bufferPool_.l1Size -= aMatrixSize; - } else { - return; - } -} - -bool MatmulTilingAlgorithm::IsInvalidFactor(int32_t factor) const -{ - return factor > numOfBlock_ || factor <= 0; -} - -void MatmulTilingAlgorithm::AddOptimalFactors(const std::string& opType, const MatmulRunParas& params, - BlockDimCalculator& blockDimRes) const -{ - (void)(opType); - const int32_t coreNum = numOfBlock_; - // A/B fullload or A fullload + B Kdim fullload or B fullload + A Kdim fullload(1/2/4) - const int32_t mnCore = MathUtil::CeilDivision(coreNum, params.batch32); - if (mnCore > 1) { - const float optPoint = static_cast(sqrt((params.m32 + 0.0f) / params.n32 * mnCore)); - const int32_t mdim = static_cast(ceil(optPoint)); - const int32_t ndim = static_cast(ceil(mnCore / optPoint)); - MathUtil::AddFactor(blockDimRes.mDimFactors, mdim); - MathUtil::AddFactor(blockDimRes.mDimFactors, ndim == 0 ? 1 : mnCore / ndim); - MathUtil::AddFactor(blockDimRes.nDimFactors, ndim); - MathUtil::AddFactor(blockDimRes.nDimFactors, mdim == 0 ? 
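            // The sqrt heuristic above aims to minimize per-core load traffic by keeping the
            // per-core tile roughly square: e.g. with m32 : n32 = 4 : 1 and mnCore = 8 cores,
            // optPoint = sqrt(4 * 8) is about 5.66, so mdim candidates {6, 4} and ndim
            // candidates {2, 1} are added to the factor lists for GetBlockDim() to evaluate.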
1 : mnCore / mdim); - } -} - -void MatmulTilingAlgorithm::GenBlockDimsMapFactors(const std::string& opType, MatmulRunParas& params, - BlockDimCalculator& blockDimRes) const -{ - const int32_t coreNum = numOfBlock_; - blockDimRes.batchDimFactors.reserve(coreNum); - blockDimRes.mDimFactors.reserve(coreNum); - blockDimRes.nDimFactors.reserve(coreNum); - blockDimRes.kDimFactors.reserve(coreNum); - MathUtil::GetBlockFactors(blockDimRes.batchDimFactors, params.batch32, params.batchMapped, coreNum, - min(coreNum, params.batch32)); - MathUtil::GetBlockFactors(blockDimRes.mDimFactors, params.m32, params.mMapped, coreNum, min(coreNum, params.m32)); - MathUtil::GetBlockFactors(blockDimRes.nDimFactors, params.n32, params.nMapped, coreNum, min(coreNum, params.n32)); - // first get kDim candidate - if (!tilingIns_->enableSplitK_) { - blockDimRes.kDimFactors.push_back(1); - params.kMapped = params.k32; - } else { - MathUtil::GetBlockFactors(blockDimRes.kDimFactors, params.k32, params.kMapped, coreNum, coreNum); - } - AddOptimalFactors(opType, params, blockDimRes); -} - -void MatmulTilingAlgorithm::GetBlockDim(const std::string& opType, MatmulRunParas& params, CoreStatusPack& coreStatus, - BlockDimCalculator& blockDimRes) -{ - // get batchDim, kDim, mDim and nDim for single core - // support multi cores slicing along kDim - // single core batchDim, mDim, nDim, kDim is a factor of input batch, m, n, k - // multi-core strategy for mini shape's is different from other situations and requires preprocess - if (PreProcessMiniShape(opType, coreStatus, params, numOfBlock_, tilingIns_->enableSplitK_)) { - // Due to the modification of data amount in single-core, the number of multi-core needs to be updated. - coreStatus.batchDim = MathUtil::CeilDivision(params.batch32, coreStatus.batch); - coreStatus.nDim = MathUtil::CeilDivision(params.n32, coreStatus.n); - coreStatus.mDim = MathUtil::CeilDivision(params.m32, coreStatus.m); - coreStatus.kDim = MathUtil::CeilDivision(params.k32, coreStatus.k); - UpdateBufferSize(tilingIns_->bType_.pos == TPosition::TSCM ? 
TilingPolicy::FIXED_B_TSCM : - TilingPolicy::NO_POLICY, - coreStatus); - splitCoreFlag_ = true; - return; - } - GenBlockDimsMapFactors(opType, params, blockDimRes); - for (const int32_t bFactor : blockDimRes.batchDimFactors) { - for (const int32_t nFactor : blockDimRes.nDimFactors) { - if (IsInvalidFactor(bFactor * nFactor)) { - continue; - } - for (const int32_t mFactor : blockDimRes.mDimFactors) { - if (IsInvalidFactor(bFactor * nFactor * mFactor)) { - continue; - } - for (const int32_t kFactor : blockDimRes.kDimFactors) { - if (IsInvalidFactor(bFactor * nFactor * mFactor * kFactor)) { - continue; - } - DimFactor blockDim(bFactor, mFactor, kFactor, nFactor); - GetBlockDimHelper(blockDim, coreStatus, blockDimRes, params); - } - } - } - } - - coreStatus.batch = MathUtil::CeilDivision(params.batch32, blockDimRes.batchDimFactor); - coreStatus.n = MathUtil::CeilDivision(params.n32, blockDimRes.nDimFactor); - coreStatus.m = MathUtil::CeilDivision(params.m32, blockDimRes.mDimFactor); - coreStatus.k = MathUtil::CeilDivision(params.k32, blockDimRes.kDimFactor); - if (g_tempCfg.factorSplit) { - const int32_t n = MathUtil::FindBestSingleCore(params.n32, params.nMapped, blockDimRes.nDimFactor, false); - const int32_t m = MathUtil::FindBestSingleCore(params.m32, params.mMapped, blockDimRes.mDimFactor, false); - const int32_t k = MathUtil::FindBestSingleCore(params.k32, params.kMapped, blockDimRes.kDimFactor, true); - const int32_t needCoreNum = static_cast(MathUtil::CeilDivision(params.batch32, coreStatus.batch) * - MathUtil::CeilDivision(params.n32, n) * - MathUtil::CeilDivision(params.m32, m) * - MathUtil::CeilDivision(params.k32, k)); - if (IsInvalidFactor(needCoreNum) == false) { - coreStatus.n = n; - coreStatus.m = m; - coreStatus.k = k; - } - } - - params.nonFactorK = params.k32 == params.kMapped ? false : true; - UpdateMultiCore(opType, params, coreStatus, blockDimRes); -} - -void MatmulTilingAlgorithm::NonFactorMap(const std::string& opType, MatmulRunParas& param, - BlockDimCalculator& blockDimRes) const -{ - (void)(opType); - param.batchMapped = param.batch32; - param.mMapped = param.m32; - param.kMapped = param.k32; - param.nMapped = param.n32; - // Split k will introduce atomic_add which can't be used with shift_inwards. - // Thus in split k mode, batch/m/n/ can't use non-factorial segmentation. 
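    // A minimal sketch of the down-alignment described above, assuming MathUtil::MapShape
    // rounds a dimension down to a power of two when it has too few usable factors (the exact
    // rule lives in MathUtil; names below are illustrative, not from this file):
    //
    //     static int32_t MapDownToPow2(int32_t v)
    //     {
    //         int32_t p = 1;
    //         while (p * 2 <= v) {
    //             p *= 2;           // 100 -> 64, 48 -> 32, 7 -> 4
    //         }
    //         return p;
    //     }
    //
    // A k32 of 100 is therefore tiled as if it were 64, whose factors 1, 2, 4, ..., 64 feed
    // the kDim candidates generated in GenBlockDimsMapFactors().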
- if (tilingIns_->enableSplitK_) { - // it is only necessary to consider the non-factor splitting of k when splitKFlag is true - int32_t kFactorLess64Cnt = 0; - int32_t kFactorLess1024Cnt = 0; - MathUtil::GetFactorCnt(param.k32, kFactorLess64Cnt, 1, L0_FACTOR_LIMIT); - MathUtil::GetFactorCnt(param.k32, kFactorLess1024Cnt, L0_FACTOR_LIMIT + 1, L1_FACTOR_LIMIT); - if ((param.k32 > L0_FACTOR_LIMIT && kFactorLess64Cnt <= L0_FACTOR_NUM_LIMIT) || - (param.k32 > L1_FACTOR_LIMIT && kFactorLess64Cnt + kFactorLess1024Cnt <= L1_FACTOR_NUM_LIMIT)) { - // Non-factors of the k dimension use a down-aligned number of powers of 2 - param.kMapped = MathUtil::MapShape(param.k32, false); - } - } else { - MathUtil::GetFactorCnt(param.batch32, blockDimRes.batchFactorCnt, 1, numOfBlock_); - if (param.batch32 > 1 && blockDimRes.batchFactorCnt <= L0_FACTOR_NUM_LIMIT) { - param.batchMapped = MathUtil::MapShape(param.batch32); - } - param.mMapped = MathUtil::MapShape(param.m32); - param.nMapped = MathUtil::MapShape(param.n32); - } -} - -void MatmulTilingAlgorithm::FillParam(MatmulRunParas& param) -{ - param.oriShapeM = tilingIns_->orgM; - param.oriShapeN = tilingIns_->orgN; - param.oriShapeKa = tilingIns_->orgKa; - param.oriShapeKb = tilingIns_->orgKb; - int32_t realM = 1; - int32_t realN = 1; - int32_t realK = 1; - - if (tilingIns_->singleCoreM != -1 || tilingIns_->singleCoreK != -1 || tilingIns_->singleCoreN != -1) { - realM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM; - realK = tilingIns_->singleCoreK != -1 ? tilingIns_->singleCoreK : tilingIns_->singleK; - realN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN; - singelBlockDim_ = true; - numOfBlock_ = 1; - } else { - realM = GetSingleM(); - realK = GetSingleK(); - realN = GetSingleN(); - singelBlockDim_ = false; - numOfBlock_ = tilingIns_->blockDim; - } - - const int32_t reduceBlockSize = C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - param.k32 = MathUtil::CeilDivision(realK, reduceBlockSize); - param.m32 = MathUtil::CeilDivision(realM, C0_SIZE); - param.n32 = MathUtil::CeilDivision(realN, C0_SIZE); - param.mMapped = MathUtil::MapShape(param.m32, true); - param.kMapped = MathUtil::MapShape(param.k32, true); - param.nMapped = MathUtil::MapShape(param.n32, true); -} - -bool MatmulTilingAlgorithm::CheckFinaleParams(const CoreStatusPack& coreStatus) const -{ - (void)coreStatus; - const int32_t stepM = tilingIns_->tiling_.get_stepM(); - const int32_t stepN = tilingIns_->tiling_.get_stepN(); - const int32_t depthA1 = tilingIns_->tiling_.get_depthA1(); - const int32_t depthB1 = tilingIns_->tiling_.get_depthB1(); - - const int32_t l1Size = tilingIns_->tiling_.get_shareL1Size(); - const int32_t l0CSize = tilingIns_->tiling_.get_shareL0CSize(); - const int32_t uBSize = tilingIns_->tiling_.get_shareUbSize(); - - if (stepM == 0 || stepN == 0 || depthA1 == 0 || depthB1 == 0) { - TILING_LOG_WARNING("stepM/N depthA1/B1 should greate then zeros"); - return false; - } - - if (stepM > depthA1 || stepN > depthB1) { - TILING_LOG_WARNING("stepM/N should less then depthA1/B1"); - return false; - } - - if (l1Size > tilingIns_->bufferPool_.l1Size || l0CSize > tilingIns_->bufferPool_.l0CSize || - uBSize > tilingIns_->bufferPool_.ubSize) { - TILING_LOG_WARNING("L1/L0C/UB used size should less then L1Size/L0CSize/UbSize"); - return false; - } - - int dateDtypeSize = DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType); - int32_t biasL1Size = tilingIns_->isBias ? 
- tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE : 0; - if (!tilingIns_->isBMNKBmm && tilingIns_->tiling_.get_BatchNum() > 0 && - ((tilingIns_->tiling_.get_singleCoreM() * tilingIns_->tiling_.get_singleCoreK() + - tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK()) * - tilingIns_->tiling_.get_BatchNum() * dateDtypeSize / BITS_PER_BYTE + biasL1Size > - tilingIns_->bufferPool_.l1Size)) { - TILING_LOG_WARNING("a/b matrix size of batch mm should less then L1Size"); - return false; - } - - return true; -} - -void MatmulTilingAlgorithm::CheckL0DB(SingleCoreStatus& singleCoreStatus, const int32_t baseK) const -{ - int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; - int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; - if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.isTrans && - tilingIns_->aType_.scalePos == TPosition::TSCM) { - baseM = MathUtil::Align(singleCoreStatus.l0Status.mL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; - } - if (tilingIns_->bType_.type == CubeFormat::ND && !tilingIns_->bType_.isTrans && - tilingIns_->bType_.scalePos == TPosition::TSCM) { - baseN = MathUtil::Align(singleCoreStatus.l0Status.nL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; - } - if (baseM * baseK > tilingIns_->bufferPool_.l0ASize / DB_ON) { - singleCoreStatus.l0Status.dbL0A = DB_OFF; - } - if (baseN * baseK > tilingIns_->bufferPool_.l0BSize / DB_ON) { - singleCoreStatus.l0Status.dbL0B = DB_OFF; - } - if (baseM * baseN > tilingIns_->bufferPool_.l0CSize / DB_ON) { - singleCoreStatus.l0Status.dbL0C = DB_OFF; - } -} - -void MatmulTilingAlgorithm::GetMxUsedL1Size(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, - int32_t& dataUsedL1Size, int32_t& scaleUsedL1Size, int32_t& biasUsedL1Size) const -{ - int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; - int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; - int32_t baseK = singleCoreStatus.l0Status.kL0 * k0Size; - - int32_t depthA1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1; - int32_t depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; - dataUsedL1Size = depthA1 * baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE + - depthB1 * baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - // scale is fp8e8m0 - scaleUsedL1Size = depthA1 * baseM * baseK / SCALE_K_SIZE + - depthB1 * baseN * baseK / SCALE_K_SIZE; - // bias is fp32 - int32_t bias = tilingIns_->isBias ? 
1 : 0; - biasUsedL1Size = bias * baseN * DTYPE_BIT_TAB.at(tilingIns_->biasType_.dataType) / BITS_PER_BYTE; -} - -void MatmulTilingAlgorithm::AdjustSparseL0Factors(SingleCoreStatus& singleCoreStatus) const -{ - // determine whether the scenario is sparse - if (!tilingIns_->isSparse_) { - TILING_LOG_DEBUG("Not sparse scenario does not need to adjust L0Factors."); - return; - } - - int32_t baseK = - singleCoreStatus.l0Status.kL0 * (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - constexpr int32_t sparseBaseKFac = 64; // baseK need to align to 64 on Sparse - if (baseK <= sparseBaseKFac) { - baseK = sparseBaseKFac; - } else { - baseK = MathUtil::AlignDown(baseK, sparseBaseKFac); - } - singleCoreStatus.l0Status.kL0 = - baseK / (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - - // check L0A/L0B/L0Csize for L0 DB - CheckL0DB(singleCoreStatus, baseK); -} - -void MatmulTilingAlgorithm::AdjustMxL0Factors(SingleCoreStatus& singleCoreStatus) const -{ - // Determine wherther the scenario is MX. - if (tilingIns_->madType_ != MatrixMadType::MXMODE) { - return; - } - if (!tilingIns_->aType_.hasSetScaleType) { - tilingIns_->aType_.scalePos = tilingIns_->aType_.pos; - tilingIns_->aType_.scaleType = tilingIns_->aType_.type; - tilingIns_->aType_.isScaleTrans = tilingIns_->aType_.isTrans; - } - if (!tilingIns_->bType_.hasSetScaleType) { - tilingIns_->bType_.scalePos = tilingIns_->bType_.pos; - tilingIns_->bType_.scaleType = tilingIns_->bType_.type; - tilingIns_->bType_.isScaleTrans = tilingIns_->bType_.isTrans; - } - // In the NZ scenario, ensure that the base size of the inner axis is 64-aligned downwards. - constexpr int32_t l0Factor = INT4_ALIGN_SIZE / C0_SIZE; - if (tilingIns_->aType_.type == CubeFormat::NZ && tilingIns_->aType_.isTrans) { - if (singleCoreStatus.l0Status.mL0 > l0Factor) { - singleCoreStatus.l0Status.mL0 = singleCoreStatus.l0Status.mL0 / l0Factor * l0Factor; - } - } - if (tilingIns_->bType_.type == CubeFormat::NZ && !tilingIns_->bType_.isTrans) { - if (singleCoreStatus.l0Status.nL0 > l0Factor) { - singleCoreStatus.l0Status.nL0 = singleCoreStatus.l0Status.nL0 / l0Factor * l0Factor; - } - } - // FP8 baseK need must be 64 element aligned - int32_t baseK = - singleCoreStatus.l0Status.kL0 * (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - if ((tilingIns_->aType_.dataType == DataType::DT_FLOAT8_E5M2 || - tilingIns_->aType_.dataType == DataType::DT_FLOAT8_E4M3FN) && - (tilingIns_->bType_.dataType == DataType::DT_FLOAT8_E5M2 || - tilingIns_->bType_.dataType == DataType::DT_FLOAT8_E4M3FN)) { - baseK = baseK <= MX_BASEK_FACTOR ? 
MX_BASEK_FACTOR : MathUtil::AlignDown(baseK, MX_BASEK_FACTOR); - singleCoreStatus.l0Status.kL0 = - baseK / (C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); - } - bool mL0NeedAlign = tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.isTrans && - tilingIns_->aType_.scalePos == TPosition::TSCM; - if (mL0NeedAlign) { - singleCoreStatus.l0Status.mL0 = MathUtil::Align(singleCoreStatus.l0Status.mL0, L0_FACTOR_NUM_LIMIT); - } - bool nL0NeedAlign = tilingIns_->bType_.type == CubeFormat::ND && !tilingIns_->bType_.isTrans && - tilingIns_->bType_.scalePos == TPosition::TSCM; - if (nL0NeedAlign) { - singleCoreStatus.l0Status.nL0 = MathUtil::Align(singleCoreStatus.l0Status.nL0, L0_FACTOR_NUM_LIMIT); - } - // check L0A/L0B/L0CSize for L0DB - CheckL0DB(singleCoreStatus, baseK); -} - -void MatmulTilingAlgorithm::AdjustMxL1Factors(SingleCoreStatus& singleCoreStatus, const int32_t k0Size) const -{ - // determine whether the scenario is MX - if (tilingIns_->madType_ != MatrixMadType::MXMODE) { - return; - } - int32_t dataUsedL1Size = 0; - int32_t scaleUsedL1Size = 0; - int32_t biasUsedL1Size = 0; - GetMxUsedL1Size(singleCoreStatus, k0Size, dataUsedL1Size, scaleUsedL1Size, biasUsedL1Size); - // The existing tiling policy causes the L1 threshold to exceed the threshold. - // Adjust the tiling policy to the basic one. That is, only baseM * baseK + baseN * baseK is cached ai L1. - if (dataUsedL1Size + scaleUsedL1Size + biasUsedL1Size > tilingIns_->bufferPool_.l1Size) { - // checks whether the tiling is valid. - // If the tiling is invalid, the system uses the minimum tiling policy. - singleCoreStatus.l1Status.kAL1 = singleCoreStatus.l0Status.kL0; - singleCoreStatus.l1Status.kBL1 = singleCoreStatus.l0Status.kL0; - singleCoreStatus.l1Status.mAL1 = 1; - singleCoreStatus.l1Status.nBL1 = 1; - } -} - -void MatmulTilingAlgorithm::GetMxScaleFactor(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, int32_t& mxTypePara) const -{ - // determine whether the scenario is MX - if (tilingIns_->madType_ != MatrixMadType::MXMODE) { - return; - } - int32_t dataUsedL1Size = 0; - int32_t scaleUsedL1Size = 0; - int32_t biasUsedL1Size = 0; - GetMxUsedL1Size(singleCoreStatus, k0Size, dataUsedL1Size, scaleUsedL1Size, biasUsedL1Size); - - uint8_t scaleFactorA = 1; - uint8_t scaleFactorB = 1; - int32_t remainedL1Size = tilingIns_->bufferPool_.l1Size - (dataUsedL1Size + biasUsedL1Size); - int32_t singleCoreK = tilingIns_->tiling_.get_singleCoreK(); - int32_t stepKa = MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0); - int32_t stepKb = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0); - int32_t baseK = singleCoreStatus.l0Status.kL0 * k0Size; - int32_t kStep = MathUtil::CeilDivision(singleCoreK, baseK); - uint8_t maxScaleFactorA = static_cast(MathUtil::CeilDivision(kStep, stepKa)); - uint8_t maxScaleFactorB = static_cast(MathUtil::CeilDivision(kStep, stepKb)); - int32_t baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; - int32_t baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; - - // only support in K direction, scale DB same as data. - scaleFactorA = static_cast(remainedL1Size / MX_L1_BUFFER_NUM / (stepKa * baseM * baseK / SCALE_K_SIZE)); - scaleFactorB = static_cast(remainedL1Size / MX_L1_BUFFER_NUM / (stepKb * baseN * baseK / SCALE_K_SIZE)); - scaleFactorA = scaleFactorA > maxScaleFactorA ? maxScaleFactorA : scaleFactorA; - scaleFactorB = scaleFactorB > maxScaleFactorB ? 
maxScaleFactorB : scaleFactorB; - - // scaleFactor is in range of [1, 127] - scaleFactorA = scaleFactorA >= static_cast(1) ? scaleFactorA : static_cast(1); - scaleFactorB = scaleFactorB >= static_cast(1) ? scaleFactorB : static_cast(1); - scaleFactorA = scaleFactorA <= SCALE_FACTOR_MAX_VALUE ? scaleFactorA : SCALE_FACTOR_MAX_VALUE; - scaleFactorB = scaleFactorB <= SCALE_FACTOR_MAX_VALUE ? scaleFactorB : SCALE_FACTOR_MAX_VALUE; - - // 8bit: 0~6bit:scaleFactor, 7bit(reserved):double buffer flag - scaleFactorA = scaleFactorA & static_cast(0x7f); - scaleFactorB = scaleFactorB & static_cast(0x7F); - mxTypePara = static_cast(static_cast(mxTypePara) | scaleFactorA); - mxTypePara = static_cast(static_cast(mxTypePara) | static_cast(scaleFactorB << 8U)); -} - -void MatmulTilingAlgorithm::PreprocessL0DB() -{ - dbL0A_ = g_tempCfg.l0aDB; - dbL0B_ = g_tempCfg.l0bDB; - dbL0C_ = g_tempCfg.l0cDB; - if (tilingIns_->baseM != -1) { - const int32_t baseLeftSize = tilingIns_->baseM * C0_BYTE_SIZE; - if (baseLeftSize > tilingIns_->bufferPool_.l0ASize / DB_ON) { - dbL0A_ = DB_OFF; - } - } - if (tilingIns_->baseN != -1) { - const int32_t baseRightSize = tilingIns_->baseN * C0_BYTE_SIZE; - if (baseRightSize > tilingIns_->bufferPool_.l0BSize / DB_ON) { - dbL0B_ = DB_OFF; - } - } - if (tilingIns_->baseM != -1 && tilingIns_->baseN != -1) { - const int32_t baseMatrixSize = tilingIns_->baseM * tilingIns_->baseN * C0_BYTE_SIZE; - if (baseMatrixSize > tilingIns_->bufferPool_.l0CSize / DB_ON) { - dbL0C_ = DB_OFF; - } - } - return; -} - -void MatmulTilingAlgorithm::SetDepthL1CacheUBParams(int32_t &a1LengthCache, int32_t &b1LengthCache) const -{ - if (!tilingIns_->enableL1CacheUB || - tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND310P) { - return; - } - int32_t a1Length = tilingIns_->tiling_.get_baseM() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; - int32_t b1Length = tilingIns_->tiling_.get_baseN() * tilingIns_->tiling_.get_baseK() * - DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; - a1LengthCache = a1Length * tilingIns_->tiling_.get_stepKa() * tilingIns_->tiling_.get_stepM(); - b1LengthCache = b1Length * tilingIns_->tiling_.get_stepKb() * tilingIns_->tiling_.get_stepN(); - int32_t freeL1Size = tilingIns_->bufferPool_.l1Size - tilingIns_->tiling_.get_depthA1() * a1Length - - tilingIns_->tiling_.get_depthB1() * b1Length; - if (freeL1Size <= 0) { - return; - } - const int32_t splitNum = 2; - int32_t aOrgShapeSize = tilingIns_->tiling_.get_singleCoreM() * tilingIns_->tiling_.get_singleCoreK(); - int32_t bOrgShapeSize = tilingIns_->tiling_.get_singleCoreN() * tilingIns_->tiling_.get_singleCoreK(); - - if ((tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) && - (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM)) { - bool aFullLoad = false; - bool bFullLoad = false; - aFullLoad = aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size / splitNum; - bFullLoad = bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size / splitNum; - if (aFullLoad && bFullLoad) { - tilingIns_->tiling_.set_depthAL1CacheUB(1); - tilingIns_->tiling_.set_depthBL1CacheUB(1); - a1LengthCache = aOrgShapeSize; // update - b1LengthCache = bOrgShapeSize; - } else if (aFullLoad) { - tilingIns_->tiling_.set_depthAL1CacheUB(1); - a1LengthCache = aOrgShapeSize; - int32_t depthL1CacheUB = b1LengthCache > 0 ? 
(freeL1Size - aOrgShapeSize) / b1LengthCache : 0; - tilingIns_->tiling_.set_depthBL1CacheUB(depthL1CacheUB); - } else if (bFullLoad) { - tilingIns_->tiling_.set_depthBL1CacheUB(1); - b1LengthCache = bOrgShapeSize; - int32_t depthL1CacheUB = a1LengthCache > 0 ? (freeL1Size - bOrgShapeSize) / a1LengthCache : 0; - tilingIns_->tiling_.set_depthAL1CacheUB(depthL1CacheUB); - } else { - if (a1LengthCache > freeL1Size) { - int32_t depthBL1CacheUB = b1LengthCache > 0 ? freeL1Size / b1LengthCache : 0; - tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); - } else if (b1LengthCache > freeL1Size) { - int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / a1LengthCache : 0; - tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); - } else if (a1LengthCache <= freeL1Size / splitNum && b1LengthCache <= freeL1Size / splitNum) { - int32_t depthAL1CacheUB = a1LengthCache > 0 ? freeL1Size / splitNum / a1LengthCache : 0; - int32_t depthBL1CacheUB = b1LengthCache > 0 ? freeL1Size / splitNum / b1LengthCache : 0; - tilingIns_->tiling_.set_depthAL1CacheUB(depthAL1CacheUB); - tilingIns_->tiling_.set_depthBL1CacheUB(depthBL1CacheUB); - } else { - // can only cache one matrix - if (a1LengthCache <= b1LengthCache) { - tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); - } else { - tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); - } - } - } - } else if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.pos != TPosition::TSCM) { - if (aOrgShapeSize > 0 && aOrgShapeSize < freeL1Size) { - tilingIns_->tiling_.set_depthAL1CacheUB(1); - a1LengthCache = aOrgShapeSize; - } else if (a1LengthCache > 0) { - tilingIns_->tiling_.set_depthAL1CacheUB(freeL1Size / a1LengthCache); - } - } else if (tilingIns_->bType_.type == CubeFormat::ND && tilingIns_->bType_.pos != TPosition::TSCM) { - if (bOrgShapeSize > 0 && bOrgShapeSize < freeL1Size) { - tilingIns_->tiling_.set_depthBL1CacheUB(1); - b1LengthCache = bOrgShapeSize; - } else if (b1LengthCache > 0) { - tilingIns_->tiling_.set_depthBL1CacheUB(freeL1Size / b1LengthCache); - } - } else { - return; - } -} - -int MatmulTilingAlgorithm::UpdateDepthB1(const SingleCoreStatus& singleCoreStatus) const -{ - int depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; - // only bType is f32 need update - if (tilingIns_->bType_.dataType != DataType::DT_FLOAT - || tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { - return depthB1; - } - uint16_t alignedBaseK = MathUtil::CeilDivision(tilingIns_->baseK, FP32_ALIGN_SIZE) * FP32_ALIGN_SIZE; - uint16_t alignedBaseKN = alignedBaseK * tilingIns_->baseN; - - uint16_t alignedBaseKM = tilingIns_->baseK * tilingIns_->baseM; - if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) { - alignedBaseKM = alignedBaseK * tilingIns_->baseM; - } - // if L1 size is overflow, decrease depthB1 - if ((tilingIns_->tiling_.get_depthA1() *alignedBaseKM + alignedBaseKN * depthB1) * sizeof(float) - > static_cast(tilingIns_->bufferPool_.l1Size)) { - depthB1 = tilingIns_->baseN * tilingIns_->baseK * depthB1 / alignedBaseKN; - depthB1 = depthB1 < 1 ? 1 : depthB1; - } - return depthB1; -} -int32_t MatmulTilingAlgorithm::GetSingleM() const -{ - return tilingIns_->singleM != -1 ? tilingIns_->singleM : tilingIns_->orgM; -} -int32_t MatmulTilingAlgorithm::GetSingleN() const -{ - return tilingIns_->singleN != -1 ? 
tilingIns_->singleN : tilingIns_->orgN; -} -int32_t MatmulTilingAlgorithm::GetSingleK() const -{ - return tilingIns_->singleK != -1 ? tilingIns_->singleK : tilingIns_->orgKa; -} -void MatmulTilingAlgorithm::GetSingleShape(const CoreStatusPack &coreStatus, const MatmulRunParas ¶m, - int32_t &singleCoreM, int32_t &singleCoreN, int32_t &singleCoreK) const -{ - singleCoreM = GetSingleM(); - singleCoreM = MathUtil::CeilDivision(singleCoreM, coreStatus.mDim); - singleCoreN = GetSingleN(); - singleCoreN = MathUtil::CeilDivision(singleCoreN, coreStatus.nDim); - singleCoreK = GetSingleK(); - singleCoreK = MathUtil::CeilDivision(singleCoreK, coreStatus.kDim); - if (singelBlockDim_) { - singleCoreM = tilingIns_->singleCoreM != -1 ? tilingIns_->singleCoreM : tilingIns_->singleM; - singleCoreN = tilingIns_->singleCoreN != -1 ? tilingIns_->singleCoreN : tilingIns_->singleN; - singleCoreK = tilingIns_->singleCoreK != -1 ? tilingIns_->singleCoreK : tilingIns_->singleK; - } - if (numOfBlock_ > 1) { - int32_t aAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE; - int32_t bAlignSize = DATA_COPY_ALIGN_SIZE / DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) * BITS_PER_BYTE; - bool needAlign = GetMultiCoreScenario(param) == MultiCoreScenario::SPLIT_MN; - bool needOutputAlign = NeedOutputAlign(singleCoreM, singleCoreN, singleCoreK); - (void)AlignSingleShape(needAlign && (!tilingIns_->bType_.isTrans || needOutputAlign), param.n32 * C0_SIZE, coreStatus.nDim, - bAlignSize, singleCoreN); - (void)AlignSingleShape(needAlign && tilingIns_->aType_.isTrans, param.m32 * C0_SIZE, coreStatus.mDim, - aAlignSize, singleCoreM); - if (tilingIns_->enableSplitK_) { - if (tilingIns_->aType_.dataType == DataType::DT_FLOAT || - tilingIns_->bType_.dataType == DataType::DT_FLOAT) { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * FLOAT32_REDUCE_BLOCK_SIZE; - } else if ((tilingIns_->aType_.dataType == DataType::DT_INT8 || - tilingIns_->bType_.dataType == DataType::DT_INT8)) { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT8_REDUCE_BLOCK_SIZE; - } else if ((tilingIns_->aType_.dataType == DataType::DT_INT4 || - tilingIns_->bType_.dataType == DataType::DT_INT4)) { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * INT4_REDUCE_BLOCK_SIZE; - } else { - singleCoreK = MathUtil::CeilDivision(param.k32, coreStatus.kDim) * REDUCE_BLOCK_SIZE; - } - } - } -} - -bool MatmulTilingAlgorithm::CheckSingleShape(int32_t singleCoreM, int32_t singleCoreN, int32_t singleCoreK) const -{ - (void)singleCoreM; - (void)singleCoreK; - if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || - tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - // ub only can process with 32B aligned, if format is ND, and D non-aligned output can't pad - if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type == CubeFormat::ND && - (singleCoreN * DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType)) % C0_BYTE_SIZE != 0) { - TILING_LOG_INFO("for ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " - "aligned, matrix c not support ND format"); - return false; - } - } - return true; -} - -int64_t MatmulTilingAlgorithm::Process() -{ - PreprocessL0DB(); - if (!CheckBaseMN()) { - TILING_LOG_WARNING("check baseM/baseN not pass"); - return -1; - } - singelBlockDim_ = false; - splitCoreFlag_ = false; - CoreStatusPack coreStatus; - SingleCoreStatus singleCoreStatus; - MatmulRunParas param; - BlockDimCalculator 
blockDimRes; - FillParam(param); - - std::string opType = "MatMul"; - if (numOfBlock_ != 1) { - NonFactorMap(opType, param, blockDimRes); - if (DoMultiCoreSplitMNTiling(param, coreStatus, blockDimRes)) { - return 0; - } - GetBlockDim(opType, param, coreStatus, blockDimRes); - } else { - if (!g_tempCfg.factorSplit) { - coreStatus.m = param.m32; - coreStatus.k = param.k32; - coreStatus.n = param.n32; - } else { - coreStatus.m = MathUtil::FindBestSingleCore(param.m32, param.mMapped, 1, false); - coreStatus.k = MathUtil::FindBestSingleCore(param.k32, param.kMapped, 1, false); - coreStatus.n = MathUtil::FindBestSingleCore(param.n32, param.nMapped, 1, false); - } - coreStatus.batchDim = 1; - coreStatus.mDim = 1; - coreStatus.kDim = 1; - coreStatus.nDim = 1; - } - - if (numOfBlock_ != 1 && tilingIns_->bType_.pos == TPosition::TSCM) { - if (!splitCoreFlag_) { - TILING_LOG_WARNING("Multi core split B TSCM full loaded is not sucess."); - return 1; - } - } - // single-core logic - GetL0Factors(opType, param, coreStatus, singleCoreStatus); - AdjustSparseL0Factors(singleCoreStatus); - AdjustMxL0Factors(singleCoreStatus); - if (singleCoreStatus.l0Status.mL0 == 0 || singleCoreStatus.l0Status.nL0 == 0 || - singleCoreStatus.l0Status.kL0 == 0) { - TILING_LOG_WARNING("ml0/nl0/kl0 is zero"); - return -1; - } - GetL1Factors(opType, param, coreStatus, singleCoreStatus.l0Status, singleCoreStatus.l1Status); - if (UpdateTiling(param, coreStatus, singleCoreStatus) == -1L) { - return -1L; - } - const bool ans = CheckFinaleParams(coreStatus); - return ans ? 0 : -1; -} -} // namespace matmul_tiling \ No newline at end of file -- Gitee From 7c2d53481107641abbcc6227b2d3b0fc1b3bb1ad Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 09:10:27 +0000 Subject: [PATCH 52/56] add Signed-off-by: jiangchengcheng-on --- lib/matmul/matmul_client_new.h | 994 +++++++++++++++++++++++++++++++++ 1 file changed, 994 insertions(+) create mode 100644 lib/matmul/matmul_client_new.h diff --git a/lib/matmul/matmul_client_new.h b/lib/matmul/matmul_client_new.h new file mode 100644 index 00000000..98a0a5c0 --- /dev/null +++ b/lib/matmul/matmul_client_new.h @@ -0,0 +1,994 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_client.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_CLIENT_H +#define LIB_MATMUL_MATMUL_CLIENT_H + +#include "kernel_operator.h" +#include "lib/matmul/constant_tiling.h" +#include "lib/matmul/tiling.h" +#include "../../impl/matmul/policy/matmul_policy.h" +#include "../../impl/matmul/utils/matmul_call_back.h" +#include "../../impl/matmul/utils/matmul_module.h" +#include "../../impl/matmul/utils/matmul_utils.h" +#if ASCENDC_CPU_DEBUG +#include "../../impl/matmul/kfc/matmul_server_aux.h" +#endif + +namespace AscendC { + +constexpr int32_t VECTOR_QUANT_MODE = 2; +constexpr int32_t NUM_EIGHT = int32_t(8); +constexpr uint16_t NUM_SIXTEEN = uint16_t(16); +constexpr uint16_t NUM_THIRTYTWO = uint16_t(32); +constexpr uint16_t NUM_FORTYEIGHT = uint16_t(48); + +// Service function of the Matmul on the AIV client side, which is the unit for sending messages. +template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> +class MatmulClientBase { + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + +public: +#if defined(__DAV_C310__) + __aicore__ inline void Init(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe = nullptr) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.Init(cubeTiling, tpipe); + } +#endif + return; + } + ASSERT(sizeof(KfcMsg) % CACHE_LINE_SIZE == 0); + ASSERT(cubeTiling != nullptr && "tiling cannot be nullptr when init matmul client"); + ASSERT(sizeof(TCubeTiling) % sizeof(uint64_t) == 0); + // copy tiling to the last mem in ssbuf + MSG_POS TilingInfo *tilingSSbuf = reinterpret_cast(GetTilingAddr(GetSubBlockIdxImpl())); + while (tilingSSbuf->valid) { + } + tilingSSbuf->valid = 1; + auto tempTilingSSbuf = reinterpret_cast(&(tilingSSbuf->tCubeTiling)); + auto tempTiling = reinterpret_cast(const_cast(cubeTiling)); + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint64_t); ++i, ++tempTilingSSbuf, ++tempTiling) { + *tempTilingSSbuf = *tempTiling; + } + this->cubeTiling.SetTiling(cubeTiling); + PostMessage(); + nIter_ = ConstCeil(this->cubeTiling.GetSingleCoreN(), this->cubeTiling.GetBaseN()); + mIter_ = ConstCeil(this->cubeTiling.GetSingleCoreM(), this->cubeTiling.GetBaseM()); + if constexpr (ToMatmulConfig(MM_CFG).isPartialOutput) { + uint32_t kIter = ConstCeil(this->cubeTiling.GetSingleCoreK(), this->cubeTiling.GetBaseK()); + mnIter_ = nIter_ * mIter_ * kIter; + } else { + mnIter_ = nIter_ * mIter_; + } + cacheWorkspaceAddr = nullptr; + singleCoreM_ = this->cubeTiling.GetSingleCoreM(); + singleCoreN_ = this->cubeTiling.GetSingleCoreN(); + singleCoreK_ = this->cubeTiling.GetSingleCoreK(); + } +#else + __aicore__ inline void Init(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe = nullptr) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.Init(cubeTiling, tpipe); + } +#endif + return; + } + ASSERT(sizeof(KfcMsg) % CACHE_LINE_SIZE == 0); + ASSERT(cubeTiling != nullptr && "cubeTiling cannot be nullptr when init matmul client"); + ASSERT(sizeof(TCubeTiling) % sizeof(uint64_t) == 0); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (GetSubBlockIdxImpl() == 1) { + return; + } + } + constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; + int32_t ubAddr = -1; + GM_ADDR tilingGM = client->AllocUB(tCubeTilingSize, ubAddr); + auto tempTilingGM = 
reinterpret_cast<__gm__ uint32_t*>(tilingGM); + auto tempTiling = reinterpret_cast(const_cast (cubeTiling)); + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); ++i, ++tempTilingGM, ++tempTiling) { + *tempTilingGM = *tempTiling; + } + this->cubeTiling.SetTiling(cubeTiling); + GlobalTensor global; + for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { + Barrier(); + global.SetGlobalBuffer((__gm__ int64_t*)(tilingGM + i)); + DataCacheCleanAndInvalid(global); + } + Barrier(); + + auto msg = client->AllocMessage(); + client->ubMsg->tilingInfo.tilingAddr = tilingGM; + client->ubMsg->head = KfcMsgMakeFlag(KFC_Enum::MMFUN_INIT, this->instIdx); + client->ubMsg->ubAddr = ubAddr; + client->PostMessage(msg); // Initialize the local client after the expected processing is complete. + + *((uint64_t*)&kfcMsg_) = 0; + *((uint64_t*)&(kfcMsg_.body)) = 0; + nIter_ = ConstCeil(this->cubeTiling.GetSingleCoreN(), this->cubeTiling.GetBaseN()); + mIter_ = ConstCeil(this->cubeTiling.GetSingleCoreM(), this->cubeTiling.GetBaseM()); + mnIter_ = nIter_ * mIter_; + cacheWorkspaceAddr = nullptr; + } +#endif + + template __aicore__ inline void SetWorkspace(GlobalTensor& addr) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "SetWorkspace not support when enableMixDualMaster is enabled"); + ASSERT(addr.GetSize() > 0); + SetWorkspace(addr.GetPhyAddr(), addr.GetSize() * sizeof(T)); + } + template __aicore__ inline void SetWorkspace(__gm__ const T* addr, int size) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "SetWorkspace not support when enableMixDualMaster is enabled"); + ASSERT(addr != nullptr); + if constexpr (ToMatmulConfig(MM_CFG).singleCoreM == 0) { + ASSERT(!this->cubeTiling.IsNull()); + } + + cacheWorkspaceAddr = reinterpret_cast(const_cast<__gm__ T*>(addr)); + cOffset_ = 0; + } + + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetOrgShape(orgM, orgN, orgK); + } +#endif + return; + } + SetOrgShape(orgM, orgN, orgK, orgK, orgN); + } + + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetOrgShape(orgM, orgN, orgKa, orgKb, orgKc); + } +#endif + return; + } +#if defined(__DAV_C310__) + kfcMsg_.body.orgM = orgM; + kfcMsg_.body.orgN = orgN; + kfcMsg_.body.orgKa = orgKa; + kfcMsg_.body.orgKb = orgKb; + kfcMsg_.body.orgKc = orgKc; + kfcMsg_.body.setOrgShape = 1; +#else + kfcMsg_.orgShape.orgM = orgM; + kfcMsg_.orgShape.orgN = orgN; + kfcMsg_.orgShape.orgKa = orgKa; + kfcMsg_.orgShape.orgKb = orgKb; + kfcMsg_.orgShape.orgKc = orgKc; + PostMessage(); +#endif + } + + __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetSingleShape(singleM, singleN, singleK); + } +#endif + return; + } + SetTail(singleM, singleN, singleK); + } + + __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetTail(tailM, tailN, tailK); + } +#endif + return; + } + if (tailM != -1) { + mIter_ = ConstCeil(tailM, cubeTiling.GetBaseM()); + } + if (tailN != -1) { + 
nIter_ = ConstCeil(tailN, cubeTiling.GetBaseN()); + } + mnIter_ = nIter_ * mIter_; +#if defined(__DAV_C310__) + singleCoreM_ = tailM != -1 ? tailM : singleCoreM_; + singleCoreN_ = tailN != -1 ? tailN : singleCoreN_; + singleCoreK_ = tailK != -1 ? tailK : singleCoreK_; + if constexpr (ToMatmulConfig(MM_CFG).isPartialOutput) { + uint32_t kIter = ConstCeil(cubeTiling.GetSingleCoreK(), cubeTiling.GetBaseK()); + mnIter_ = nIter_ * mIter_ * kIter; + } +#endif + kfcMsg_.body.singleM = tailM; + kfcMsg_.body.singleN = tailN; + kfcMsg_.body.singleK = tailK; + kfcMsg_.body.setTail = 1; + } + + // transMode only support 0 or 1 + // 0: round mode is round to the nearest tie to even + // 1: round mode is round to the nearest tie away from zero + __aicore__ inline void SetHF32(bool enableHF32 = false, int32_t transMode = 0) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetHF32(enableHF32, transMode); + } +#endif + return; + } + kfcMsg_.body.enHF32 = enableHF32; + kfcMsg_.body.hf32TransMode = transMode; + + PostMessage(); + } + +#if defined(__DAV_C310__) + __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTransposeA = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorA localTensor not support when enableMixDualMaster is enabled"); + ASSERT(isTransposeA <= A_TYPE::isTrans && + "It is not allowed to do A transpose when matmul A transpose is not defined."); + kfcMsg_.body.isTransA = static_cast(isTransposeA); + kfcMsg_.body.setTensorA = 1; + kfcMsg_.body.isFirstIter = 1; + if constexpr (A_TYPE::pos == TPosition::TSCM) { + auto tmpAddr = GetTscmAddr(leftMatrix); + auto intraId = (reinterpret_cast(leftMatrix.GetBufferHandle()))->enQueEvtID; + // 8 bit for intraID, 32 bit for addr + kfcMsg_.body.aAddr = (((uint64_t)intraId) << VALID_ADDR_BITS_NUM) + tmpAddr; + sizeAmatrix_ = leftMatrix.GetSize() * sizeof(SrcAT); + } else { + MSG_POS MsgMatmulL1Addr *matmulL1AddrMsg = + (MSG_POS MsgMatmulL1Addr *)GetMatmulL1AddrMsg(GetSubBlockIdxImpl(), this->instIdx); + while (!(matmulL1AddrMsg->valid)) { + } + uint64_t aL1Addr = matmulL1AddrMsg->l1aAddr; + kfcMsg_.body.aAddr = aL1Addr; + sizeAmatrix_ = leftMatrix.GetSize() * sizeof(SrcAT); + aAddr_ = (uint64_t)leftMatrix.GetPhyAddr(); + } + } +#else + __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTransposeA = false) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if (__CCE_AICORE__ == 220) + ASSERT("SetTensorA localTensor not support when enableMixDualMaster is enabled"); +#endif + return; + } + ASSERT(isTransposeA <= A_TYPE::isTrans && + "It is not allowed to do A transpose when matmul A transpose is not defined."); + kfcMsg_.body.isTransA = static_cast(isTransposeA); + kfcMsg_.body.setTensorA = 1; + kfcMsg_.body.isFirstIter = 1; + if constexpr (A_TYPE::pos == TPosition::TSCM) { + kfcMsg_.body.aAddr = GetTscmAddr(leftMatrix); + kfcMsg_.body.sizeAmatrix = leftMatrix.GetSize() * sizeof(SrcAT); + } else { + kfcMsg_.body.aAddr = GetGlobalAddr(leftMatrix); + kfcMsg_.body.sizeAmatrix = leftMatrix.GetSize() * sizeof(SrcAT); + } + } +#endif + + __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, + bool isTransposeA = false) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "SetTensorAWithCopy not support when enableMixDualMaster is enabled"); + ASSERT(A_TYPE::pos != TPosition::TSCM); + kfcMsg_.body.isTransA = static_cast(isTransposeA); + 
kfcMsg_.body.setTensorA = 1; + kfcMsg_.body.isFirstIter = 1; +#if defined(__DAV_C310__) + // C220 using RTS to control cahce mode, C310 using hardware instructions to control. + kfcMsg_.body.aAddr = GetGMAddrAndCopyUB(gm.address_, leftMatrix); // cache mode switch hide in address + sizeAmatrix_ = leftMatrix.GetSize() * sizeof(SrcAT); +#else + kfcMsg_.body.aAddr = GetGMAddrAndCopyUB(gm.GetPhyAddr(), leftMatrix); + kfcMsg_.body.sizeAmatrix = leftMatrix.GetSize() * sizeof(SrcAT); +#endif + } + +#if defined(__DAV_C310__) + __aicore__ inline void SetTensorB(const LocalTensor& rightMatrix, bool isTransposeB = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorB localTensor not support when enableMixDualMaster is enabled"); + ASSERT(isTransposeB <= B_TYPE::isTrans && + "It is not allowed to do B transpose when matmul B transpose is not defined."); + kfcMsg_.body.isTransB = static_cast(isTransposeB); + kfcMsg_.body.setTensorB = 1; + kfcMsg_.body.isFirstIter = 1; + + if constexpr (B_TYPE::pos == TPosition::TSCM) { + auto tmpAddr = GetTscmAddr(rightMatrix); + auto intraId = (reinterpret_cast(rightMatrix.GetBufferHandle()))->enQueEvtID; + kfcMsg_.body.bAddr = (((uint64_t)intraId) << VALID_ADDR_BITS_NUM) + tmpAddr; + sizeBmatrix_ = rightMatrix.GetSize() * sizeof(SrcBT); + } else { + MSG_POS MsgMatmulL1Addr *matmulL1AddrMsg = + (MSG_POS MsgMatmulL1Addr *)GetMatmulL1AddrMsg(GetSubBlockIdxImpl(), this->instIdx); + while (!(matmulL1AddrMsg->valid)) { + } + uint64_t bL1Addr = matmulL1AddrMsg->l1bAddr; + kfcMsg_.body.bAddr = bL1Addr; + sizeBmatrix_ = rightMatrix.GetSize() * sizeof(SrcBT); + bAddr_ = (uint64_t)rightMatrix.GetPhyAddr(); + } + } +#else + __aicore__ inline void SetTensorB(const LocalTensor& rightMatrix, bool isTransposeB = false) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if (__CCE_AICORE__ == 220) + ASSERT("SetTensorB localTensor not support when enableMixDualMaster is enabled"); +#endif + return; + } + ASSERT(isTransposeB <= B_TYPE::isTrans && + "It is not allowed to do B transpose when matmul B transpose is not defined."); + kfcMsg_.body.isTransB = static_cast(isTransposeB); + kfcMsg_.body.setTensorB = 1; + kfcMsg_.body.isFirstIter = 1; + + if constexpr (B_TYPE::pos == TPosition::TSCM) { + kfcMsg_.body.bAddr = GetTscmAddr(rightMatrix); + kfcMsg_.body.sizeBmatrix = rightMatrix.GetSize() * sizeof(SrcBT); + } else { + kfcMsg_.body.bAddr = GetGlobalAddr(rightMatrix); + kfcMsg_.body.sizeBmatrix = rightMatrix.GetSize() * sizeof(SrcBT); + } + } +#endif + + __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& rightMatrix, + bool isTransposeB = false) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "SetTensorBWithCopy not support when enableMixDualMaster is enabled"); + ASSERT(A_TYPE::pos != TPosition::TSCM); + kfcMsg_.body.isTransB = static_cast(isTransposeB); + kfcMsg_.body.setTensorB = 1; + kfcMsg_.body.isFirstIter = 1; +#if defined(__DAV_C310__) + kfcMsg_.body.bAddr = GetGMAddrAndCopyUB(gm.address_, rightMatrix); + sizeBmatrix_ = rightMatrix.GetSize() * sizeof(SrcBT); +#else + kfcMsg_.body.bAddr = GetGMAddrAndCopyUB(gm.GetPhyAddr(), rightMatrix); + kfcMsg_.body.sizeBmatrix = rightMatrix.GetSize() * sizeof(SrcBT); +#endif + } + +#if defined(__DAV_C310__) + __aicore__ inline void SetBias(const LocalTensor& inputBias) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetBias localTensor not support when enableMixDualMaster is enabled"); + kfcMsg_.body.setTensorBias = 1; + 
kfcMsg_.body.isFirstIter = 1; + if constexpr (BIAS_TYPE::pos == TPosition::TSCM) { + kfcMsg_.body.biasAddr = GetTscmAddr(inputBias); + sizeBiasmatrix_ = inputBias.GetSize() * sizeof(BiasT); + } else { + MSG_POS MsgMatmulL1Addr *matmulL1AddrMsg = + (MSG_POS MsgMatmulL1Addr *)GetMatmulL1AddrMsg(GetSubBlockIdxImpl(), this->instIdx); + while (!(matmulL1AddrMsg->valid)) { + } + kfcMsg_.body.biasAddr = matmulL1AddrMsg->l1biasAddr; + biasAddr_ = (uint64_t)inputBias.GetPhyAddr(); + sizeBiasmatrix_ = inputBias.GetSize() * sizeof(BiasT); + } + }; +#else + __aicore__ inline void SetBias(const LocalTensor& inputBias) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if (__CCE_AICORE__ == 220) + ASSERT("SetBias localTensor not support when enableMixDualMaster is enabled"); +#endif + return; + } + kfcMsg_.body.setTensorBias = 1; + if constexpr (BIAS_TYPE::pos == TPosition::TSCM) { + kfcMsg_.body.biasAddr = GetTscmAddr(inputBias); + } else { + kfcMsg_.body.biasAddr = GetGlobalAddr(inputBias); + } + }; +#endif + + __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetTensorA(gm, isTransposeA); + } +#endif + return; + } +#if defined(__DAV_C310__) + static_assert((GetPhyType(A_TYPE::pos) == Hardware::GM), + "SetTensorA GlobalTensor not support when A_TYPE position is not GM"); +#endif + ASSERT(isTransposeA <= A_TYPE::isTrans && + "It is not allowed to do A transpose when matmul A transpose is not defined."); + kfcMsg_.body.isTransA = static_cast(isTransposeA); + kfcMsg_.body.setTensorA = 1; + kfcMsg_.body.isFirstIter = 1; +#if defined(__DAV_C310__) + kfcMsg_.body.aAddr = reinterpret_cast(gm.address_); + sizeAmatrix_ = gm.GetSize() * sizeof(SrcAT); +#else + kfcMsg_.body.aAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.sizeAmatrix = gm.GetSize() * sizeof(SrcAT); +#endif + } + + __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetTensorB(gm, isTransposeB); + } +#endif + return; + } +#if defined(__DAV_C310__) + static_assert((GetPhyType(B_TYPE::pos) == Hardware::GM), + "SetTensorB GlobalTensor not support when B_TYPE position is not GM"); +#endif + ASSERT(isTransposeB <= B_TYPE::isTrans && + "It is not allowed to do B transpose when matmul B transpose is not defined."); + kfcMsg_.body.isTransB = static_cast(isTransposeB); + kfcMsg_.body.setTensorB = 1; + kfcMsg_.body.isFirstIter = 1; +#if defined(__DAV_C310__) + kfcMsg_.body.bAddr = reinterpret_cast(gm.address_); + sizeBmatrix_ = gm.GetSize() * sizeof(SrcBT); +#else + kfcMsg_.body.bAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.sizeBmatrix = gm.GetSize() * sizeof(SrcBT); +#endif + } + +#if defined(__DAV_C310__) + template + __aicore__ inline void SetSelfDefineData(T dataPtr) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetSelfDefineData(dataPtr); + } +#endif + return; + } + ASSERT(sizeof(T) % 4 == 0); + uint32_t *ptr = reinterpret_cast(&dataPtr); + if constexpr (sizeof(T) == 4) { + kfcMsg_.userCustomData = *ptr; + kfcMsg_.body.userInfoType = 1; + } else if constexpr (sizeof(T) == 8) { + kfcMsg_.userCustomData = (*ptr); + kfcMsg_.body.userCustomData = *(ptr + 1); + kfcMsg_.body.userInfoType = 1; + } else 
{ + // send msg + uint32_t *ptrMsg = reinterpret_cast(&(kfcMsg_.body)); + for (int i = 0; i < sizeof(T) / sizeof(uint32_t); i++) { + *(ptrMsg + i) = *(ptr + i); + } + PostMessage(); + } + } +#else + __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetSelfDefineData(dataPtr); + } +#endif + return; + } + kfcMsg_.body.dataPtr = dataPtr; + } +#endif + + __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetUserDefInfo(tilingPtr); + } +#endif + return; + } + kfcMsg_.userDefInfo.tilingPtr = tilingPtr; +#if defined(__DAV_C310__) + kfcMsg_.userCustomData = 1; +#endif + PostMessage(); + } + + __aicore__ inline void SetSparseIndex(const GlobalTensor& indexGlobal) + { + ASSERT("SetSparseIndex is not supported in matmul client."); + return; + } + + __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetQuantScalar(quantScalar); + } +#endif + return; + } + kfcMsg_.body.setQuant = 1; + kfcMsg_.body.quantMode = 1; + kfcMsg_.body.quantScalar = quantScalar; + } + + __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetQuantVector(quantTensor); + } +#endif + return; + } + kfcMsg_.body.setQuant = 1; + kfcMsg_.body.quantMode = VECTOR_QUANT_MODE; + kfcMsg_.body.quantAddr = reinterpret_cast(quantTensor.GetPhyAddr()); + kfcMsg_.body.quantSize = quantTensor.GetSize() * sizeof(uint64_t); + } + + __aicore__ inline void SetBias(const GlobalTensor& biasGlobal) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetBias(biasGlobal); + } +#endif + return; + } + kfcMsg_.body.biasAddr = reinterpret_cast(biasGlobal.GetPhyAddr()); + kfcMsg_.body.setTensorBias = 1; + } + + __aicore__ inline void SetTensorA(SrcAT aScalar) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetTensorA(aScalar); + } +#endif + return; + } + auto temp1 = (uint8_t*)&(aScalar); + auto temp2 = reinterpret_cast(&(kfcMsg_.body.aAddr)); + + for (int i = 0; i < sizeof(SrcAT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + kfcMsg_.body.setTensorA = 1; + } + + __aicore__ inline void SetTensorB(SrcBT bScalar) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.SetTensorB(bScalar); + } +#endif + return; + } + auto temp1 = (uint8_t*)&(bScalar); + auto temp2 = reinterpret_cast(&(kfcMsg_.body.aAddr)); + + for (int i = 0; i < sizeof(SrcBT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + kfcMsg_.body.setTensorB = 1; + } + + __aicore__ inline void DisableBias() + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.DisableBias(); + } +#endif + return; + } + kfcMsg_.body.setTensorBias = 0; + } + + __aicore__ inline void ClearBias() + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + 
cubeObj.cubeObj[0].mul.ClearBias(); + } +#endif + return; + } + DisableBias(); + } + + __aicore__ inline void End() + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { + return; + } + if (isSyncGetC) { + PostMessage(); + } + } + + template __aicore__ inline bool Iterate(bool enPartialSum, + const LocalTensor& localCmatrix) + { + return false; + } + + template __aicore__ inline bool Iterate(bool enPartialSum = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "Iterate not support when enableMixDualMaster is enabled."); + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + if (unlikely(kfcMsg_.body.isFirstIter)) { + cntIter_ = 0; + cOffset_ = 0; + curProcess = 0; + } else { + if (++cntIter_ >= mnIter_) { + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + return false; + } + if constexpr (!sync) { + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + return true; + } + } + + if constexpr (!sync) { // Asynchronous mode. Only UB. +#if !defined(__DAV_C310__) + ASSERT(cacheWorkspaceAddr != 0); // The cache address must be configured in asynchronous mode. + ASSERT(PhyPosIsUB(C_TYPE::pos)); // Asynchronous mode. Only UB. +#endif + } + + isSyncGetC = sync; + + // Synchronous mode. no cache for the first time + kfcMsg_.body.enPartialSum = enPartialSum; + kfcMsg_.body.sync = sync; + kfcMsg_.body.cAddr = reinterpret_cast(cacheWorkspaceAddr); +#if defined(__DAV_C310__) + kfcMsg_.body.hasSetWorkspace = (cacheWorkspaceAddr != 0); + PrepareABFromGM(); + const bool isTransA = kfcMsg_.body.isTransA; // kfcMsg body will be reset after postMessage + const bool isTransB = kfcMsg_.body.isTransB; + const bool isTransScaleA = kfcMsg_.body.quantMode & 0b01; + const bool isTransScaleB = (kfcMsg_.body.quantMode >> 1) & 0b01; + const bool isBias = kfcMsg_.body.setTensorBias; +#endif + PostMessage(); + SyncCubeWithVec(); +#if defined(__DAV_C310__) + // wait and copy data from UB->L1 + PrepareABFromUb(isTransA, isTransB, isBias, isTransScaleA, isTransScaleB); +#endif + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + return true; + } + + // Only support the mode that the IterateAll is asynchronous and GM output is continuous. + // In discontinuous scenarios, the system stops responding. + __aicore__ inline void WaitIterateAll() + { + ASSERT(!isSyncGetC); // Must be asynchronous mode + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + return; + } +#endif + WaitEvent(this->instIdx); + return; + } +#if defined(__DAV_C310__) + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + CrossCoreWaitFlag(waitFixpId); + } else { + CrossCoreWaitFlag(waitFixpId); + } +#else + auto intraId = this->devEvtID; + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (GetSubBlockIdxImpl() == 1) { + intraId = this->devEvtID - 1; + } + } + WaitEvent(intraId); +#endif + } + + // Only support the mode that the IterateAll is asynchronous and GM output is continuous. + // In discontinuous scenarios, the system stops responding. 
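+    // Illustrative usage sketch (assumed names, not taken from this patch): a typical
+    // asynchronous flow pairs IterateAll<false> with WaitIterateAll; IterateBatch pairs
+    // with WaitIterateBatch in the same way. The instance `mm`, the tensors `aGlobal`,
+    // `bGlobal`, `cGlobal`, and the assumption that the first template parameter of
+    // IterateAll is the `sync` flag are all illustrative.
+    //     mm.SetTensorA(aGlobal);
+    //     mm.SetTensorB(bGlobal);
+    //     mm.IterateAll<false>(cGlobal);  // post the request without blocking
+    //     /* ... overlap independent vector work here ... */
+    //     mm.WaitIterateAll();            // block until the cube side has written cGlobal
+    //     mm.End();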
+ __aicore__ inline void WaitIterateBatch() + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "WaitIterateBatch not support when enableMixDualMaster is enabled"); + ASSERT(!isSyncGetC); // Must be asynchronous mode +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + WaitEvent(this->devEvtID); +#endif + } + +#if defined(__DAV_C310__) + template + __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false) + { + static_assert(!(ToMatmulConfig(MM_CFG).enableMixDualMaster && !(A_TYPE::ibShare && B_TYPE::ibShare)), + "IBShare in A/BTYPE should be true when enableMixDualMaster is enabled."); + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + cubeObj.cubeObj[0].mul.IterateAll(gm, enAtomic, enSequentialWrite, waitIterateAll, fakeMsg); + if (sync || waitIterateAll) { + CrossCoreSetFlag( + GetIntraFlagId(cubeObj.cubeObj[0].instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), 0U)); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreSetFlag(GetIntraFlagId(cubeObj.cubeObj[0].instID, + static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), 1U)); // 1 means sub_block 1 + } + } + cubeObj.cubeObj[0].mul.End(); + return; + } +#endif + PrepareABFromGM(); + if constexpr (sync) { + CrossCoreWaitFlag(waitFixpId); + } + return; + } + + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + cntIter_ = 0; // input from ub only copy once + } + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + ASSERT(kfcMsg_.body.isFirstIter == 1); + kfcMsg_.body.iterateFakeMsg = fakeMsg; + kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.sync = sync; + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.waitIterateAll = waitIterateAll; + + PrepareABFromGM(); + const bool isTransA = kfcMsg_.body.isTransA; // kfcMsg body will be reset after postMsg + const bool isTransB = kfcMsg_.body.isTransB; + const bool isTransScaleA = kfcMsg_.body.quantMode & 0b01; + const bool isTransScaleB = (kfcMsg_.body.quantMode >> 1) & 0b01; + const bool isBias = kfcMsg_.body.setTensorBias; + PostMessage(); + PrepareABFromUb(isTransA, isTransB, isBias, isTransScaleA, isTransScaleB); + if constexpr (sync) { + CrossCoreWaitFlag(waitFixpId); + } + isSyncGetC = sync; + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + } + + template + __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false) + { + static_assert(!(ToMatmulConfig(MM_CFG).enableMixDualMaster && !(A_TYPE::ibShare && B_TYPE::ibShare)), + "IBShare in A/BTYPE should be true when enableMixDualMaster is enabled."); + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { + IterateAllCPU(ubCmatrix, enAtomic, enSequentialWrite, waitIterateAll); + PrepareABFromGM(); + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + this->instIdx); + if constexpr (sync) { + CrossCoreWaitFlag(waitFixpId); + } + return; + } + ASSERT(enAtomic == 0); + ASSERT(kfcMsg_.body.isFirstIter == 1); + kfcMsg_.body.cAddr = GetTscmAddr(ubCmatrix); + if (ubCmatrix.GetPosition() == static_cast(TPosition::TSCM)) { + kfcMsg_.body.cIsTscm = 1; + } + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.sync = sync; + kfcMsg_.body.waitIterateAll = waitIterateAll; + if constexpr 
((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + cntIter_ = 0; + } + ASSERT(kfcMsg_.body.enSequentialWrite == 0); + PrepareABFromGM(); + const bool isTransA = kfcMsg_.body.isTransA; + const bool isTransB = kfcMsg_.body.isTransB; + const bool isTransScaleA = kfcMsg_.body.quantMode & 0b01; + const bool isTransScaleB = (kfcMsg_.body.quantMode >> 1) & 0b01; + const bool isBias = kfcMsg_.body.setTensorBias; + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + this->instIdx); + PostMessage(); + PrepareABFromUb(isTransA, isTransB, isBias, isTransScaleA, isTransScaleB); + if constexpr (sync) { + CrossCoreWaitFlag(waitFixpId); + } + isSyncGetC = sync; + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + } +#else + template + __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { + constexpr uint16_t eventID = 9U; +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + WaitEvent(eventID); + cubeObj.cubeObj[0].mul.IterateAll(gm, enAtomic, enSequentialWrite, waitIterateAll, fakeMsg); + if (sync || waitIterateAll) { + NotifyEvent(cubeObj.cubeObj[0].instID); + } + cubeObj.cubeObj[0].mul.End(); + return; + } +#endif + NotifyEvent(eventID); + if constexpr(sync) { + WaitEvent(this->instIdx); + } + return; + } + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + ASSERT(kfcMsg_.body.isFirstIter == 1); + kfcMsg_.body.iterateFakeMsg = fakeMsg; + kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.sync = sync; + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.waitIterateAll = waitIterateAll; + PostMessage(); + SyncCubeWithVec(); + if constexpr (sync) { + auto intraId = this->devEvtID; + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (GetSubBlockIdxImpl() == 1) { + intraId = this->devEvtID - 1; + } + } + WaitEvent(intraId); + } + isSyncGetC = sync; + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + } + + template + __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0) + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster){ +#if (__CCE_AICORE__ == 220) + ASSERT("IterateAll localTensor not support when enableMixDualMaster is enabled"); +#endif + return; + } + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + ASSERT(sync == true); + ASSERT(enAtomic == 0); + ASSERT(kfcMsg_.body.isFirstIter == 1); + ASSERT((PhyPosIsL1(C_TYPE::pos)) && "IterateAll LocalTensor only support TPosition A1 or B1"); + ASSERT(!(A_TYPE::ibShare && B_TYPE::ibShare) && "IterateAll LocalTensor not support when sameab" + " is enabled"); + if (ubCmatrix.GetPosition() == static_cast(TPosition::TSCM)) { + kfcMsg_.body.cAddr = GetTscmAddr(ubCmatrix); + kfcMsg_.body.cIsTscm = 1; + } else { + kfcMsg_.body.cAddr = GetGlobalAddr(ubCmatrix); + } + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.sync = sync; + ASSERT(kfcMsg_.body.enSequentialWrite == 0); + GM_ADDR gmDataAddr = reinterpret_cast(kfcMsg_.body.cAddr); + PostMessage(); + + if constexpr (sync) { + WaitEvent(this->devEvtID); + CopyToUB(ubCmatrix, gmDataAddr, ubCmatrix.GetSize()); + } + isSyncGetC = sync; + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + } +#endif + + template + __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + 
const uint32_t matrixStrideC = 0, const bool enPartialSum = false, const uint8_t enAtomic = 0) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "IterateBatch not support when enableMixDualMaster is enabled"); + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + ASSERT(kfcMsg_.body.isFirstIter == 1); + ASSERT(!(A_TYPE::ibShare && B_TYPE::ibShare) && "IterateBatch not support when when sameab" + " is enabled"); + kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.sync = sync; + kfcMsg_.body.batchA = batchA; + kfcMsg_.body.batchB = batchB; + kfcMsg_.body.matrixStrideA = matrixStrideA; + kfcMsg_.body.matrixStrideB = matrixStrideB; + kfcMsg_.body.matrixStrideC = matrixStrideC; + kfcMsg_.body.waitIterateBatch = waitIterateBatch; + kfcMsg_.body.enPartialSum = enPartialSum; + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.setBatch = 1; + + -- Gitee From 9ef0190778befaa4e8fd5ed1d5bb1b8037796003 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 09:10:57 +0000 Subject: [PATCH 53/56] add Signed-off-by: jiangchengcheng-on --- lib/matmul/matmul_client_new2.h | 1520 +++++++++++++++++++++++++++++++ 1 file changed, 1520 insertions(+) create mode 100644 lib/matmul/matmul_client_new2.h diff --git a/lib/matmul/matmul_client_new2.h b/lib/matmul/matmul_client_new2.h new file mode 100644 index 00000000..e46a84ff --- /dev/null +++ b/lib/matmul/matmul_client_new2.h @@ -0,0 +1,1520 @@ +#if defined(__DAV_C310__) + PrepareABFromGM(); +#endif + PostMessage(); + + if constexpr (sync) { +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + WaitEvent(this->devEvtID); +#endif + } + isSyncGetC = sync; + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + } + + template + __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0, const bool enPartialSum = false, const uint8_t enAtomic = 0) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "IterateBatch not support when enableMixDualMaster is enabled"); + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + ASSERT(sync == true); + ASSERT(kfcMsg_.body.isFirstIter == 1); + ASSERT(!(A_TYPE::ibShare && B_TYPE::ibShare) && "IterateBatch not support when sameab is enabled"); + if (ubCmatrix.GetPosition() == static_cast(TPosition::TSCM)) { + kfcMsg_.body.cAddr = GetTscmAddr(ubCmatrix); + kfcMsg_.body.cIsTscm = 1; + } else { +#if defined(__DAV_C310__) + kfcMsg_.body.cAddr = reinterpret_cast(ubCmatrix.GetPhyAddr()); +#else + kfcMsg_.body.cAddr = GetGlobalAddr(ubCmatrix); +#endif + } + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.sync = sync; + kfcMsg_.body.batchA = batchA; + kfcMsg_.body.batchB = batchB; + kfcMsg_.body.matrixStrideA = matrixStrideA; + kfcMsg_.body.matrixStrideB = matrixStrideB; + kfcMsg_.body.matrixStrideC = matrixStrideC; + kfcMsg_.body.enPartialSum = enPartialSum; + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.setBatch = 1; + GM_ADDR gmDataAddr = reinterpret_cast(kfcMsg_.body.cAddr); +#if defined(__DAV_C310__) + PrepareABFromGM(); + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + this->instIdx); +#endif + PostMessage(); + + if constexpr (sync) { +#if defined(__DAV_C310__) + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + CrossCoreWaitFlag(waitFixpId); + } +#else + WaitEvent(this->devEvtID); + 
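+            // The batch result was staged at gmDataAddr; once the event arrives, copy it
+            // back into the caller's UB tensor.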
CopyToUB(ubCmatrix, gmDataAddr, ubCmatrix.GetSize()); +#endif + } + isSyncGetC = sync; + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + } + + template + __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0, const bool enPartialSum = false, const uint8_t enAtomic = 0) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "IterateNBatch not support when enableMixDualMaster is enabled."); + static_assert(A_TYPE::layout != LayoutMode::NONE && B_TYPE::layout != LayoutMode::NONE, + "BMM does not support the layout being NONE"); + if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { + return; + } + TRACE_START(TraceId::KFC_CLIENT_POST_MSG); + cntIter_ = 0; + cOffset_ = 0; + curProcess = 0; + ASSERT(kfcMsg_.body.isFirstIter == 1); + ASSERT(cacheWorkspaceAddr); + ASSERT(!(A_TYPE::ibShare && B_TYPE::ibShare) && "IterateNBatch not support when sameab is enabled"); + kfcMsg_.body.cAddr = reinterpret_cast(cacheWorkspaceAddr); + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.sync = sync; + kfcMsg_.body.batchLoop = batchLoop; + kfcMsg_.body.batchA = batchA; + kfcMsg_.body.batchB = batchB; + kfcMsg_.body.matrixStrideA = matrixStrideA; + kfcMsg_.body.matrixStrideB = matrixStrideB; + kfcMsg_.body.matrixStrideC = matrixStrideC; + kfcMsg_.body.enPartialSum = enPartialSum; + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.setBatch = 1; + kfcMsg_.body.waitIterateBatch = waitIterateBatch; + PostMessage(); + if constexpr (sync) { +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + WaitEvent(this->devEvtID); +#endif + } + isSyncGetC = sync; + TRACE_STOP(TraceId::KFC_CLIENT_POST_MSG); + } + +#if defined(__DAV_C310__) + template + __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "GetTensorC not support when enableMixDualMaster is enabled."); + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); + ASSERT(kfcMsg_.body.isFirstIter == 0); + if (!isSyncGetC) { // Asynchronous + // If the buffer is not configured, the output is stored in the L0C buffer. 
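+            // Asynchronous path: pack the destination GM address and the atomic /
+            // sequential-write flags into a message slot, post it to the cube core, and
+            // wait on the fix-pipe cross-core flag before returning.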
+ if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (GetSubBlockIdxImpl() == 1) { + if constexpr (!IsBasic(ToMatmulConfig(MM_CFG))) { + CrossCoreWaitFlag(waitFixpId); + } + return; + } + } + auto msg = client->AllocMessage(); + msg->body.cAddr = reinterpret_cast(gm.GetPhyAddr()); + uint32_t flag = 0; + flag |= enAtomic; + flag |= (enSequentialWrite << (sizeof(uint8_t) * ONE_BYTE_BIT_SIZE)); + __ssbuf__ uint32_t *ptrMsg = reinterpret_cast<__ssbuf__ uint32_t *>(&(msg->body)); + *ptrMsg = flag; + msg->head = KfcMsgMakeFlag(KFC_Enum::MMFUN_GET_TENSOR_C, this->instIdx); + client->PostMessage(msg); + if constexpr (!(IsBasic(ToMatmulConfig(MM_CFG)) && (A_TYPE::ibShare && B_TYPE::ibShare))) { + CrossCoreWaitFlag(waitFixpId); + } + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + return; + } + + kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.sync = sync; + + PostMessage(); + if constexpr (sync) { + CrossCoreWaitFlag(waitFixpId); + } + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + } + + template + __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "GetTensorC not support when enableMixDualMaster is enabled."); + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_UB); + ASSERT(kfcMsg_.body.isFirstIter == 0); + uint64_t singleSize; + if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) { + singleSize = ToMatmulConfig(MM_CFG).singleCoreMN; + } else { + singleSize = cubeTiling.GetSingleCoreM() * cubeTiling.GetSingleCoreN(); + } + // Asynchronous + if (!isSyncGetC) { + ASSERT(enAtomic == 0); + // check if setworkspace enabled and less than total ub size to decide copy gm to ub or not + if (cacheWorkspaceAddr == 0) { + GetTensorCWithoutGm(c, enAtomic, enSequentialWrite); + return; + } + // If buffer is configured, block MTE2 to ensure that the output can be transported. + if (curProcess < INC_PROCESS_CHECK) { + ++curProcess; + CrossCoreWaitFlag(waitFixpId); + } + + uint32_t baseSize; + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + baseSize = ToMatmulConfig(MM_CFG).baseMN * sizeof(typename C_TYPE::T); + } else { + baseSize = cubeTiling.GetBaseM() * cubeTiling.GetBaseN() * sizeof(typename C_TYPE::T); + } + if constexpr (doPad) { + CopyToUBPad(c, cacheWorkspaceAddr + cOffset_, height, width, srcGap, dstGap); + } else { + CopyToUB(c, cacheWorkspaceAddr + cOffset_, c.GetSize()); + } + cOffset_ += baseSize; + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_UB); + return; + } + + // Must be the same as Iterate. + ASSERT(sync == true); + ASSERT(enAtomic == 0); + kfcMsg_.body.cAddr = reinterpret_cast(c.GetPhyAddr()); + kfcMsg_.body.sync = 1; + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + this->instIdx); + PostMessage(); + CrossCoreWaitFlag(waitFixpId); + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_UB); + return; + } +#else + // Synchronous interface. The user sends the GM address, which contains 64 bits. 
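+    // Illustrative usage sketch for the synchronous path (the names `mm` and `cGlobal`
+    // are assumptions, not taken from this patch): fetch one basic block per Iterate call.
+    //     while (mm.Iterate()) {          // synchronous by default
+    //         mm.GetTensorC(cGlobal);     // blocks until the block has been written out
+    //     }
+    //     mm.End();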
+ template + __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "GetTensorC not support when enableMixDualMaster is enabled"); + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); + ASSERT(kfcMsg_.body.isFirstIter == 0); + ASSERT(isSyncGetC); // The mode must be synchronous. + + kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.sync = sync; + + PostMessage(); + + if constexpr (sync) { + WaitEvent(this->devEvtID); + } + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + } + + // Synchronous interface + template + __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "GetTensorC not support when enableMixDualMaster is enabled"); + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_UB); + ASSERT(kfcMsg_.body.isFirstIter == 0); + if (!isSyncGetC) { // Asynchronous + ASSERT(cacheWorkspaceAddr); + ASSERT(enAtomic == 0); + + if (curProcess < INC_PROCESS_CHECK) { + ++curProcess; + WaitEvent(this->devEvtID); + } + + uint32_t size; + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + size = ToMatmulConfig(MM_CFG).baseMN * sizeof(typename C_TYPE::T); + } else { + size = cubeTiling.GetBaseM() * cubeTiling.GetBaseN() * sizeof(typename C_TYPE::T); + } + if constexpr (doPad) { + CopyToUBPad(c, cacheWorkspaceAddr + cOffset_, height, width, srcGap, dstGap); + } else { + CopyToUB(c, cacheWorkspaceAddr + cOffset_, c.GetSize()); + } + cOffset_ += size; + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_UB); + return; + } + + ASSERT(sync == true); // must be the same as Iterate. 
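+        // Synchronous path for a UB-resident C: the local tensor is staged through the
+        // address returned by GetGlobalAddr(), and once the cube core signals completion
+        // the block is copied back into UB below (CopyToUBPad or CopyToUB, depending on doPad).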
+ ASSERT(enAtomic == 0); + kfcMsg_.body.cAddr = GetGlobalAddr(c); + kfcMsg_.body.sync = 1; + kfcMsg_.body.enAtomic = (uint8_t)(enAtomic); + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + + GM_ADDR gmDataAddr = reinterpret_cast(kfcMsg_.body.cAddr); + PostMessage(); + + WaitEvent(this->devEvtID); + + if constexpr (PhyPosIsUB(C_TYPE::pos)) { + if constexpr (doPad) { + CopyToUBPad(c, (__gm__ DstT*)gmDataAddr, height, width); + } else { + CopyToUB(c, (__gm__ DstT*)gmDataAddr, c.GetSize()); + } + } + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_UB); + return; + } +#endif + + template + __aicore__ inline void GetTensorC(const GlobalTensor& gm, const LocalTensor& co2Local, + uint8_t enAtomic = 0, bool enSequentialWrite = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "GetTensorC not support when enableMixDualMaster is enabled."); + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); + ASSERT(kfcMsg_.body.isFirstIter == 0); + ASSERT(isSyncGetC); // must synchronization mode + + kfcMsg_.body.cAddr = reinterpret_cast(gm.GetPhyAddr()); + kfcMsg_.body.enAtomic = (uint8_t)enAtomic; + kfcMsg_.body.enSequentialWrite = enSequentialWrite; + kfcMsg_.body.sync = sync; + + PostMessage(); + + if constexpr (sync) { +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + WaitEvent(this->devEvtID); +#endif + } + + CopyToUB(co2Local, gm.GetPhyAddr(), co2Local.GetSize()); + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + } + + template + __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "GetTensorC not support when enableMixDualMaster is enabled."); + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); + ASSERT(kfcMsg_.body.isFirstIter == 0); + ASSERT(!isSyncGetC); // Asynchronous only + ASSERT(cacheWorkspaceAddr); + if (curProcess < INC_PROCESS_CHECK) { + ++curProcess; +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + auto intraId = this->devEvtID; + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (GetSubBlockIdxImpl() == 1) { + intraId = this->devEvtID - 1; + } + } + WaitEvent(intraId); +#endif + } + uint32_t size; + GlobalTensor global; + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + size = ToMatmulConfig(MM_CFG).baseMN * sizeof(typename C_TYPE::T); + global.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(cacheWorkspaceAddr + cOffset_), + ToMatmulConfig(MM_CFG).baseMN); + } else { + size = cubeTiling.GetBaseM() * cubeTiling.GetBaseN() * sizeof(typename C_TYPE::T); + global.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(cacheWorkspaceAddr + cOffset_), + cubeTiling.GetBaseM() * cubeTiling.GetBaseN()); + } + cOffset_ += size; + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + return global; + } + + template + __aicore__ inline GlobalTensor GetBatchTensorC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "GetBatchTensorC not support when enableMixDualMaster is enabled"); + GlobalTensor global; + if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { + return global; + } + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); + ASSERT(kfcMsg_.body.isFirstIter == 0); + ASSERT(!isSyncGetC); // only support async + ASSERT(cacheWorkspaceAddr); + if (curProcess < INC_PROCESS_CHECK) { + ++curProcess; +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + WaitEvent(this->devEvtID); +#endif + } + + uint32_t batch = batchA > batchB ? 
batchA : batchB; + global.SetGlobalBuffer(reinterpret_cast<__gm__ DstT *>(cacheWorkspaceAddr + cOffset_), + batch * cubeTiling.GetSingleCoreM() * cubeTiling.GetSingleCoreN()); + uint32_t size = batch * cubeTiling.GetSingleCoreM() * cubeTiling.GetSingleCoreN() * sizeof(typename C_TYPE::T); + cOffset_ += size; + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + return global; + } + + template + __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "GetBatchC not support when enableMixDualMaster is enabled"); + return GetBatchTensorC(batchA, batchB, enSequentialWrite); + } + + // coordinated use with IterateNBatch, get single IterateBatch outcome + template + __aicore__ inline void GetBatchTensorC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "GetBatchTensorC not support when enableMixDualMaster is enabled"); + if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { + return; + } + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); + ASSERT(kfcMsg_.body.isFirstIter == 0); + ASSERT(cacheWorkspaceAddr); + ASSERT(enSequentialWrite); + ASSERT(!isSyncGetC); // only support async + + if (curProcess < INC_PROCESS_CHECK) { + ++curProcess; +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + WaitEvent(this->devEvtID); +#endif + } + + uint32_t batch = batchA > batchB ? batchA : batchB; + uint32_t size = batch * cubeTiling.GetSingleCoreM() * cubeTiling.GetSingleCoreN() * sizeof(typename C_TYPE::T); + CopyToUB(c, cacheWorkspaceAddr + cOffset_, c.GetSize()); + cOffset_ += size; + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + } + + template + __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "GetBatchC not support when enableMixDualMaster is enabled"); + GetBatchTensorC(c, batchA, batchB, enSequentialWrite); + } + + __aicore__ inline void AsyncGetTensorC(const LocalTensor& c) + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "AsyncGetTensorC not support when enableMixDualMaster is enabled"); + TRACE_START(TraceId::KFC_CLIENT_REV_MSG_GM); + ASSERT(kfcMsg_.body.isFirstIter == 0); + ASSERT(cacheWorkspaceAddr); + ASSERT(!isSyncGetC); + + if (curProcess < INC_PROCESS_CHECK) { + ++curProcess; +#if defined(__DAV_C310__) + CrossCoreWaitFlag(waitFixpId); +#else + WaitEvent(this->devEvtID); +#endif + } + + uint32_t size = cubeTiling.GetBaseM() * cubeTiling.GetBaseN() * sizeof(typename C_TYPE::T); + CopyToUB(c, cacheWorkspaceAddr + cOffset_, c.GetSize()); + cOffset_ += size; + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_GM); + return; + } + + __aicore__ inline void WaitGetTensorC() + { + ASSERT(!ToMatmulConfig(MM_CFG).enableMixDualMaster && + "WaitGetTensorC not support when enableMixDualMaster is enabled"); + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); + SetFlag(eventID); + WaitFlag(eventID); + } + + template + __aicore__ inline MatrixOffset GetOffsetC() + { + if constexpr (isTurnOnDebug) { + static_assert(!isTurnOnDebug, "unsupported!"); + } + } + + __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) + { +#if defined(__DAV_C310__) + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetLocalWorkspace not support when enableMixDualMaster is enabled."); + localWorkspace_ = tmpBuffer; +#endif + } + 
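+    // Asynchronous fetch, sketched (illustrative only; assumes the cache workspace
+    // cacheWorkspaceAddr has been configured so results land there):
+    //   mm.template Iterate<false>();   // post the work without waiting
+    //   mm.AsyncGetTensorC(cLocal);     // copy one base block from the workspace into UB
+    //   mm.WaitGetTensorC();            // MTE2 -> V barrier before cLocal is consumed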
+#if defined(__DAV_C310__) + using ScaleT = float8_e8m0_t; + + __aicore__ inline void SetTensorScaleA(const GlobalTensor& gm, bool isTransposeScaleA = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleA not support when enableMixDualMaster is enabled."); + ASSERT(isTransposeScaleA <= A_TYPE::isScaleTrans && + "It is not allowed to do scaleA transpose when matmul scaleA transpose is not defined."); + if constexpr (A_TYPE::scaleFormat == CubeFormat::NZ) { + ASSERT(isTransposeScaleA == false && + "It is not allowed to do scaleA transpose when matmul scaleA CubeFormat is NZ."); + } + kfcMsg_.body.quantMode = (kfcMsg_.body.quantMode & 0x10) | (static_cast(isTransposeScaleA) & 0x1); + kfcMsg_.body.quantAddr = reinterpret_cast(gm.address_); + } + + __aicore__ inline void SetTensorScaleA(const LocalTensor& leftMatrix, bool isTransposeScaleA = false) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleA not support when enableMixDualMaster is enabled."); + ASSERT(isTransposeScaleA <= A_TYPE::isScaleTrans && + "It is not allowed to do scaleA transpose when matmul scaleA transpose is not defined."); + if constexpr (A_TYPE::scaleFormat == CubeFormat::NZ) { + ASSERT(isTransposeScaleA == false && + "It is not allowed to do scaleA transpose when matmul scaleA CubeFormat is NZ."); + } + kfcMsg_.body.quantMode = (kfcMsg_.body.quantMode & 0b10) | static_cast(isTransposeScaleA); + if constexpr (PhyMxScalePosIsL1()) { + kfcMsg_.body.quantAddr = GetTscmAddr(leftMatrix); + sizeScaleAmatrix_ = leftMatrix.GetSize() * sizeof(ScaleT); + } else { + MSG_POS MsgMatmulL1Addr *matmulL1AddrMsg = + (MSG_POS MsgMatmulL1Addr *)GetMatmulL1AddrMsg(GetSubBlockIdxImpl(), this->instIdx); + while (!(matmulL1AddrMsg->valid)) { + } + kfcMsg_.body.quantAddr = matmulL1AddrMsg->l1aScaleAddr; + sizeScaleAmatrix_ = leftMatrix.GetSize() * sizeof(ScaleT); + aScaleAddr_ = (uint64_t)leftMatrix.GetPhyAddr(); + } + } + + __aicore__ inline void SetTensorScaleB(const GlobalTensor& gm, bool isTransposeScaleB = true) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleB not support when enableMixDualMaster is enabled."); + ASSERT(isTransposeScaleB <= B_TYPE::isScaleTrans && + "It is not allowed to do scaleB transpose when matmul scaleB transpose is not defined."); + if constexpr (B_TYPE::scaleFormat == CubeFormat::NZ) { + ASSERT(isTransposeScaleB == true && + "It is only allowed to do scaleB transpose when matmul scaleB CubeFormat is NZ."); + } + kfcMsg_.body.quantMode = (kfcMsg_.body.quantMode & 0b10) | (static_cast(isTransposeScaleB) << 1); + kfcMsg_.body.quantScalar = reinterpret_cast(gm.address_); + } + + __aicore__ inline void SetTensorScaleB(const LocalTensor& rightMatrix, bool isTransposeScaleB = true) + { + static_assert(!ToMatmulConfig(MM_CFG).enableMixDualMaster, + "SetTensorScaleB not support when enableMixDualMaster is enabled."); + ASSERT(isTransposeScaleB <= B_TYPE::isScaleTrans && + "It is not allowed to do scaleB transpose when matmul scaleB transpose is not defined."); + if constexpr (B_TYPE::scaleFormat == CubeFormat::NZ) { + ASSERT(isTransposeScaleB == true && + "It is only allowed to do scaleB transpose when matmul scaleB CubeFormat is NZ."); + } + kfcMsg_.body.quantMode = (kfcMsg_.body.quantMode & 0b10) | (static_cast(isTransposeScaleB) << 1); + if constexpr (PhyMxScalePosIsL1()) { + kfcMsg_.body.quantScalar = GetTscmAddr(rightMatrix); + sizeScaleBmatrix_ = rightMatrix.GetSize() * sizeof(ScaleT); + } else { + MSG_POS 
MsgMatmulL1Addr *matmulL1AddrMsg = + (MSG_POS MsgMatmulL1Addr *)GetMatmulL1AddrMsg(GetSubBlockIdxImpl(), this->instIdx); + while (!(matmulL1AddrMsg->valid)) { + } + kfcMsg_.body.quantScalar = matmulL1AddrMsg->l1bScaleAddr; + sizeScaleBmatrix_ = rightMatrix.GetSize() * sizeof(ScaleT); + bScaleAddr_ = (uint64_t)rightMatrix.GetPhyAddr(); + } + } +#endif + +#if ASCENDC_CPU_DEBUG +public: + // this is useless code just for cpu debug + typename MatmulInstAux::MATMUL cubeObj; +#endif + +private: + GM_ADDR cacheWorkspaceAddr; + // Multiple instances with only one message queue maintained. + // Use shared memory to get the queue. + KfcCommClient* client; + TPipe* tpipe; + MatmulTiling cubeTiling; + KfcMsg kfcMsg_; + + bool isSyncGetC; + uint16_t devEvtID; + uint16_t instIdx; + uint16_t curProcess; + + uint32_t mIter_; + uint32_t nIter_; + uint32_t cntIter_; + uint32_t mnIter_; + uint64_t cOffset_; + +#if defined(__DAV_C310__) + uint32_t sizeAmatrix_; + uint32_t sizeBmatrix_; + LocalTensor localWorkspace_ = LocalTensor(); + uint64_t aAddr_; + uint64_t bAddr_; + uint64_t biasAddr_; + uint8_t waitFixpId; + int32_t singleCoreM_; + int32_t singleCoreN_; + int32_t singleCoreK_; + int32_t c0Size_; + uint32_t sizeScaleAmatrix_; + uint32_t sizeScaleBmatrix_; + uint64_t aScaleAddr_; + uint64_t bScaleAddr_; + uint32_t sizeBiasmatrix_; +#endif + template + friend __aicore__ inline void InitKfcClient(T& cubeObj, U *tiling, TPipe *tpipe, KfcCommClient *client, int instIdx, + GM_ADDR workspace); + template friend struct AscendC::GetCubeObjConfig; + constexpr static bool enableMixDualMaster = ToMatmulConfig(MM_CFG).enableMixDualMaster; + constexpr static bool enableABShare = A_TYPE::ibShare && B_TYPE::ibShare; +private: + __aicore__ inline void InitStatic() + { + if (ToMatmulConfig(MM_CFG).singleCoreM == 0 && this->cubeTiling.IsNull()) { + return; + } + ASSERT(sizeof(KfcMsg) % CACHE_LINE_SIZE == 0); + + *((uint64_t*)&kfcMsg_) = 0; + *((uint64_t*)&(kfcMsg_.body)) = 0; + nIter_ = ConstCeil(this->cubeTiling.GetSingleCoreN(), this->cubeTiling.GetBaseN()); + mIter_ = ConstCeil(this->cubeTiling.GetSingleCoreM(), this->cubeTiling.GetBaseM()); + mnIter_ = nIter_ * mIter_; + cacheWorkspaceAddr = nullptr; +#if defined(__DAV_C310__) + singleCoreM_ = this->cubeTiling.GetSingleCoreM(); + singleCoreN_ = this->cubeTiling.GetSingleCoreN(); + singleCoreK_ = this->cubeTiling.GetSingleCoreK(); + if constexpr (ToMatmulConfig(MM_CFG).isPartialOutput) { + uint32_t kIter = ConstCeil(this->cubeTiling.GetSingleCoreK(), this->cubeTiling.GetBaseK()); + mnIter_ = nIter_ * mIter_ * kIter; + } +#endif + } + +#if !defined(__DAV_C310__) + template __aicore__ inline uint64_t CopyGlobalAddr(GM_ADDR& gmDataAddr, const LocalTensor& data) + { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventID); + WaitFlag(eventID); + + struct DataCopyParams param; + param.blockLen = data.GetSize() / AscendCUtils::GetC0Count(sizeof(T)); + GlobalTensor globalTensor; + globalTensor.SetGlobalBuffer((__gm__ T*)gmDataAddr); + DataCopy(globalTensor, data, param); + + return reinterpret_cast(gmDataAddr); + } + + template __aicore__ inline uint64_t GetGlobalAddr( + const LocalTensor& data) + { + uint64_t size = Ceil(data.GetSize() * sizeof(T), ONE_BLK_SIZE) * ONE_BLK_SIZE; + if constexpr (IsSameType::value) { + size /= INT4_TWO; + } + auto gmDataAddr = client->AllocUB(size, kfcMsg_.ubAddr); + + if constexpr (isCopy) { + return CopyGlobalAddr(gmDataAddr, data); + } + return reinterpret_cast(gmDataAddr); + } +#endif + + template 
__aicore__ inline uint64_t GetTscmAddr(const LocalTensor& data) + { +#if ASCENDC_CPU_DEBUG + ASSERT(GetTPipePtr() != nullptr && "tpipe cannot be nullptr when matmul client post msg"); + return GetAbsAddr(GetTPipePtr(), data); +#else + return (uint64_t)data.GetPhyAddr(); +#endif + } + +#if defined(__DAV_C310__) + template __aicore__ inline void PostMessage() + { + if constexpr (ToMatmulConfig(MM_CFG).enableMixDualMaster) { + *((uint32_t *)&kfcMsg_.body) = 0; + return; + } + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (GetSubBlockIdxImpl() == 1) { + client->PostSameABFakeMsg(funID, this->instIdx); + *((uint32_t *)&kfcMsg_.body) = 0; + return; + } + } + auto msg = client->AllocMessage(); + ASSERT(msg != nullptr && "msg cannot be nullptr when matmul client post msg"); + auto msgDst = reinterpret_cast<__ssbuf__ uint64_t *>(&(msg->body)); + auto msgSrc = reinterpret_cast(&(kfcMsg_.body)); + if constexpr (ToMatmulConfig(MM_CFG).enableQuantVector || + (ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0 || + (ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0 || + HasScalePosition::value || HasScalePosition::value) { + SendSSbufData<15>(msgSrc, msgDst); + } else if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { + SendSSbufData<9>(msgSrc, msgDst); + } else if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) { + SendSSbufData<8>(msgSrc, msgDst); + } else if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) { + SendSSbufData<7>(msgSrc, msgDst); + } else { + SendSSbufData<4>(msgSrc, msgDst); + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) { + msg->userCustomData = kfcMsg_.userCustomData; + } + msg->head = KfcMsgMakeFlag(funID, this->instIdx, A_TYPE::ibShare && B_TYPE::ibShare); + client->PostMessage(msg); + // clear flag + *((uint32_t *)&kfcMsg_.body) = 0; + } +#else + template __aicore__ inline void PostMessage() + { + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + ASSERT(DoMatmulNorm(MM_CFG) && "MM_CFG should use norm config when sameab is enabled"); + if (GetSubBlockIdxImpl() == 1) { // Do not send v1's message to cube + *((uint32_t *)&kfcMsg_.body) = 0; // Clear all flag bits. + kfcMsg_.ubAddr = -1; + return; + } + } + kfcMsg_.head = KfcMsgMakeFlag(funID, this->instIdx); + + auto msg = client->AllocMessage(); + ASSERT(msg != nullptr && "msg cannot be nullptr when matmul client post msg"); + + auto tmp1 = reinterpret_cast<__ubuf__ uint64_t*>(client->ubMsg); + auto tmp2 = reinterpret_cast(&kfcMsg_); + for (int i = 0; i < sizeof(kfcMsg_) / sizeof(uint64_t); i++, tmp1++, tmp2++) { + *tmp1 = *tmp2; + } + + client->PostMessage(msg); + + // clear flag + *((uint32_t*)&kfcMsg_.body) = 0; // Clear all flag bits. 
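+        // kfcMsg_ is reused by the next call, so the per-call body flags are dropped above
+        // and the cached UB address is reset before returning.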
+ kfcMsg_.ubAddr = -1; + } +#endif + +#if defined(__DAV_C310__) + template + __aicore__ inline void IterateAllCPU(const LocalTensor &ubCmatrix, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false) + { +#if ASCENDC_CPU_DEBUG + if ASCEND_IS_AIC { + if constexpr (GetPhyType(C_TYPE::pos) == Hardware::UB) { + CrossCoreWaitFlag( + GetIntraFlagId(cubeObj.cubeObj[0].instID, static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP), 0)); + CrossCoreWaitFlag( + cubeObj.cubeObj[0].instID + static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + INTRA_NUM); + } + cubeObj.cubeObj[0].mul.IterateAll(ubCmatrix, enAtomic); + if (sync || waitIterateAll) { + CrossCoreSetFlag( + GetIntraFlagId(cubeObj.cubeObj[0].instID, static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), 0U)); + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + CrossCoreSetFlag(GetIntraFlagId(cubeObj.cubeObj[0].instID, + static_cast(VEC_WAIT_INTRA_Enum::WAIT_FIXP), 1U)); // 1 means sub_block 1 + } + } + cubeObj.cubeObj[0].mul.End(); + return; + } +#endif + } + + __aicore__ inline void GetTensorCWithoutGm(const LocalTensor& c, uint8_t enAtomic = 0, + bool enSequentialWrite = false) + { + // If the cache is not configured, the output is stored in the L0C buffer. + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + if (GetSubBlockIdxImpl() == 1) { + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + + this->instIdx); + CrossCoreWaitFlag(waitFixpId); + return; + } + } + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::WAIT_FIXP) + this->instIdx); + if constexpr (!ToMatmulConfig(MM_CFG).enableMixDualMaster) { + auto msg = client->AllocMessage(); + msg->body.cAddr = reinterpret_cast(c.GetPhyAddr()); + uint32_t flag = 0; + flag |= enAtomic; + flag |= (enSequentialWrite << (sizeof(uint8_t) * ONE_BYTE_BIT_SIZE)); + __ssbuf__ uint32_t *ptrMsg = reinterpret_cast<__ssbuf__ uint32_t *>(&(msg->body)); + *ptrMsg = flag; + msg->head = KfcMsgMakeFlag(KFC_Enum::MMFUN_GET_TENSOR_C, this->instIdx); + client->PostMessage(msg); + } + CrossCoreWaitFlag(waitFixpId); + TRACE_STOP(TraceId::KFC_CLIENT_REV_MSG_UB); + return; + } + + __aicore__ inline void PrepareABFromGM() + { + // there is hidden logic in c220, so only C310 need to check if gm is ready + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::GM || GetPhyType(B_TYPE::pos) == Hardware::GM || + GetPhyType(BIAS_TYPE::pos) == Hardware::GM) { + if constexpr (!A_TYPE::ibShare && !B_TYPE::ibShare) { + // Op sometimes excute ub ->gm in MTE3 before using iterate, can find matched wait flag in matmul server + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::GM_L1_UB_GM)); + } + } + } + + __aicore__ inline void PrepareABFromUb(bool isTransA, bool isTransB, bool isBias, bool isTransScaleA, bool isTransScaleB) + { + constexpr bool isAnyInputFromUb = GetPhyType(A_TYPE::pos) == Hardware::UB || + GetPhyType(B_TYPE::pos) == Hardware::UB || PhyMxScalePosIsUB() || + PhyMxScalePosIsUB() || GetPhyType(BIAS_TYPE::pos) == Hardware::UB; + // Ensure that the condition judgment matches the function WaitAB on the mmserver. 
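+        // UB-resident inputs are staged into L1 by this vector core: wait for the cube side
+        // (UB_L1_L1_L0AB), copy A/B plus their scales and bias as needed, then set
+        // L1_L0AB_UB_L1 so the cube core can start loading L0A/L0B. In ITERATE_MODE_NORMAL
+        // the copy is issued only on the first iteration (cntIter_ == 0).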
+ if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) && + isAnyInputFromUb) { + // iterate mode only copy ub to l1 once + if (cntIter_ != 0) { + return; + } + } + + if constexpr (isAnyInputFromUb) { + CrossCoreWaitFlag(static_cast(VEC_WAIT_INTRA_Enum::UB_L1_L1_L0AB)); + } + + if constexpr (GetPhyType(A_TYPE::pos) == Hardware::UB) { + if constexpr (ToMatmulConfig(MM_CFG).singleCoreM != 0 && ToMatmulConfig(MM_CFG).singleCoreN != 0 && + ToMatmulConfig(MM_CFG).singleCoreK != 0 && IsStaticPaddingEnable(MM_CFG)) { + CopyUbAToL1StaticTiling(isTransA); + } else { + CopyUbAToL1(isTransA); + } + } + if constexpr (PhyMxScalePosIsUB()) { + CopyScaleUbAToL1(isTransScaleA); + } + if constexpr (GetPhyType(B_TYPE::pos) == Hardware::UB) { + CopyUbBToL1(isTransB); + } + if constexpr (PhyMxScalePosIsUB()) { + CopyScaleUbBToL1(isTransScaleB); + } + if constexpr (GetPhyType(BIAS_TYPE::pos) == Hardware::UB && ToMatmulConfig(MM_CFG).enableSetBias) { + if (isBias) { + CopyUbBiasToL1(); + } + } + if constexpr (isAnyInputFromUb) { + CrossCoreSetFlag(static_cast(CUBE_WAIT_INTRA_Enum::L1_L0AB_UB_L1)); + } + } + + template + __aicore__ inline LocalTensor GetVecTensor(uint64_t addr, const uint64_t size) + { + LocalTensor cLocal; + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(TPosition::VECCALC); + tbufOutTmp.bufferAddr = addr; +#if __CCE_KT_TEST__ + tbufOutTmp.dataLen = size * sizeof(T); + tbufOutTmp.absAddr = reinterpret_cast(addr); +#endif + cLocal.SetAddr(tbufOutTmp); + return cLocal; + } + + template + __aicore__ inline void NDPadZeros(LocalTensor &dst, const int height, const int width, const int gCol) + { + int32_t calcWidth = Ceil(width, c0Size_); + if (gCol % BLOCK_CUBE) { + int tail = width % c0Size_; + // tail pad zero + if (tail) { + auto offset = width / c0Size_ * c0Size_; + uint64_t mask[2]; + uint16_t mask_tail = ~((1 << tail) - 1); + uint64_t masktail = mask_tail; + mask[0] = + masktail + (masktail << NUM_SIXTEEN) + (masktail << NUM_THIRTYTWO) + (masktail << NUM_FORTYEIGHT); + mask[1] = mask[0]; + int stride = calcWidth * (c0Size_ * sizeof(T) / DEFAULT_C0_SIZE); + int32_t totalRep = Ceil(height, NUM_EIGHT); + if (masktail != 0) { + // duplicate framework not support fp8/hif8, SE suggested pad by int8 0. + if constexpr (IsSameType::value || IsSameType::value || + IsSameType::value) { + LocalTensor tmp = dst.template ReinterpretCast(); + Duplicate(tmp[offset], (int8_t)0, mask, totalRep, stride, NUM_EIGHT * stride); + } else { + Duplicate(dst[offset], (T)0, mask, totalRep, stride, NUM_EIGHT * stride); + } + } + } + } + // If the value of high is not an integer multiple of 16, add 0. + int tailHigh = height % BLOCK_CUBE; + if (tailHigh) { + auto dstOffset = height * calcWidth * BLOCK_CUBE; + if constexpr (IsSameType::value || IsSameType::value || + IsSameType::value) { + // duplicate framework not support fp8/hif8, SE suggested pad by int8 0. 
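+                    // fp8/hif8 are 1-byte types, so an int8 view covers exactly the same
+                    // elements and Duplicate can zero-fill them without native fp8 support.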
+ LocalTensor tmp = dst.template ReinterpretCast(); + Duplicate(tmp[dstOffset], (int8_t)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE); + } else { + Duplicate(dst[dstOffset], (T)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE); + } + } + } + + template + __aicore__ inline void NDTrans2NZ(LocalTensor &dst, const LocalTensor &src, const int calcHigh, + const int calcWidth) + { + // Use Muls, convert to NZ format + struct UnaryRepeatParams intriParams; + uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) }; + intriParams.dstBlkStride = (BLOCK_CUBE * sizeof(T) / DEFAULT_C0_SIZE); + intriParams.srcBlkStride = calcWidth * (BLOCK_CUBE * sizeof(T) / DEFAULT_C0_SIZE); + intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM; + intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM; + int dstOffset = 0; + int srcOffset = 0; + // ensure rep stride be less than 256 + constexpr int maxSrcBlkStride = 32; + constexpr int TWO = 2; + if (intriParams.srcBlkStride >= maxSrcBlkStride) { + intriParams.dstBlkStride = 1; + intriParams.srcBlkStride = 1; + mask[0] = (1 << BLOCK_CUBE) - 1; + mask[1] = 0; + for (int i = 0; i < calcWidth; i++) { + for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) { + dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE; + srcOffset = j * calcWidth * BLOCK_CUBE + i * BLOCK_CUBE; + Muls(dst[dstOffset], src[srcOffset], (T)1, mask, 1, intriParams); + if constexpr (sizeof(T) == sizeof(float)) { + Muls(dst[dstOffset + c0Size_], src[srcOffset + c0Size_], (T)1, mask, 1, intriParams); + } + } + } + } else { + for (int i = 0; i < calcWidth; i++) { + dstOffset = i * calcHigh * CUBE_MAX_SIZE; + srcOffset = i * BLOCK_CUBE; + Muls(dst[dstOffset], src[srcOffset], (T)1, mask, TWO * calcHigh, intriParams); + if constexpr (sizeof(T) == sizeof(float)) { + Muls(dst[dstOffset + c0Size_], src[srcOffset + c0Size_], (T)1, mask, TWO * calcHigh, + intriParams); + } + } + } + } + + template + __aicore__ inline void CopyNDBlock( + const LocalTensor& transTensor, const LocalTensor& src, const int64_t srcOffset, const int height, + const int width) + { + int srcStride = 0; + int blockLen = Ceil(width, c0Size_) * c0Size_ * sizeof(T) / DEFAULT_C0_SIZE; + uint16_t dstStride = 0; + DataCopy(transTensor, src[srcOffset], + { static_cast(height), static_cast(blockLen), static_cast(srcStride), + dstStride }); + auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V); + SetFlag((event_t)enQueEvtID); + WaitFlag((event_t)enQueEvtID); + } + + template + __aicore__ inline void CopyUB2L1ND2NZ(LocalTensor& dst, LocalTensor& src, + const uint32_t row, const uint32_t col, const uint32_t gCol, bool isTrans) + { + if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) { + if (row % BLOCK_CUBE == 0 && col % BLOCK_CUBE == 0) { + LocalTensor tmpBuffer = localWorkspace_.ReinterpretCast(); + CopyUBND2UBNZ(tmpBuffer, src, row, col, gCol); + DataCopy(dst, tmpBuffer, row * col); + } else { + LocalTensor srcTensor = localWorkspace_[0].template ReinterpretCast(); + srcTensor.SetSize(cubeTiling.GetTransLength()); + CopyNDBlock(srcTensor, src, 0, row, col); + LocalTensor tmpBuffer = + localWorkspace_[cubeTiling.GetTransLength()].template ReinterpretCast(); + int64_t size = Ceil(row, BLOCK_CUBE) * BLOCK_CUBE * Ceil(col, BLOCK_CUBE) * BLOCK_CUBE; + tmpBuffer.SetSize(cubeTiling.GetTransLength()); + CopyUBND2UBNZ(tmpBuffer, srcTensor, row, col, gCol); + DataCopy(dst, tmpBuffer, size); + } + } else { + CopyND2NZOnTheFly(dst, src, row, col, gCol, isTrans); + } + } + + template + __aicore__ inline void 
CopyND2NZOnTheFly(const LocalTensor &dst, LocalTensor &src, const int height, + const int width, const int gCol, bool isTrans) + { + int tail = width % c0Size_; + int calcWidthExr = Ceil(width, c0Size_); + int calcHeightExr = Ceil(height, BLOCK_CUBE); + + // set2d, pad tail zero + if (height % BLOCK_CUBE != 0) { + int64_t repeat = calcWidthExr * calcHeightExr; + if constexpr (IsTypeOneOfV) { + LocalTensor tmp = dst.template ReinterpretCast(); + InitConstValueParams initConstValueParams; + initConstValueParams.repeatTimes = (uint16_t)repeat; + initConstValueParams.initValue = 0; + InitConstValue(tmp, initConstValueParams); + } else { + InitConstValueParams initConstValueParams; + initConstValueParams.repeatTimes = (uint16_t)repeat; + initConstValueParams.initValue = 0; + InitConstValue(dst, initConstValueParams); + } + + event_t eventIDMte2ToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3)); + SetFlag(eventIDMte2ToMte3); + WaitFlag(eventIDMte2ToMte3); + } + + // gCol unaligned, can not use dma copy repeat stride + if (tail != 0) { + CopyND2NZOnTheFlyWithTail(dst, src, height, width, gCol, isTrans); + } else { + CopyND2NZOnTheFlyWithoutTail(dst, src, height, width, gCol); + } + } + + template + __aicore__ inline void CopyND2NZOnTheFlyWithTail(const LocalTensor &dst, LocalTensor &src, + const int height, const int width, const int gCol, bool isTrans) + { + int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero + int tail = width % c0Size_; + int dstOffset = 0; + int srcOffset = 0; + int calcWidthExr = Ceil(width, c0Size_); + int calcHeightExr = Ceil(height, BLOCK_CUBE); + // tail elements that need to be pad zero + int blockLen = calcWidthExr * (c0Size_ * sizeof(T) / DEFAULT_C0_SIZE); + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; + + // ub->l1 + int srcGap = gCol * sizeof(T) / ONE_BLK_SIZE - 1; + int srcColOffset = Ceil(gCol, c0Size_) * c0Size_; + if (gCol % c0Size_ || srcGap >= UINT16_MAX) { + // each block len is only 32B + for (int i = 0; i < calcWidth; i++) { + for (int j = 0; j < height; j++) { + DataCopy(dst[dstOffset + i * calcHeightExr * BLOCK_CUBE * c0Size_ + j * c0Size_], + src[srcOffset + j * srcColOffset + i * c0Size_], { 1, 1, 0, 0 }, enhancedParams); + } + } + } else { + // data copy stride is aligned + for (int i = 0; i < calcWidth; i++) { + DataCopy(dst[dstOffset], src[srcOffset], + { static_cast(height), 1, static_cast(srcGap), 0 }, enhancedParams); + dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; + srcOffset += c0Size_; + } + } + + // tail gm->ub pad zero, and then ub->l1 + int size = 0; + if constexpr (TAG == InputTypeTag::A) { + size = (isTrans ? singleCoreK_ * NUM_THIRTYTWO : singleCoreM_ * NUM_THIRTYTWO) / sizeof(T); + } else { + size = (isTrans ? 
singleCoreN_ * NUM_THIRTYTWO : singleCoreK_ * NUM_THIRTYTWO) / sizeof(T); + } + + LocalTensor trans = localWorkspace_.template ReinterpretCast(); + trans.SetSize(size); + + int tailSrcoffset = calcWidth * c0Size_; + // ub->ub + for (int i = 0; i < height; i++) { + DataCopy(trans[i * c0Size_], src[tailSrcoffset], { 1, 1, 0, 0 }, enhancedParams); + tailSrcoffset += srcColOffset; + } + + event_t eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); + SetFlag(eventIDMte2ToV); + WaitFlag(eventIDMte2ToV); + + TailPadZero(trans, tail, height); + + // ub->l1 + int heightAlignBlock = Ceil(height, BLOCK_CUBE); + int tailDstOffset = heightAlignBlock * BLOCK_CUBE * c0Size_ * calcWidth; + DataCopy(dst[tailDstOffset], trans, { static_cast(height), 1, 0, 0 }, enhancedParams); + } + + template + __aicore__ inline void TailPadZero(const LocalTensor &trans, int tail, int height) + { + // tail pad zero + uint64_t mask[2]; + uint64_t masktail = ((1 << (c0Size_ - tail)) - 1); + constexpr int byteBitSize = 8; + for (int i = 0; i < sizeof(masktail) * byteBitSize / c0Size_; i++) { + mask[0] += masktail; + masktail <<= c0Size_; + } + mask[1] = mask[0]; + int stride = 8; + if (masktail != 0) { + if constexpr (IsSameType::value || IsSameType::value || + IsSameType::value) { + // duplicate framework not support fp8/hif8, SE suggested pad by int8 0. + LocalTensor tmpTrans = trans.template ReinterpretCast(); + Duplicate(tmpTrans, (int8_t)0, mask, static_cast(Ceil(height, stride)), 1, stride); + } else { + Duplicate(trans, (T)0, mask, static_cast(Ceil(height, stride)), 1, stride); + } + } + + event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMte3); + WaitFlag(eventIDVToMte3); + } + + template + __aicore__ inline void CopyND2NZOnTheFlyWithoutTail(const LocalTensor &dst, LocalTensor &src, + const int height, const int width, const int gCol) + { + int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero + int dstOffset = 0; + int srcOffset = 0; + int calcHeightExr = Ceil(height, BLOCK_CUBE); + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; + + int srcGap = gCol * sizeof(T) / ONE_BLK_SIZE - 1; + if (gCol % c0Size_ || srcGap >= UINT16_MAX) { + int oriSrcOffset = srcOffset; + int oriDstOffset = dstOffset; + // each block len is only 32B + for (int i = 0; i < calcWidth; i++) { + for (int j = 0; j < height; j++) { + DataCopy(dst[dstOffset], src[srcOffset], { 1, 1, 0, 0 }, enhancedParams); + dstOffset += c0Size_; + srcOffset += gCol; + } + srcOffset = oriSrcOffset + (i + 1) * c0Size_; + dstOffset = oriDstOffset + (i + 1) * calcHeightExr * BLOCK_CUBE * c0Size_; + } + } else { + // data copy stride is aligned + for (int i = 0; i < calcWidth; i++) { + DataCopy(dst[dstOffset], src[srcOffset], + { static_cast(height), 1, static_cast(srcGap), 0 }, enhancedParams); + dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; + srcOffset += c0Size_; + } + } + } + + template + __aicore__ inline void CopyUBND2UBNZ(LocalTensor& dst, LocalTensor& src, + const uint32_t row, const uint32_t col, const uint32_t gCol) + { + uint32_t calcHigh = Ceil(row, BLOCK_CUBE); + uint32_t calcWidth = Ceil(col, c0Size_); + NDPadZeros(src, row, col, gCol); + NDTrans2NZ(dst, src, calcHigh, calcWidth); + event_t eventIdVToMTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIdVToMTE3); + WaitFlag(eventIdVToMTE3); + } + + template + __aicore__ inline void CopyUB2L1NZ2NZ(const 
LocalTensor& dst, const LocalTensor& src, + const uint32_t row, const uint32_t col) + { + if constexpr (A_TYPE::format == CubeFormat::NZ && (A_TYPE::ibShare && B_TYPE::ibShare)) { + uint16_t blockCount = static_cast(Ceil(col, c0Size_)); + // Concat V0 and V1 A matrix, then row need devide MIX_NUM; + uint16_t blockLen = static_cast(Ceil(row / MIX_NUM * sizeof(T) * c0Size_, ONE_BLK_SIZE)); + uint16_t dstStride = blockLen; + DataCopy(dst, src, {blockCount, blockLen, 0, dstStride}); + } else { + if constexpr (HasScalePosition::value || HasScalePosition::value) { + if (IsTypeOneOfV) { + DataCopy(dst, src, row * col / INT4_TWO); + } else { + DataCopy(dst, src, row * col); + } + } else { + uint32_t rowAlign = Ceil(row, BLOCK_CUBE) * BLOCK_CUBE; + uint32_t colAlign = Ceil(col, BLOCK_CUBE) * BLOCK_CUBE; + DataCopy(dst, src, rowAlign * colAlign); + } + } + } + + __aicore__ inline void CopyUbAToL1(bool isTrans) + { + LocalTensor leftMatrix; + c0Size_ = AscendCUtils::GetC0Count(sizeof(SrcAT)); + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(TPosition::A1); + constexpr uint32_t splitNum = 2; + if constexpr (B_TYPE::ibShare && !A_TYPE::ibShare) { + tbufOutTmp.dataLen = Ceil(singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE * + Ceil(singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcAT) / splitNum; + } else { + tbufOutTmp.dataLen = Ceil(singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE * + Ceil(singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcAT); + } + if ((B_TYPE::ibShare && !A_TYPE::ibShare) && GetSubBlockIdxImpl() == 1 && isTrans) { + tbufOutTmp.bufferAddr = Ceil(singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE / splitNum * ONE_BLK_SIZE; + } else { + tbufOutTmp.bufferAddr = kfcMsg_.body.aAddr; + } +#if __CCE_KT_TEST__ + tbufOutTmp.absAddr = GetTPipePtr()->GetBaseAddr(static_cast(TPosition::A1)) + kfcMsg_.body.aAddr; +#endif + leftMatrix.SetAddr(tbufOutTmp); + LocalTensor a = GetVecTensor(aAddr_, sizeAmatrix_ / sizeof(SrcAT)); + if constexpr (PhyPosIsUB(A_TYPE::pos) && A_TYPE::format == CubeFormat::ND) { + if (isTrans) { + CopyUB2L1ND2NZ(leftMatrix, a, singleCoreK_, singleCoreM_, singleCoreM_, isTrans); + } else { + CopyUB2L1ND2NZ(leftMatrix, a, singleCoreM_, singleCoreK_, singleCoreK_, isTrans); + } + } else if constexpr (PhyPosIsUB(A_TYPE::pos) && A_TYPE::format == CubeFormat::NZ) { + if (isTrans) { + CopyUB2L1NZ2NZ(leftMatrix, a, singleCoreK_, singleCoreM_); + } else { + CopyUB2L1NZ2NZ(leftMatrix, a, singleCoreM_, singleCoreK_); + } + } else if constexpr (PhyPosIsUB(A_TYPE::pos) && A_TYPE::format == CubeFormat::VECTOR) { + if (isTrans) { + ASCENDC_ASSERT((!isTrans), { KERNEL_LOG(KERNEL_ERROR, "A vector does not support transpose.");}); + return; + } + DataCopy(leftMatrix, a[0], {1, static_cast(Ceil(singleCoreK_, c0Size_)), 0, 0}); + } + } + + __aicore__ inline void CopyUbAToL1StaticTiling(bool isTrans) + { + LocalTensor leftMatrix; + c0Size_ = AscendCUtils::GetC0Count(sizeof(SrcAT)); + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(TPosition::A1); + constexpr uint32_t splitNum = 2; + if constexpr (B_TYPE::ibShare && !A_TYPE::ibShare) { + tbufOutTmp.dataLen = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, BLOCK_CUBE) * BLOCK_CUBE * + Ceil(ToMatmulConfig(MM_CFG).singleCoreK, BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcAT) / splitNum; + } else { + tbufOutTmp.dataLen = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, BLOCK_CUBE) * BLOCK_CUBE * + Ceil(ToMatmulConfig(MM_CFG).singleCoreK, BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcAT); + } + if ((B_TYPE::ibShare && !A_TYPE::ibShare) && GetSubBlockIdxImpl() == 1 && isTrans) { + 
tbufOutTmp.bufferAddr = Ceil(ToMatmulConfig(MM_CFG).singleCoreK, BLOCK_CUBE) * + BLOCK_CUBE / splitNum * ONE_BLK_SIZE; + } else { + tbufOutTmp.bufferAddr = kfcMsg_.body.aAddr; + } +#if __CCE_KT_TEST__ + tbufOutTmp.absAddr = GetTPipePtr()->GetBaseAddr(static_cast(TPosition::A1)); +#endif + leftMatrix.SetAddr(tbufOutTmp); + LocalTensor a = GetVecTensor(aAddr_, sizeAmatrix_ / sizeof(SrcAT)); + if constexpr (PhyPosIsUB(A_TYPE::pos) && A_TYPE::format == CubeFormat::ND) { + if (isTrans) { + CopyUB2L1ND2NZ(leftMatrix, a, ToMatmulConfig(MM_CFG).singleCoreK, + ToMatmulConfig(MM_CFG).singleCoreM, singleCoreM_, isTrans); + } else { + CopyUB2L1ND2NZ(leftMatrix, a, ToMatmulConfig(MM_CFG).singleCoreM, + ToMatmulConfig(MM_CFG).singleCoreK, singleCoreK_, isTrans); + } + } else if constexpr (PhyPosIsUB(A_TYPE::pos) && A_TYPE::format == CubeFormat::NZ) { + if (isTrans) { + CopyUB2L1NZ2NZ(leftMatrix, a, ToMatmulConfig(MM_CFG).singleCoreK, ToMatmulConfig(MM_CFG).singleCoreM); + } else { + CopyUB2L1NZ2NZ(leftMatrix, a, ToMatmulConfig(MM_CFG).singleCoreM, ToMatmulConfig(MM_CFG).singleCoreK); + } + } else if constexpr (PhyPosIsUB(A_TYPE::pos) && A_TYPE::format == CubeFormat::VECTOR) { + if (isTrans) { + ASCENDC_ASSERT((!isTrans), { KERNEL_LOG(KERNEL_ERROR, "A vector does not support transpose.");}); + return; + } + DataCopy(leftMatrix, a[0], {1, static_cast(Ceil(singleCoreK_, c0Size_)), 0, 0}); + } + } + + __aicore__ inline void CopyUbBToL1(bool isTrans) + { + LocalTensor rightMatrix; + c0Size_ = AscendCUtils::GetC0Count(sizeof(SrcBT)); + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(TPosition::B1); + tbufOutTmp.dataLen = Ceil(singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE * + Ceil(singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE * sizeof(SrcBT); + tbufOutTmp.bufferAddr = kfcMsg_.body.bAddr; +#if __CCE_KT_TEST__ + tbufOutTmp.absAddr = GetTPipePtr()->GetBaseAddr(static_cast(TPosition::B1)) + kfcMsg_.body.bAddr; +#endif + rightMatrix.SetAddr(tbufOutTmp); + LocalTensor b = GetVecTensor(bAddr_, sizeBmatrix_ / sizeof(SrcBT)); + if constexpr (PhyPosIsUB(B_TYPE::pos) && B_TYPE::format == CubeFormat::ND) { + if (isTrans) { + CopyUB2L1ND2NZ(rightMatrix, b, singleCoreN_, singleCoreK_, singleCoreK_, + isTrans); + } else { + CopyUB2L1ND2NZ(rightMatrix, b, singleCoreK_, singleCoreN_, singleCoreN_, + isTrans); + } + } else if constexpr (PhyPosIsUB(B_TYPE::pos) && B_TYPE::format == CubeFormat::NZ) { + if (isTrans) { + CopyUB2L1NZ2NZ(rightMatrix, b, singleCoreN_, singleCoreK_); + } else { + CopyUB2L1NZ2NZ(rightMatrix, b, singleCoreK_, singleCoreN_); + } + } + } + + __aicore__ inline void CopyUbBiasToL1() + { + LocalTensor biasMatrix; + c0Size_ = AscendCUtils::GetC0Count(sizeof(BiasT)); + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(TPosition::B1); + tbufOutTmp.dataLen = Ceil(singleCoreM_ * sizeof(BiasT), ONE_BLK_SIZE); + tbufOutTmp.bufferAddr = kfcMsg_.body.biasAddr; +#if __CCE_KT_TEST__ + tbufOutTmp.absAddr = GetTPipePtr()->GetBaseAddr(static_cast(TPosition::B1)) + kfcMsg_.body.biasAddr; +#endif + biasMatrix.SetAddr(tbufOutTmp); + LocalTensor bias = GetVecTensor(biasAddr_, sizeBiasmatrix_ / sizeof(BiasT)); + DataCopy(biasMatrix, bias, {1, static_cast(Ceil(singleCoreN_, c0Size_)), 0, 0}); + } + + __aicore__ inline void CopyScaleUbAToL1(bool isTrans) + { + LocalTensor leftMatrix; + c0Size_ = AscendCUtils::GetC0Count(sizeof(ScaleT)); + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(TPosition::A1); + tbufOutTmp.dataLen = + Ceil(singleCoreM_, BLOCK_CUBE) * BLOCK_CUBE * Ceil(singleCoreK_, BLOCK_CUBE) * BLOCK_CUBE * 
sizeof(ScaleT); + tbufOutTmp.bufferAddr = kfcMsg_.body.quantAddr; +#if __CCE_KT_TEST__ + tbufOutTmp.absAddr = GetTPipePtr()->GetBaseAddr(static_cast(TPosition::A1)); +#endif + leftMatrix.SetAddr(tbufOutTmp); + LocalTensor a = GetVecTensor(aScaleAddr_, sizeScaleAmatrix_ / sizeof(ScaleT)); + if constexpr (PhyMxScalePosIsUB()) { + if constexpr(A_TYPE::scaleFormat == CubeFormat::NZ) { + if (isTrans) { + CopyUB2L1NZ2NZ(leftMatrix, a, singleCoreK_ / NUM_THIRTYTWO, singleCoreM_); + } else { + CopyUB2L1NZ2NZ(leftMatrix, a, singleCoreM_, singleCoreK_ / NUM_THIRTYTWO); + } + } + } + } + + __aicore__ inline void CopyScaleUbBToL1(bool isTrans) + { + LocalTensor rightMatrix; + c0Size_ = AscendCUtils::GetC0Count(sizeof(ScaleT)); + TBuffAddr tbufOutTmp; + tbufOutTmp.logicPos = (uint8_t)(TPosition::B1); + tbufOutTmp.dataLen = Ceil(singleCoreK_ / NUM_THIRTYTWO, BLOCK_CUBE) * BLOCK_CUBE * + Ceil(singleCoreN_, BLOCK_CUBE) * BLOCK_CUBE * sizeof(ScaleT); + tbufOutTmp.bufferAddr = kfcMsg_.body.quantScalar; +#if __CCE_KT_TEST__ + tbufOutTmp.absAddr = GetTPipePtr()->GetBaseAddr(static_cast(TPosition::B1)); +#endif + rightMatrix.SetAddr(tbufOutTmp); + LocalTensor b = GetVecTensor(bScaleAddr_, sizeScaleBmatrix_ / sizeof(ScaleT)); + if constexpr (PhyMxScalePosIsUB()) { + if constexpr(B_TYPE::scaleFormat == CubeFormat::NZ) { + if (isTrans) { + CopyUB2L1NZ2NZ(rightMatrix, b, singleCoreN_, singleCoreK_ / NUM_THIRTYTWO); + } else { + CopyUB2L1NZ2NZ(rightMatrix, b, singleCoreK_ / NUM_THIRTYTWO, singleCoreN_); + } + } + } + } +#endif + + // height width in unit of element + template + __aicore__ inline void CopyToUBPad(const LocalTensor& data, const __gm__ U* addr, uint32_t height = 0, + uint32_t width = 0, uint32_t srcGap = 0, uint32_t dstGap = 0) + { + ASSERT(C_TYPE::format == CubeFormat::ND_ALIGN && + "Only support padding in ND_ALIGN mode, please check template param of GetTensorC."); + + DataCopyParams copyParams{ static_cast(height), static_cast(width * sizeof(T)), + static_cast(srcGap), static_cast(dstGap) }; + DataCopyPadParams padParams{ true, 0, + static_cast( + ConstCeil(width, AscendCUtils::GetC0Count(sizeof(T))) * AscendCUtils::GetC0Count(sizeof(T)) - width), + 0 }; + GlobalTensor globalTensor; + globalTensor.SetGlobalBuffer((__gm__ T*)addr); + DataCopyPad(data, globalTensor, copyParams, padParams); + + if constexpr (sync) { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); + SetFlag(eventID); + WaitFlag(eventID); + } + } + + template + __aicore__ inline void CopyToUB(const LocalTensor& data, const __gm__ U* addr, uint32_t size) + { + struct DataCopyParams repeatParams; + repeatParams.blockLen = size / AscendCUtils::GetC0Count(sizeof(T)); + GlobalTensor globalTensor; + globalTensor.SetGlobalBuffer((__gm__ T*)addr); + if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { + int32_t batchNum = 1; + int32_t offset = 0; + if constexpr (C_TYPE::layout != LayoutMode::NONE) { + int32_t alignedSingleCoreN = ConstCeil(cubeTiling.GetSingleCoreN(), AscendCUtils::GetC0Count(sizeof(T))) * + AscendCUtils::GetC0Count(sizeof(T)); + offset = cubeTiling.GetSingleCoreM() * alignedSingleCoreN; + batchNum = size / offset; + } + for (int32_t idx = 0; idx < batchNum; ++idx) { + DataCopyParams copyParams{ static_cast(cubeTiling.GetSingleCoreM()), + static_cast(cubeTiling.GetSingleCoreN() * sizeof(T)), 0, 0 }; + DataCopyPadParams padParams{ true, 0, + static_cast(ConstCeil(cubeTiling.GetSingleCoreN(), AscendCUtils::GetC0Count(sizeof(T))) * + AscendCUtils::GetC0Count(sizeof(T)) - + 
cubeTiling.GetSingleCoreN()), + 0 }; + DataCopyPad(data[idx * offset], globalTensor[idx * offset], copyParams, padParams); + } + } else { + DataCopy(data, globalTensor, repeatParams); + } + + if constexpr (sync) { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); + SetFlag(eventID); + WaitFlag(eventID); + } + } + + template + __aicore__ inline uint64_t GetGMAddrAndCopyUB(const __gm__ T* gmDataAddr, const LocalTensor& data) + { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventID); + WaitFlag(eventID); + + struct DataCopyParams param; + param.blockLen = data.GetSize() / AscendCUtils::GetC0Count(sizeof(T)); + GlobalTensor globalTensor; + globalTensor.SetGlobalBuffer((__gm__ T*)gmDataAddr); + DataCopy(globalTensor, data, param); + + return reinterpret_cast(gmDataAddr); + } +}; + +// Match Policy with CallBack paramter +template +class MatmulClient +: public MatmulClientBase { +public: + __aicore__ inline MatmulClient() {} +}; +} // namespace matmul +#endif \ No newline at end of file -- Gitee From 5a7c6b2ff88323316cffa832883023d3384e9827 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 09:32:24 +0000 Subject: [PATCH 54/56] add llt Signed-off-by: jiangchengcheng-on --- .../matmul/iterator/test_batch_n_loop_db.cpp | 36 +++++++++---------- .../batch_scheduler/test_batch_scheduler.cpp | 2 +- .../test_batch_scheduler_single.cpp | 2 +- .../test_batch_scheduler_v200.cpp | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/matmul/iterator/test_batch_n_loop_db.cpp b/tests/matmul/iterator/test_batch_n_loop_db.cpp index 6278eb1d..34eee805 100755 --- a/tests/matmul/iterator/test_batch_n_loop_db.cpp +++ b/tests/matmul/iterator/test_batch_n_loop_db.cpp @@ -9,8 +9,8 @@ */ /*! 
- * \file test_batch_n_loop.cpp - * \brief n loop ut for batch + * \file test_batch_m_loop.cpp + * \brief m loop ut for batch */ #include @@ -22,7 +22,7 @@ #include "impl/matmul/policy/matmul_private_modules.h" #include "impl/matmul/param/matmul_tensor_info.h" #include "impl/matmul/param/matmul_shape_tiling.h" -#include "impl/matmul/scheduler/iterator/n_loop/n_loop_batch_db.h" +#include "impl/matmul/scheduler/iterator/m_loop/m_loop_batch_db.h" using namespace std; using namespace AscendC; @@ -32,17 +32,17 @@ template { public: - using NLoop = Impl::Detail::NLoop; + using MLoop = Impl::Detail::MLoop; }; template class MatmulImpl : - MATMUL_IMPORT_MODULE(NLoop), + MATMUL_IMPORT_MODULE_PRIVATE(MLoop), MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfo), MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) { - MATMUL_ALLOW_USING(NLoop); + MATMUL_ALLOW_USING_PRIVATE(MLoop); MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfo); MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); @@ -66,17 +66,17 @@ public: var.tpipe_ = &pipe; } - void SetInitParams(int32_t singleCoreN, int32_t baseN, int32_t stepN) { - MATMUL_MODULE(MatmulShapeInfo)->SetSingleCoreN(singleCoreN); - tiling.singleCoreN = singleCoreN; - tiling.baseN = baseN; - tiling.stepN = stepN; - tiling.iterateOrder = 1; + void SetInitParams(int32_t singleCoreM, int32_t baseM, int32_t stepM) { + MATMUL_MODULE(MatmulShapeInfo)->SetSingleCoreM(singleCoreM); + tiling.singleCoreM = singleCoreM; + tiling.baseM = baseM; + tiling.stepM = stepM; + tiling.iterateOrder = 0; } int32_t GetSingleShape() { - return MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + return MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM(); } private: @@ -86,7 +86,7 @@ private: }; } -class TestNLoopBatchDB : public testing::Test { +class TestMLoopBatchDB : public testing::Test { protected: void SetUp() {} void TearDown() {} @@ -99,12 +99,12 @@ private: using BIAS_TYPE = MatmulType; constexpr MatmulConfig static CFG_NORM_OUTER_PRODUCT = GetNormalConfig(false, false, false, BatchMode::BATCH_LESS_THAN_L1, - true, IterateOrder::ORDER_N, ScheduleType::OUTER_PRODUCT); + true, IterateOrder::ORDER_M, ScheduleType::OUTER_PRODUCT); MatmulImpl mm; MatmulImpl mm1; }; -TEST_F(TestNLoopBatchDB, batch_n_loop) { +TEST_F(TestMLoopBatchDB, batch_m_loop) { mm.SetInitParams(77, 80, 1); mm.Init(mm.GetSingleShape()); for (mm.OuterStart(); !mm.OuterEnd(); mm.OuterNext()) { @@ -115,7 +115,7 @@ TEST_F(TestNLoopBatchDB, batch_n_loop) { EXPECT_EQ(mm.GetBaseBlockShape(), 5); } -TEST_F(TestNLoopBatchDB, batch_n_loop_case2) { +TEST_F(TestMLoopBatchDB, batch_m_loop_case2) { mm.SetInitParams(81, 32, 1); mm.Init(mm.GetSingleShape()); for (mm.OuterStart(); !mm.OuterEnd(); mm.OuterNext()) { @@ -126,7 +126,7 @@ TEST_F(TestNLoopBatchDB, batch_n_loop_case2) { EXPECT_EQ(mm.GetBaseBlockShape(), 2); } -TEST_F(TestNLoopBatchDB, batch_n_loop_inner_case1) { +TEST_F(TestMLoopBatchDB, batch_m_loop_inner_case1) { mm1.SetInitParams(81, 32, 1); mm1.Init(mm1.GetSingleShape()); for (mm1.InnerStart(); !mm1.InnerEnd(); mm1.InnerNext()) { diff --git a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp index 003164f3..0113153f 100644 --- a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp +++ b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler.cpp @@ -139,7 +139,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using BatchCopyCubeInA = CustomCopyCubeIn, MM_CFG>; using BatchCopyCubeInB = 
CustomCopyCubeIn, MM_CFG>; using BatchScheduler = Impl::Detail::BatchScheduler; diff --git a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_single.cpp b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_single.cpp index 34322073..9605fa1e 100644 --- a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_single.cpp +++ b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_single.cpp @@ -88,7 +88,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using BatchScheduler = Impl::Detail::BatchScheduler; diff --git a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_v200.cpp b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_v200.cpp index 006ea8a0..95de69d2 100644 --- a/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_v200.cpp +++ b/tests/matmul/scheduler/batch_scheduler/test_batch_scheduler_v200.cpp @@ -139,7 +139,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using BatchCopyCubeInA = CustomCopyCubeIn, MM_CFG>; using BatchCopyCubeInB = CustomCopyCubeIn, MM_CFG>; using BatchScheduler = Impl::Detail::BatchScheduler; -- Gitee From e3082b458c748d638e49dc71f234d785e89e55aa Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 09:34:41 +0000 Subject: [PATCH 55/56] add llt Signed-off-by: jiangchengcheng-on --- .../bias_scheduler/test_bias_scheduler_batch.cpp | 2 +- .../scheduler/bias_scheduler/test_bias_scheduler_v200.cpp | 2 +- tests/matmul/scheduler/fake_modules.h | 8 ++++---- tests/matmul/scheduler/test_scheduler_intrablock.cpp | 2 +- tests/matmul/scheduler/test_scheduler_mdl.cpp | 2 +- .../matmul/scheduler/test_scheduler_mdl_outer_product.cpp | 2 +- tests/matmul/scheduler/test_scheduler_n_buffer.cpp | 2 +- tests/matmul/scheduler/test_scheduler_norm.cpp | 2 +- .../scheduler/test_scheduler_norm_outer_product.cpp | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_batch.cpp b/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_batch.cpp index 8dab33dc..85fcfd61 100644 --- a/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_batch.cpp +++ b/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_batch.cpp @@ -33,7 +33,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyBiasIn = Impl::Detail::CopyBiasIn; using C1Buffer = Impl::Detail::C1Buffer; using C2Buffer = Impl::Detail::C2Buffer; diff --git a/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_v200.cpp b/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_v200.cpp index 7087a7de..a7cedb58 100644 --- a/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_v200.cpp +++ b/tests/matmul/scheduler/bias_scheduler/test_bias_scheduler_v200.cpp @@ -78,7 +78,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CubeOutBuffer = CustomCubeOutBuffer; using CopyBiasIn = Impl::Detail::CopyBiasIn; using LoadBias2C2 = Impl::Detail::LoadBias2C2; diff --git a/tests/matmul/scheduler/fake_modules.h b/tests/matmul/scheduler/fake_modules.h index 891b2343..a6a90a7f 100644 --- a/tests/matmul/scheduler/fake_modules.h +++ b/tests/matmul/scheduler/fake_modules.h @@ -62,7 +62,7 @@ public: __aicore__ inline void ResetCache() {} __aicore__ inline 
void EnQue() {} - + __aicore__ inline void DeQue() {} __aicore__ inline void Free() { @@ -130,7 +130,7 @@ public: __aicore__ inline LocalTensor GetTensor() { LocalTensor out; - out.SetAddr({ .logicPos = 1, }); + out.SetAddr({ .logicPos = 2, }); out.SetSize(1); return out; } @@ -183,7 +183,7 @@ template ::Type; + using SrcT = typename GetMmDstType::Type; public: template __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co1Local, int curRow, @@ -271,4 +271,4 @@ public: return globalMatrix; } -}; \ No newline at end of file +}; diff --git a/tests/matmul/scheduler/test_scheduler_intrablock.cpp b/tests/matmul/scheduler/test_scheduler_intrablock.cpp index 9b2d13a2..d14a1890 100644 --- a/tests/matmul/scheduler/test_scheduler_intrablock.cpp +++ b/tests/matmul/scheduler/test_scheduler_intrablock.cpp @@ -33,7 +33,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using Scheduler = Impl::Detail::MatmulScheduler; diff --git a/tests/matmul/scheduler/test_scheduler_mdl.cpp b/tests/matmul/scheduler/test_scheduler_mdl.cpp index d5a6817b..eb4b8e60 100644 --- a/tests/matmul/scheduler/test_scheduler_mdl.cpp +++ b/tests/matmul/scheduler/test_scheduler_mdl.cpp @@ -28,7 +28,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using MLoop = CustomLoop; diff --git a/tests/matmul/scheduler/test_scheduler_mdl_outer_product.cpp b/tests/matmul/scheduler/test_scheduler_mdl_outer_product.cpp index f3a65b1b..ee382690 100644 --- a/tests/matmul/scheduler/test_scheduler_mdl_outer_product.cpp +++ b/tests/matmul/scheduler/test_scheduler_mdl_outer_product.cpp @@ -33,7 +33,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using LoadToA2 = CustomLoadToL0; diff --git a/tests/matmul/scheduler/test_scheduler_n_buffer.cpp b/tests/matmul/scheduler/test_scheduler_n_buffer.cpp index c5ffa12f..0e2c57c6 100644 --- a/tests/matmul/scheduler/test_scheduler_n_buffer.cpp +++ b/tests/matmul/scheduler/test_scheduler_n_buffer.cpp @@ -28,7 +28,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using MLoop = CustomLoop; diff --git a/tests/matmul/scheduler/test_scheduler_norm.cpp b/tests/matmul/scheduler/test_scheduler_norm.cpp index 568a6541..fe0cea2f 100644 --- a/tests/matmul/scheduler/test_scheduler_norm.cpp +++ b/tests/matmul/scheduler/test_scheduler_norm.cpp @@ -29,7 +29,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using Scheduler = Impl::Detail::MatmulScheduler; diff --git a/tests/matmul/scheduler/test_scheduler_norm_outer_product.cpp b/tests/matmul/scheduler/test_scheduler_norm_outer_product.cpp index 80be0a7b..a13f6999 100644 --- a/tests/matmul/scheduler/test_scheduler_norm_outer_product.cpp +++ b/tests/matmul/scheduler/test_scheduler_norm_outer_product.cpp @@ -33,7 +33,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename 
GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using LoadToA2 = CustomLoadToL0; -- Gitee From 11b187c870ccf23d76233b8619ad9cc11679a9e2 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Tue, 20 May 2025 09:37:25 +0000 Subject: [PATCH 56/56] llt Signed-off-by: jiangchengcheng-on --- .../scheduler/test_scheduler_special_mdl.cpp | 4 ++-- tests/matmul/test_operator_matmul_v220.cpp | 2 +- tests/matmul/test_operator_matmul_v300.cpp | 8 +++++++ tests/matmul/utils/test_custom_loop.h | 23 +++++++++---------- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/matmul/scheduler/test_scheduler_special_mdl.cpp b/tests/matmul/scheduler/test_scheduler_special_mdl.cpp index 7bb096fd..eb33c526 100644 --- a/tests/matmul/scheduler/test_scheduler_special_mdl.cpp +++ b/tests/matmul/scheduler/test_scheduler_special_mdl.cpp @@ -27,7 +27,7 @@ template { public: - using L0cT = typename GetDstType::Type; + using L0cT = typename GetMmDstType::Type; using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; using CopyCubeInB = CustomCopyCubeIn, MM_CFG>; using Scheduler = Impl::Detail::MatmulScheduler; @@ -168,4 +168,4 @@ TYPED_TEST(TestSchedulerSpecialMDL, ScheduleOnce_OrderN) { this->mm.SetBias(1); ASSERT_TRUE(this->mm.ScheduleOnce(false)); ASSERT_FALSE(this->mm.ScheduleOnce(false)); -} \ No newline at end of file +} diff --git a/tests/matmul/test_operator_matmul_v220.cpp b/tests/matmul/test_operator_matmul_v220.cpp index af76361f..b043a672 100644 --- a/tests/matmul/test_operator_matmul_v220.cpp +++ b/tests/matmul/test_operator_matmul_v220.cpp @@ -385,7 +385,7 @@ __aicore__ inline void main_kernel_matmul_for_L0cBufferExtend(GM_ADDR aGM, GM_AD using B_T = typename B_TYPE::T; using C_T = typename C_TYPE::T; using BiasT = typename BIAS_TYPE::T; - using L0cT = typename AscendC::GetDstType::Type; + using L0cT = typename AscendC::GetMmDstType::Type; typedef MatmulType cType; set_atomic_none(); diff --git a/tests/matmul/test_operator_matmul_v300.cpp b/tests/matmul/test_operator_matmul_v300.cpp index 8afe5865..bc40f4ee 100644 --- a/tests/matmul/test_operator_matmul_v300.cpp +++ b/tests/matmul/test_operator_matmul_v300.cpp @@ -276,6 +276,14 @@ __aicore__ inline void kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, GM_A outputC = resultCMatrix.AllocTensor(); } + // MOCKER(Fixpipe, void (*)(const C_Tensor &, const LocalTensor &, const FixpipeParams &)) + // .expects(atLeast(1)); + + // MOCKER((&MacroMatmul::MmadMacro)).expects(atLeast(1)); + + // MOCKER((&MacroMatmul::LoadL12L0A)).expects(atLeast(1)); + // MOCKER((&MacroMatmul::LoadL12L0B)).expects(atLeast(1)); + mm.IterateAll(outputC); if constexpr (aType::pos == TPosition::VECCALC) { leftMatrix.FreeTensor(inputA); diff --git a/tests/matmul/utils/test_custom_loop.h b/tests/matmul/utils/test_custom_loop.h index e1272830..26c13cde 100644 --- a/tests/matmul/utils/test_custom_loop.h +++ b/tests/matmul/utils/test_custom_loop.h @@ -1,7 +1,7 @@ #include namespace TestCustomModules { -template +template class CustomLoop { public: @@ -115,33 +115,32 @@ public: return 1; } - __aicore__ inline bool FirstOuterIter() const + __aicore__ inline int32_t InnerStart() { - return true; + return 0; } - __aicore__ inline bool LastOuterIter() const + __aicore__ inline bool InnerNext() { - return true; + return false; } - __aicore__ inline int32_t InnerStart() + __aicore__ inline bool InnerEnd() { - return 0; + return true; } - __aicore__ inline bool FirstInnerIter() const + __aicore__ inline bool FirstOuterIter() 
const { return true; } - - __aicore__ inline bool InnerNext() + __aicore__ inline bool LastOuterIter() const { - return false; + return true; } - __aicore__ inline bool InnerEnd() + __aicore__ inline bool FirstInnerIter() const { return true; } -- Gitee
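
Note on the accumulator-type trait used throughout the test fixtures above: each fixture now derives its L0C element type from GetMmDstType (formerly GetDstType) instead of hard-coding it. The sketch below illustrates that pattern with hypothetical names (MmDstTypeSketch, FixtureSketch) and assumed type mappings; it is not the library's actual trait, whose template parameters are not shown in this patch.

#include <cstdint>

// Hypothetical stand-in for the accumulator-type trait: maps a matmul input
// element type to the element type accumulated in L0C. The concrete mappings
// here are assumptions for illustration, not the library's definition.
template <typename SrcT>
struct MmDstTypeSketch {
    using Type = float;        // assumed default: floating-point inputs accumulate in fp32
};

template <>
struct MmDstTypeSketch<int8_t> {
    using Type = int32_t;      // assumed: 8-bit integer inputs accumulate in int32
};

// A fixture can then alias the accumulator type once and reuse it for its
// cube-out and bias modules, mirroring the "using L0cT = ..." lines above.
template <typename A_T>
struct FixtureSketch {
    using L0cT = typename MmDstTypeSketch<A_T>::Type;
};

static_assert(sizeof(FixtureSketch<int8_t>::L0cT) == sizeof(int32_t), "int8 -> int32 accumulator");
static_assert(sizeof(FixtureSketch<float>::L0cT) == sizeof(float), "float -> float accumulator");

int main() { return 0; }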
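
The reordered CustomLoop stub in test_custom_loop.h gives scheduler tests a loop module that reports exactly one step per level: InnerStart returns 0, InnerNext returns false, InnerEnd returns true, and the First/Last query methods always return true. A minimal, self-contained sketch of how a scheduler-style driver might consume such a stub follows; the driver itself and the outer-loop methods (OuterStart, OuterNext) are assumptions added for illustration and are not part of the patch.

#include <cstdint>
#include <cstdio>

// Stand-in for the fake loop module used by the scheduler unit tests.
// Only the methods visible in the patch are mirrored; the outer-loop
// counterparts are assumptions so the sketch is self-contained.
class LoopStub {
public:
    int32_t OuterStart() { return 0; }              // assumed counterpart of InnerStart
    bool OuterNext() { return false; }              // assumed: single outer iteration
    int32_t InnerStart() { return 0; }              // as in the patch: start at index 0
    bool InnerNext() { return false; }              // as in the patch: no further inner steps
    bool InnerEnd() { return true; }                // as in the patch: inner loop finished
    bool FirstOuterIter() const { return true; }
    bool LastOuterIter() const { return true; }
    bool FirstInnerIter() const { return true; }
};

// Illustrative driver: with the stub above it visits exactly one
// (outer, inner) step, which is enough for a scheduler test to reach
// its first/last-iteration branches without a real tiling loop.
template <typename Loop>
void DriveOnce(Loop& loop) {
    loop.OuterStart();
    do {
        loop.InnerStart();
        do {
            std::printf("firstOuter=%d lastOuter=%d firstInner=%d\n",
                        static_cast<int>(loop.FirstOuterIter()),
                        static_cast<int>(loop.LastOuterIter()),
                        static_cast<int>(loop.FirstInnerIter()));
        } while (loop.InnerNext());
    } while (loop.OuterNext());
}

int main() {
    LoopStub loop;
    DriveOnce(loop);
    return 0;
}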