diff --git a/impl/kfc/kernel_kfc.h b/impl/kfc/kernel_kfc.h index c12757a4409693f632537cb5f5db2fdfa22d333e..22173583d775fb8d9fd4b16808a7a4bc2f95061d 100644 --- a/impl/kfc/kernel_kfc.h +++ b/impl/kfc/kernel_kfc.h @@ -271,5 +271,6 @@ __aicore__ inline void SetMatrixKfc(TPipe* pipe, KfcCommClient* kfcClient, const } } }; // namespace AscendC - +// Compatible with the previously used matmul namespace +namespace matmul = Gemm; #endif diff --git a/impl/matmul/matmul_call_back.h b/impl/matmul/matmul_call_back.h index 074cb3b7532a61949d4106a5cfc516a1b0aa7a0d..9796aa31abb66cd1672bf9819e678deb70786e6a 100644 --- a/impl/matmul/matmul_call_back.h +++ b/impl/matmul/matmul_call_back.h @@ -1,36 +1,36 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file matmul_call_back.h - * \brief - */ -#ifndef LIB_MATMUL_MATMUL_CALL_BACK_H -#define LIB_MATMUL_MATMUL_CALL_BACK_H - -namespace matmul { -using namespace AscendC; -template &co1Local, - const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, - void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, - const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, - void (*CopyB1)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, int useK, int useN, - const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr> -struct MatmulCallBackFunc { - constexpr static void (*DataCopyOutPtr)(const __gm__ void* gm, const LocalTensor &co1Local, - const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = DataCopyOut; - constexpr static void (*CopyA1Ptr)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, - int useM, int useK, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyA1; - constexpr static void (*CopyB1Ptr)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, - int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; -}; - -} // namespace matmul -#endif \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_call_back.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_CALL_BACK_H +#define LIB_MATMUL_MATMUL_CALL_BACK_H + +namespace Gemm { +using namespace AscendC; +template &co1Local, + const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, + void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, + const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, + void (*CopyB1)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, int useK, int useN, + const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr> +struct MatmulCallBackFunc { + constexpr static void (*DataCopyOutPtr)(const __gm__ void* gm, const LocalTensor &co1Local, + const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = DataCopyOut; + constexpr static void (*CopyA1Ptr)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, + int useM, int useK, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyA1; + constexpr static void (*CopyB1Ptr)(const LocalTensor &bMatrix, const __gm__ void *gm, int row, int col, + int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; +}; + +} // namespace Gemm +#endif diff --git a/impl/matmul/matmul_constant_tiling_impl.h b/impl/matmul/matmul_constant_tiling_impl.h index bb1e5b125530b47047103ab9901b100860abff5d..31f09d14fc14db5298b061ded439d6f357b90a7d 100644 --- a/impl/matmul/matmul_constant_tiling_impl.h +++ b/impl/matmul/matmul_constant_tiling_impl.h @@ -19,7 +19,7 @@ #include "matmul_utils.h" #include "kernel_operator.h" -namespace matmul { +namespace Gemm { using namespace AscendC; constexpr int32_t C0_BYTE_SIZE = 32; @@ -1139,5 +1139,5 @@ __aicore__ constexpr int32_t GetTransLength(const MatmulConfig &mmCFG, const L1S } return MaxValue(a1Length, b1Length, c1Length, biasLength); } -} // namespace matmul +} // namespace Gemm #endif // IMPL_MATMUL_MATMUL_CONSTANT_TILING_IMPL_H \ No newline at end of file diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 8eb316e06a79df4a3dd3ae82c653cf84a0eca146..015fe49e01be05eb8ac87b051bfad39c7063ebc9 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -20,7 +20,7 @@ #include "../../impl/matmul/modules/matmul_module.h" #include "../../impl/matmul/modules/matmul_param.h" #include "../../impl/matmul/matmul_macro_def.h" -namespace matmul { +namespace Gemm { constexpr int32_t DOUBLE_SIZE = 2; @@ -302,10 +302,10 @@ private: MATMUL_USE_MODULE(BatchCopyCubeInA); MATMUL_USE_MODULE(BatchCopyCubeInB); - using ChosenCopyCubeInA = typename AscendC::Conditional() != CopyCubeInType::BMM, + using ChosenCopyCubeInA = typename AscendC::Conditional() != Impl::Detail::CopyCubeInType::BMM, CopyCubeInA, BatchCopyCubeInA>::type; - using ChosenCopyCubeInB = typename AscendC::Conditional() != CopyCubeInType::BMM, + using ChosenCopyCubeInB = typename AscendC::Conditional() != Impl::Detail::CopyCubeInType::BMM, CopyCubeInB, BatchCopyCubeInB>::type; MATMUL_USE_MODULE(ChosenCopyCubeInA); MATMUL_USE_MODULE(ChosenCopyCubeInB); @@ -431,7 +431,7 @@ private: __aicore__ inline int GetND2NZOffsetB(); private: - typename MatmulParams::PARAMS var; + typename Impl::Detail::MatmulParams::PARAMS var; #if __CCE_AICORE__ < 220 constexpr static int L1Size_ = 1024 * 1024; @@ -3236,8 +3236,8 @@ __aicore__ inline void MatmulImplBase l1AAync; - AsyncTensor l1BAync; + Gemm::Impl::Detail::AsyncTensor l1AAync; + Gemm::Impl::Detail::AsyncTensor l1BAync; // This flag needs to be set to 0 only 
when the outer axis is cut to K. // Currently, all K processed at a time. if (k == 0) { @@ -3521,8 +3521,8 @@ __aicore__ inline void MatmulImplBase l1AAync; - AsyncTensor l1BAync; + Gemm::Impl::Detail::AsyncTensor l1AAync; + Gemm::Impl::Detail::AsyncTensor l1BAync; if (k == 0) { MatmulInstr::sL0cInit_ = enPartialSum ? 0 : 1; if constexpr (ToMatmulConfig(MM_CFG).doMTE2Preload == 1) { @@ -3760,8 +3760,8 @@ __aicore__ inline void MatmulImplBase l1AAync; - AsyncTensor l1BAync; + Gemm::Impl::Detail::AsyncTensor l1AAync; + Gemm::Impl::Detail::AsyncTensor l1BAync; if constexpr (ToMatmulConfig(MM_CFG).doMTE2Preload == 1) { // preload in M direct if (var.cacheA1Factor_ == 1 && (var.curN_ % var.tiling_.GetStepN() == 0) && @@ -4009,7 +4009,7 @@ __aicore__ inline void MatmulImplBase= (stepKbIdx_ * stepKb)", k, var.minStepK_, var.stepKbIdx_, tilingStepKb); }); - AsyncTensor l1BAync; + Gemm::Impl::Detail::AsyncTensor l1BAync; for (int i = 0; i < var.tiling_.GetStepN(); i++) { int curN = var.curN_ * var.tiling_.GetStepN() + i; @@ -5261,7 +5261,7 @@ __aicore__ inline void MatmulImplBase l1BAync; + Gemm::Impl::Detail::AsyncTensor l1BAync; l1BAync = MATMUL_MODULE(CopyCubeInB)->AsyncLoadData(0, 0, singleK, singleN); uint16_t bl1n = Ceil(singleN, BLOCK_CUBE) * BLOCK_CUBE; uint16_t bl1k; @@ -7389,5 +7389,5 @@ __aicore__ inline void MatmulImplBase -struct MatmulMacroImpl { - __aicore__ inline MatmulMacroImpl() {}; -}; - -#if __CCE_AICORE__ >= 220 -// CFG_NORM -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); - using PARAMS = MacroMatmul; -}; -// CFG_MDL -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); - using PARAMS = MacroMatmul; -}; -// CFG_IBSHARE_NORM -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); - using PARAMS = MacroMatmul; -}; -#elif __CCE_AICORE__ == 200 -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulV200; -}; -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulV200; -}; -#endif - -// MM_CFG_BB -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulBasic; -}; - -} +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! 
+* \file matmul_macro_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MATMUL_MACRO_DEF_H +#define IMPL_MATMUL_MATMUL_MACRO_DEF_H +#include "matmul_utils.h" +#include "matmul_macro_utils.h" +#include "matmul_macro_v220_impl.h" +#include "matmul_macro_v220_l0cache_impl.h" +#include "matmul_macro_v220_basic_impl.h" +#include "matmul_macro_v200_impl.h" +#include "modules/matmul_param.h" + +namespace Gemm { + +/* ************************************************************************************************** + * MatmulMacroImpl * + * ************************************************************************************************* */ +template +struct MatmulMacroImpl { + __aicore__ inline MatmulMacroImpl() {}; +}; + +#if __CCE_AICORE__ >= 220 +// CFG_NORM +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); + using PARAMS = MacroMatmul()>; +}; +// CFG_MDL +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); + using PARAMS = MacroMatmul()>; +}; +// CFG_IBSHARE_NORM +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); + using PARAMS = MacroMatmul; +}; +#elif __CCE_AICORE__ == 200 +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulV200; +}; +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulV200; +}; +#endif + +// MM_CFG_BB +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulBasic; +}; + +} // namespace Gemm #endif // _MATMUL_MACRO_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/matmul_macro_utils.h b/impl/matmul/matmul_macro_utils.h index 9ea71d435c0ed836d8bd27e1daa1e616fcc975ee..4884130422aa6d7a459f30906c6ed74a9270d3c1 100644 --- a/impl/matmul/matmul_macro_utils.h +++ b/impl/matmul/matmul_macro_utils.h @@ -35,7 +35,7 @@ constexpr int32_t SHIFT_48_BIT = 48; constexpr int32_t SHIFT_56_BIT = 56; constexpr int32_t CTRL_51_BIT = 51; constexpr uint8_t padList[4] = {0, 0, 0, 0}; -namespace matmul { +namespace Gemm { __aicore__ inline uint16_t CeilDiv(uint16_t num1, uint16_t num2) { ASSERT(num2 > 0); diff --git a/impl/matmul/matmul_macro_v200_impl.h b/impl/matmul/matmul_macro_v200_impl.h index d8ce2026254571822ce6eb705894f8727b717eec..f4f162e53af7ba54b1d5d4c9cbf1cd784bd5091b 100644 --- a/impl/matmul/matmul_macro_v200_impl.h +++ b/impl/matmul/matmul_macro_v200_impl.h @@ -18,7 +18,7 @@ #include "kernel_operator.h" #include "matmul_macro_utils.h" -namespace matmul { +namespace Gemm { using namespace AscendC; // ===========mad template=================/ @@ -383,5 +383,5 @@ inline __aicore__ void MacroMatmulV200 +class MacroMatmul { +public: + inline __aicore__ MacroMatmul(){}; + inline __aicore__ ~MacroMatmul(); +#ifdef ASCENDC_CPU_DEBUG + // addr + uint64_t L0A_PING = L0A_PING_D; + uint64_t L0A_PONG = L0A_PONG_D; + 
uint64_t L0B_PING = L0B_PING_D; + uint64_t L0B_PONG = L0B_PONG_D; + uint64_t BIAS_PING = BIAS_PING_D; + uint64_t BIAS_PONG = BIAS_PONG_D; +#else + constexpr static uint64_t L0A_PING = L0A_PING_D; + constexpr static uint64_t L0A_PONG = L0A_PONG_D; + constexpr static uint64_t L0B_PING = L0B_PING_D; + constexpr static uint64_t L0B_PONG = L0B_PONG_D; + constexpr static uint64_t BIAS_PING = BIAS_PING_D; + constexpr static uint64_t BIAS_PONG = BIAS_PONG_D; +#endif + // args + uint16_t sAL1M_; + uint16_t sAL1K_; + uint16_t sAL1MOffset_; + uint16_t sAL1KOffset_; + uint16_t sBL1N_; + uint16_t sBL1K_; + uint16_t sBL1NOffset_; + uint16_t sBL1KOffset_; + uint16_t sL1BiasOffset_; + uint16_t sMadM_; + uint16_t sMadN_; + uint16_t sMadK_; + uint16_t sMad0K_; + uint16_t sL0cInit_; // 0; normal 1:init + uint16_t sL0cLast_; // 0; normal 1:last + uint64_t useL0PingPong_; + // feature map + constexpr static uint16_t sFmH_ = 1; + // state + uint16_t ssAl0PingPongFlag_; + uint16_t ssBl0PingPongFlag_; + constexpr static uint16_t ssBiasFull_ = 0; + uint16_t ssBiasPingPongFlag_; + uint16_t kDirectionAlign_; + // instance args + // 0:format(M, K) + // 1:format(K, M), need set transpose + uint16_t ssAmatrixTranspose_; + // 0:format(K, N), use load3dv2 carry + // 1:format(N, K), use load2d carry + uint16_t ssBmatrixTranspose_; + // 0: no bias + // 1: fp16 + // 2: fp32 + uint16_t biasType_; + constexpr static uint16_t typeSize_ = sizeof(A_T); + A_T aScalar_; + A_T bScalar_; + // tpipe + TBuf l0aBuf_; + TBuf l0bBuf_; + TBuf biasBuf_; +#ifdef ASCENDC_CPU_DEBUG + uint64_t pA; + uint64_t pB; + uint64_t pBias; + bool initFlag = false; +#endif + + inline __aicore__ void Init() {} + inline __aicore__ void Release() {} + inline __aicore__ void ResetCache() {} + template + inline __aicore__ void Compute(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, + const LocalTensor &cMatrix, const LocalTensor &bias, + int64_t offsetb = 0, uint8_t subIdx = 0, uint16_t sMadMStep = 0, uint16_t sMadNStep = 0, + uint32_t posA = 0, uint32_t posB = 0) {} + template + inline __aicore__ void ComputeWithMdb(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, + const LocalTensor &cMatrix, const LocalTensor &bias, uint64_t kC0Tail, uint64_t kTail, + uint16_t sMadMStep) {} + template + inline __aicore__ void ComputeWithNdb(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, + const LocalTensor &cMatrix, const LocalTensor &bias, uint64_t kC0Tail, uint64_t kTail, + uint16_t sMadNStep) {} + inline __aicore__ void InitSetFlag() {} + inline __aicore__ void LoadL12L0BFullLoad(const LocalTensor &l1B, uint8_t subIdx, + uint16_t sMad0K, uint16_t sMadN, uint16_t sBL1N, uint16_t sBL1NOffset, uint16_t sBL1KOffset, + uint16_t offset) {} + inline __aicore__ constexpr static uint16_t GetHwK0() + { + return 0; + } + inline __aicore__ constexpr static madtype GetMode() + { + return F162F32; + } + constexpr static madtype mode_ = GetMode(); +private: + inline __aicore__ void LoadL12L0A(uint64_t k_inner, uint64_t aPoskPtr, uint16_t usedK, + const LocalTensor &l1A, LocalTensor &l0A) {} + inline __aicore__ void LoadL12L0B(uint64_t k_inner, uint16_t usedK, + const LocalTensor &l1B, LocalTensor &l0B) {} + inline __aicore__ void MmadMacro(const LocalTensor &l0A, const LocalTensor &l0B, + const LocalTensor &cMatrix, uint16_t mmadK, uint8_t unitFlag, bool l0c_initial) {} +}; + +} // namespace Gemm +#endif \ No newline at end of file diff --git a/impl/matmul/matmul_macro_v220_l0cache_impl.h b/impl/matmul/matmul_macro_v220_l0cache_impl.h new file mode 
100644 index 0000000000000000000000000000000000000000..ee535addb4c3d9b752a1ab0fc73e5b2bb5b92004 --- /dev/null +++ b/impl/matmul/matmul_macro_v220_l0cache_impl.h @@ -0,0 +1,1009 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_macro_v220_impl.h + * \brief + */ +#ifndef IMPL_MATMUL_MATMUL_MACRO_V220_L0CACHE_IMPL_H +#define IMPL_MATMUL_MATMUL_MACRO_V220_L0CACHE_IMPL_H + +#include "matmul_macro_v220_intf.h" + +namespace Gemm { +using namespace AscendC; + +// ===========mad template=================/ +// Cmatrix type, Amatrix type, Bmatrix type, L0C_using_uniflag, L0C_using_hset +template +class MacroMatmul> { +public: + inline __aicore__ MacroMatmul(){}; + inline __aicore__ ~MacroMatmul(); +#ifdef ASCENDC_CPU_DEBUG + // addr + uint64_t L0A_PING = L0A_PING_D; + uint64_t L0A_PONG = L0A_PONG_D; + uint64_t L0B_PING = L0B_PING_D; + uint64_t L0B_PONG = L0B_PONG_D; + uint64_t BIAS_PING = BIAS_PING_D; + uint64_t BIAS_PONG = BIAS_PONG_D; +#else + constexpr static uint64_t L0A_PING = L0A_PING_D; + constexpr static uint64_t L0A_PONG = L0A_PONG_D; + constexpr static uint64_t L0B_PING = L0B_PING_D; + constexpr static uint64_t L0B_PONG = L0B_PONG_D; + constexpr static uint64_t BIAS_PING = BIAS_PING_D; + constexpr static uint64_t BIAS_PONG = BIAS_PONG_D; +#endif + // args + uint16_t sAL1M_; + uint16_t sAL1K_; + uint16_t sAL1MOffset_; + uint16_t sAL1KOffset_; + uint16_t sBL1N_; + uint16_t sBL1K_; + uint16_t sBL1NOffset_; + uint16_t sBL1KOffset_; + uint16_t sL1BiasOffset_; + uint16_t sMadM_; + uint16_t sMadN_; + uint16_t sMadK_; + uint16_t sMad0K_; + uint16_t sL0cInit_; // 0; normal 1:init + uint16_t sL0cLast_; // 0; normal 1:last + uint64_t useL0PingPong_; + // feature map + constexpr static uint16_t sFmH_ = 1; + // state + uint16_t ssAl0PingPongFlag_; + uint16_t ssBl0PingPongFlag_; + constexpr static uint16_t ssBiasFull_ = 0; + uint16_t ssBiasPingPongFlag_; + uint16_t kDirectionAlign_; + // instance args + // 0:format(M, K) + // 1:format(K, M), need set transpose + uint16_t ssAmatrixTranspose_; + // 0:format(K, N), use load3dv2 carry + // 1:format(N, K), use load2d carry + uint16_t ssBmatrixTranspose_; + // 0: no bias + // 1: fp16 + // 2: fp32 + uint16_t biasType_; + constexpr static uint16_t typeSize_ = sizeof(A_T); + A_T aScalar_; + A_T bScalar_; + // tpipe + TBuf l0aBuf_; + TBuf l0bBuf_; + TBuf biasBuf_; +#ifdef ASCENDC_CPU_DEBUG + uint64_t pA; + uint64_t pB; + uint64_t pBias; + bool initFlag = false; +#endif + int32_t cachePosA_ { 0 }; + int32_t cachePosB_ { 0 }; + int32_t cacheProcA_ { 0 }; + int32_t cacheProcB_ { 0 }; + + inline __aicore__ void Init(); + inline __aicore__ void Release(); + inline __aicore__ void ResetCache(); + template + inline __aicore__ void Compute(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, + const LocalTensor &cMatrix, const LocalTensor &bias, + int64_t offsetb = 0, uint8_t subIdx = 0, uint16_t sMadMStep = 0, uint16_t sMadNStep = 0, + uint32_t posA = 0, uint32_t posB = 0); + template 
+ inline __aicore__ void ComputeWithMdb(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, + const LocalTensor &cMatrix, const LocalTensor &bias, uint64_t kC0Tail, uint64_t kTail, + uint16_t sMadMStep); + template + inline __aicore__ void ComputeWithNdb(const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, + const LocalTensor &cMatrix, const LocalTensor &bias, uint64_t kC0Tail, uint64_t kTail, + uint16_t sMadNStep); + inline __aicore__ void InitSetFlag(); + inline __aicore__ void LoadL12L0BFullLoad(const LocalTensor &l1B, uint8_t subIdx, + uint16_t sMad0K, uint16_t sMadN, uint16_t sBL1N, uint16_t sBL1NOffset, uint16_t sBL1KOffset, + uint16_t offset); + inline __aicore__ constexpr static uint16_t GetHwK0() + { + if constexpr (IsSameType::value && sizeof(A_T) == sizeof(half)) { + return 16; + } else if constexpr (IsSameType::value && IsSameType::value) { + return 8; + } else if constexpr (IsSameType::value) { + return 32; + } else if constexpr (IsSameType::value) { + return 64; + } + } + inline __aicore__ constexpr static madtype GetMode() { + if constexpr (IsSameType::value && sizeof(A_T) == sizeof(half)) { + return F162F32; + } else if constexpr (IsSameType::value && IsSameType::value) { + return F322F32; + } else if constexpr (IsSameType::value) { + return S82S32; + } else if constexpr (IsSameType::value) { + return S42S32; + } else { + return F162F32; + } + } + constexpr static madtype mode_ = GetMode(); +private: + inline __aicore__ void LoadL12L0A(uint64_t k_inner, uint64_t aPoskPtr, uint16_t usedK, + const LocalTensor &l1A, LocalTensor &l0A); + inline __aicore__ void LoadL12L0B(uint64_t k_inner, uint16_t usedK, + const LocalTensor &l1B, LocalTensor &l0B); + inline __aicore__ void LoadL12L0ACache(uint32_t posA, uint64_t k_inner, uint64_t aPoskPtr, uint16_t usedK, + const LocalTensor &l1AMatrix, LocalTensor &l0a); + template + inline __aicore__ void LoadL12L0BCache(uint32_t posB, uint64_t k_inner, int64_t offsetb, + uint16_t usedK, const LocalTensor &l1BMatrix, LocalTensor &l0b); + inline __aicore__ void MmadMacro(const LocalTensor &l0A, const LocalTensor &l0B, + const LocalTensor &cMatrix, uint16_t mmadK, uint8_t unitFlag, bool l0c_initial); +}; + +template +inline __aicore__ MacroMatmul>::~MacroMatmul() +{ +#ifdef ASCENDC_CPU_DEBUG + if (initFlag) { + free((__ca__ A_T *)pA); + free((__cb__ B_T *)pB); + free((C_T *)pBias); + } +#endif +} + +template +inline __aicore__ void MacroMatmul>::MmadMacro( + const LocalTensor &l0A, const LocalTensor &l0B, const LocalTensor &cMatrix, + uint16_t mmadK, uint8_t unitFlag, bool l0c_initial) +{ + uint16_t madM = sMadM_; + if constexpr (GEMV_MODE >= 1) { + madM = 1; + } else { + if (madM == 1) { + madM = 16; + } + } + + MmadParams mmadParams; + mmadParams.m = madM; + mmadParams.k = mmadK; + mmadParams.n = sMadN_; + mmadParams.unitFlag = unitFlag; + mmadParams.kDirectionAlign = kDirectionAlign_; + if (biasType_) { + mmadParams.cmatrixSource = l0c_initial; + mmadParams.cmatrixInitVal = false; + } else { + mmadParams.cmatrixSource = false; + mmadParams.cmatrixInitVal = l0c_initial; + } + Mmad(cMatrix, l0A, l0B, mmadParams); + + if ((madM / ALIGN_NUM) * (sMadN_ / ALIGN_NUM) < 10) { + PipeBarrier(); + } +} + +template +inline __aicore__ void MacroMatmul>::LoadL12L0A( + uint64_t k_inner, uint64_t aPoskPtr, uint16_t usedK, + const LocalTensor &l1A, LocalTensor &l0A) +{ + if constexpr (GEMV_MODE == 2) { + ASSERT(sMadM_ == 1); + InitConstValueParams initConstValueParams {1, (uint16_t)ConstCeil(sMadK_, BLOCK_CUBE * GetHwK0()), 0, aScalar_}; + 
InitConstValue(l0A, initConstValueParams); + return; + } + if constexpr (GEMV_MODE == 1) { + int FracSize = BYTE_PER_FRACTAL / sizeof(B_T); + int repeat = CeilDiv(usedK, FracSize); + // aPoskPtr is unit of element + LoadData2dParams loadDataParams; + loadDataParams.repeatTimes = repeat; + loadDataParams.srcStride = 1; + loadDataParams.dstGap = 0; + loadDataParams.ifTranspose = 0; + LoadData(l0A[0], l1A[aPoskPtr], loadDataParams); + return; + } + if (ssAmatrixTranspose_ > 0) { + // K_axis is m direction, and M_axis is k direction in load3d intrin + if constexpr (IsSameType::value) { + uint16_t sMad0MAlign = CeilAlign(sMadM_, GetHwK0()); + uint16_t l0aloop = sMad0MAlign / GetHwK0(); + uint8_t l0aRepeat = CeilDiv(usedK, GetHwK0()); + uint64_t l0aSrcAddrStride = sAL1K_ * GetHwK0() ; + uint64_t l0aDstAddrStride = CeilDiv(usedK, GetHwK0()) * GetHwK0() * GetHwK0(); + +#if __CCE_AICORE__ >= 300 + uint64_t l1aOffset = CeilDiv(sAL1MOffset_, GetHwK0()) * GetHwK0() * GetHwK0() * typeSize_ + + k_inner * l0aRepeat * GetHwK0() * GetHwK0() * typeSize_; +#else + uint8_t l0aRepeatOffset = CeilDiv(sMad0K_, GetHwK0()); + uint64_t l1aOffset = CeilDiv(sAL1KOffset_, GetHwK0()) * GetHwK0() * GetHwK0() * typeSize_ + + k_inner * l0aRepeatOffset * GetHwK0() * GetHwK0() * typeSize_; +#endif + uint64_t l0aOffset = 0; + LoadData2dTransposeParams loadData2dTransposeParams; + loadData2dTransposeParams.startIndex = 0; + loadData2dTransposeParams.repeatTimes = l0aRepeat; + loadData2dTransposeParams.srcStride = 1; + loadData2dTransposeParams.dstGap = 0; + loadData2dTransposeParams.dstFracGap = (uint16_t)(l0aRepeat - 1); + loadData2dTransposeParams.addrMode = inc; + for (uint16_t i = 0; i < l0aloop; ++i) { + LoadDataWithTranspose(l0A[l0aOffset], l1A[l1aOffset], loadData2dTransposeParams); + l1aOffset += l0aSrcAddrStride; + l0aOffset += l0aDstAddrStride; + } + } else { + // format(K, M), K, M need to be 16 aligned for f32 + uint16_t madMAlign = CeilAlign(sMadM_, ALIGN_NUM); + uint16_t usedKAlign = CeilAlign(usedK, HW_M0); + uint16_t sAL1MAlign = CeilAlign(sAL1M_, ALIGN_NUM); + LoadData3DParamsV2Pro loadData3DV2; + loadData3DV2.channelSize = sAL1MAlign; + loadData3DV2.extConfig = ((uint64_t)aPoskPtr << 48) | ((uint64_t)sAL1MOffset_ << 32) | + ((uint64_t)usedKAlign << 16) | (uint64_t)madMAlign; + loadData3DV2.enTranspose = true; +#if __CCE_AICORE__ >= 220 && __CCE_AICORE__ != 310 + if constexpr (IsSameType::value) { + LoadData(l0A[0], l1A[0], loadData3DV2); + } else { + LoadData(l0A[0], l1A[0], loadData3DV2); + } +#else + LoadData(l0A[0], l1A[0], loadData3DV2); +#endif + } + } else { + // format(M, K), K_axis is k direction, and M_axis is m direction in load3d intrin + uint16_t madMAlign = CeilAlign(sMadM_, HW_M0); + uint16_t usedKAlign = CeilAlign(usedK, GetHwK0()); + uint16_t sAL1KAlign = CeilAlign(sAL1K_, GetHwK0()); + LoadData3DParamsV2Pro loadData3DV2; + loadData3DV2.channelSize = sAL1KAlign; + loadData3DV2.extConfig = ((uint64_t)sAL1MOffset_ << 48) | ((uint64_t)aPoskPtr << 32) | + ((uint64_t)madMAlign << 16) | (uint64_t)usedKAlign; +#if __CCE_AICORE__ >= 220 && __CCE_AICORE__ != 310 + if constexpr (IsSameType::value) { + LoadData(l0A[0], l1A[0], loadData3DV2); + } else { + LoadData(l0A[0], l1A[0], loadData3DV2); + } +#else + LoadData(l0A[0], l1A[0], loadData3DV2); +#endif + } +} + +template +inline __aicore__ void MacroMatmul>::InitSetFlag() +{ + SetFlag(EVENT_ID0); + SetFlag(EVENT_ID1); +} + +template +inline __aicore__ void MacroMatmul>::LoadL12L0BFullLoad( + const LocalTensor &l1B, uint8_t subIdx, uint16_t sMad0K, 
uint16_t sMadN, uint16_t sBL1N, + uint16_t sBL1NOffset, uint16_t sBL1KOffset, uint16_t offset) +{ + auto l0b = l0bBuf_.Get(); + if ((subIdx) != 0) { + l0b = l0b[L0BUF_SIZE / 2 / sizeof(B_T)]; + } + if (ssBmatrixTranspose_ > 0) { + // SET LOAD2D parameters , loop axis: K or M, or 1 + // k is GetHwK0() aligned for f32 + uint16_t sMad0KAlign = CeilAlign(sMad0K, GetHwK0()); + uint16_t kC0 = sMad0KAlign / GetHwK0(); + uint16_t nFraC0 = CeilDiv(sMadN, HW_N0); + uint16_t l0bLoop = 1; + uint64_t l0bSrcAddrStride = 0; + uint64_t l0bDstAddrStride = 0; + uint8_t l0bRepeat = kC0 * nFraC0; + uint16_t l0bSrcstride = 1; + uint16_t l0bDststride = 0; + + if (nFraC0 * HW_N0 == sBL1N) { + l0bLoop = 1; // loop=1 + } else if (nFraC0 >= kC0) { // LOOP is K and repeat is n axis + l0bLoop = kC0; + l0bSrcAddrStride = sBL1N * GetHwK0() * typeSize_; + l0bDstAddrStride = nFraC0 * HW_N0 * GetHwK0() * typeSize_; + l0bRepeat = nFraC0; + + l0bSrcstride = 1; + l0bDststride = 0; + } else { // LOOP is N and repeat is K axis + l0bLoop = nFraC0; + l0bSrcAddrStride = HW_N0 * GetHwK0() * typeSize_; + l0bDstAddrStride = HW_N0 * GetHwK0() * typeSize_; + l0bRepeat = kC0; + + l0bSrcstride = (sBL1N + HW_N0 - 1) / HW_N0; + l0bDststride = nFraC0 - 1; + } + // use load2d for L1_2_L0B + LoadData2dParams loadDataParams; + loadDataParams.repeatTimes = l0bRepeat; + loadDataParams.srcStride = l0bSrcstride; + loadDataParams.dstGap = l0bDststride; + loadDataParams.ifTranspose = 0; + uint64_t l1bOffset = sBL1NOffset * GetHwK0() + sBL1KOffset * sBL1N; + uint64_t l0bOffset = offset; + for (uint64_t i = 0; i < l0bLoop; i++) { + LoadData(l0b[l0bOffset], l1B[l1bOffset], loadDataParams); + l1bOffset += (l0bSrcAddrStride / typeSize_); + l0bOffset += (l0bDstAddrStride / typeSize_); + } + } else { + // use load3dv2 for L1_2_L0B + // n_axis is K direction, need to be 16 aligned + uint16_t kAlign = CeilAlign(sMadN, ALIGN_NUM); + uint16_t mPos = sBL1KOffset; + // channel size need to be 16 aligned + uint16_t cAlign = CeilAlign(sBL1N, ALIGN_NUM); + // k_axis is M direction, need to be HW_M0 aligned + uint16_t mAlign = CeilAlign(sMad0K, HW_M0); + // StepN need to be aligned + + LoadData3DParamsV2Pro loadData3DV2; + loadData3DV2.channelSize = cAlign; + loadData3DV2.extConfig = ((uint64_t)mPos << 48) | ((uint64_t)sBL1NOffset << 32) | + ((uint64_t)mAlign << 16) | (uint64_t)kAlign; + loadData3DV2.fMatrixCtrl = true; +#if __CCE_AICORE__ >= 220 && __CCE_AICORE__ != 310 + if constexpr (IsSameType::value) { + LoadData(l0b[offset], l1B[0], loadData3DV2); + } else { + LoadData(l0b[offset], l1B[0], loadData3DV2); + } +#else + LoadData(l0b[offset], l1B[0], loadData3DV2); +#endif + } +} + +template +inline __aicore__ void MacroMatmul>::LoadL12L0B( + uint64_t k_inner, uint16_t usedK, const LocalTensor &l1B, LocalTensor &l0B) +{ + uint16_t sMad0KAlign = CeilAlign(sMad0K_, GetHwK0()); + uint16_t kC0 = sMad0KAlign / GetHwK0(); + if (ssBmatrixTranspose_ > 0) { + // SET LOAD2D parameters , loop axis: K or M, or 1 + // k is GetHwK0() aligned for f32 + uint16_t nFraC0 = CeilDiv(sMadN_, HW_N0); + uint16_t l0bLoop = 1; + uint64_t l0bSrcAddrStride = 0; + uint64_t l0bDstAddrStride = 0; + uint8_t l0bRepeat = kC0 * nFraC0; + uint16_t l0bSrcstride = 1; + uint16_t l0bDststride = 0; + + if (nFraC0 * HW_N0 == sBL1N_) { + l0bLoop = 1; // loop=1 + } else if (nFraC0 >= kC0) { // LOOP is K and repeat is n axis + l0bLoop = kC0; + l0bSrcAddrStride = sBL1N_ * GetHwK0() * typeSize_; + l0bDstAddrStride = nFraC0 * HW_N0 * GetHwK0() * typeSize_; + l0bRepeat = nFraC0; + + l0bSrcstride = 1; + 
l0bDststride = 0; + } else { // LOOP is N and repeat is K axis + l0bLoop = nFraC0; + l0bSrcAddrStride = HW_N0 * GetHwK0() * typeSize_; + l0bDstAddrStride = HW_N0 * GetHwK0() * typeSize_; + l0bRepeat = kC0; + + l0bSrcstride = (sBL1N_ + HW_N0 - 1) / HW_N0; + l0bDststride = nFraC0 - 1; + } + // use load2d for L1_2_L0B + LoadData2dParams loadDataParams; + loadDataParams.repeatTimes = l0bRepeat; + loadDataParams.srcStride = l0bSrcstride; + loadDataParams.dstGap = l0bDststride; + loadDataParams.ifTranspose = 0; + uint64_t l1bOffset = sBL1NOffset_ * GetHwK0() + sBL1KOffset_ * sBL1N_ + + k_inner * kC0 * GetHwK0() * sBL1N_; + uint64_t l0bOffset = 0; + for (uint64_t i = 0; i < l0bLoop; i++) { + LoadData(l0B[l0bOffset], l1B[l1bOffset], loadDataParams); + l1bOffset += (l0bSrcAddrStride / typeSize_); + l0bOffset += (l0bDstAddrStride / typeSize_); + } + } else { + if constexpr (IsSameType::value || IsSameType::value) { + // use load2d transpose for L1_2_L0B + uint16_t sMad0KAlign = CeilAlign(usedK, GetHwK0()); + uint16_t l0bloop = sMad0KAlign / GetHwK0(); + uint16_t l0bSrcstride = CeilDiv(sBL1K_, GetHwK0()); + uint16_t l0bRepeat = CeilDiv(sMadN_, GetHwK0()); + uint64_t l0bSrcAddrStride = GetHwK0() * GetHwK0(); + uint64_t l0bDstAddrStride = CeilDiv(sMadN_, 16) * 16 * GetHwK0(); + uint64_t l1bOffset = sBL1NOffset_ * sBL1K_ * typeSize_ + sBL1KOffset_ * GetHwK0() * typeSize_ + + k_inner * kC0 * GetHwK0() * GetHwK0() * typeSize_; + uint64_t l0bOffset = 0; + + LoadData2dTransposeParams loadData2dTransposeParams; + loadData2dTransposeParams.startIndex = 0; + loadData2dTransposeParams.repeatTimes = l0bRepeat; + loadData2dTransposeParams.srcStride = l0bSrcstride; + loadData2dTransposeParams.dstGap = 1; + if constexpr (IsSameType::value) { + loadData2dTransposeParams.dstGap = CeilDiv(GetHwK0(), 16) - 1; + } + loadData2dTransposeParams.dstFracGap = 0; + loadData2dTransposeParams.addrMode = inc; + + for (uint64_t i = 0; i < l0bloop; i++) { + LoadDataWithTranspose(l0B[l0bOffset], l1B[l1bOffset], loadData2dTransposeParams); + l1bOffset += l0bSrcAddrStride; + l0bOffset += l0bDstAddrStride; + } + } else { + // use load3dv2 for L1_2_L0B + // n_axis is K direction, need to be 16 aligned + uint16_t kAlign = CeilAlign(sMadN_, ALIGN_NUM); + uint16_t mPos = sBL1KOffset_ + k_inner * sMad0K_; + // channel size need to be 16 aligned + uint16_t cAlign = CeilAlign(sBL1N_, ALIGN_NUM); + // k_axis is M direction, need to be HW_M0 aligned + uint16_t mAlign = CeilAlign(usedK, HW_M0); + // StepN need to be aligned + LoadData3DParamsV2Pro loadData3DV2; + loadData3DV2.channelSize = cAlign; + loadData3DV2.extConfig = ((uint64_t)mPos << 48) | ((uint64_t)sBL1NOffset_ << 32) | + ((uint64_t)mAlign << 16) | (uint64_t)kAlign; + loadData3DV2.fMatrixCtrl = true; +#if __CCE_AICORE__ >= 220 && __CCE_AICORE__ != 310 + if constexpr (IsSameType::value) { + LoadData(l0B[0], l1B[0], loadData3DV2); + } else { + LoadData(l0B[0], l1B[0], loadData3DV2); + } +#else + LoadData(l0B[0], l1B[0], loadData3DV2); +#endif + } + } +} + +// initialization +template +inline __aicore__ void MacroMatmul>::Init() +{ + if constexpr (unlikely(UNIFLAG_EN)) { + SetMMLayoutTransform(0); + } +#ifdef ASCENDC_CPU_DEBUG + // allocate 64K L0A space for cpu debug + pA = (uint64_t)((__ca__ A_T *)malloc(L0AUF_SIZE)); + // allocate 64K L0B space for cpu debug + pB = (uint64_t)((__cb__ B_T *)malloc(L0BUF_SIZE)); + pBias = (uint64_t)((C_T *)malloc(BIAS_BUF_SIZE)); + initFlag = true; + L0A_PING += pA; + L0A_PONG += pA; + L0B_PING += pB; + L0B_PONG += pB; + BIAS_PING += pBias; + BIAS_PONG 
+= pBias; +#endif + ssAl0PingPongFlag_ = 0; + ssBl0PingPongFlag_ = 0; + // close D,bias address need taked from Xd[63:32] + ssBiasPingPongFlag_ = 0; + ssAmatrixTranspose_ = 0; + ssBmatrixTranspose_ = 0; + biasType_ = 0; + + kDirectionAlign_ = 0; + + + sL0cInit_ = 1; + sL0cLast_ = 0; + + GetTPipePtr()->InitBuffer(l0aBuf_, L0AUF_SIZE); + GetTPipePtr()->InitBuffer(l0bBuf_, L0BUF_SIZE); + GetTPipePtr()->InitBuffer(biasBuf_, BIAS_BUF_SIZE); +} + +template +inline __aicore__ void MacroMatmul>::Release() +{ + WaitFlag(EVENT_ID0); + WaitFlag(EVENT_ID1); +} + +template +inline __aicore__ void MacroMatmul>::ResetCache() +{ + cachePosA_ = 0; + cachePosB_ = 0; + cacheProcA_ = 0; + cacheProcB_ = 0; +} + +template +inline __aicore__ void MacroMatmul>::LoadL12L0ACache( + uint32_t posA, uint64_t k_inner, uint64_t aPoskPtr, uint16_t usedK, + const LocalTensor &l1AMatrix, LocalTensor &l0a) +{ + bool hitCachePing = posA == cachePosA_ ? true : false; + bool hitCachePong = posA == cachePosA_ + ssAl0PingPongFlag_ ? true : false; + + // update cache ping or pong + if (hitCachePing) { + ssAl0PingPongFlag_ = 0; + } else if (hitCachePong) { + ssAl0PingPongFlag_ = 1; + } else { + ssAl0PingPongFlag_ = cacheProcA_ % DB_FACTOR == 0 ? 0 : 1; + } + + if (ssAl0PingPongFlag_) { + if constexpr (IsSameType::value) { + l0a = l0a[L0AUF_SIZE / sizeof(A_T)]; + } else { + l0a = l0a[L0AUF_SIZE / DB_FACTOR / sizeof(A_T)]; + } + } + WaitFlag(ssAl0PingPongFlag_); + + if (cacheProcA_ == 0 && posA == 0) { + ++cacheProcA_; + return LoadL12L0A(k_inner, aPoskPtr, usedK, l1AMatrix, l0a); + } + if (!hitCachePing && !hitCachePong) { + LoadL12L0A(k_inner, aPoskPtr, usedK, l1AMatrix, l0a); + ++cacheProcA_; + if (ssAl0PingPongFlag_ == 0) { + cachePosA_ = posA; + } + } +} + +template +template +inline __aicore__ void MacroMatmul>::LoadL12L0BCache( + uint32_t posB, uint64_t k_inner, int64_t offsetb, uint16_t usedK, + const LocalTensor &l1BMatrix, LocalTensor &l0b) +{ + bool hitCachePing = posB == cachePosB_ ? true : false; + bool hitCachePong = posB == cachePosB_ + ssBl0PingPongFlag_ ? true : false; + + // update cache ping or pong + if (hitCachePing) { + ssBl0PingPongFlag_ = 0; + } else if (hitCachePong) { + ssBl0PingPongFlag_ = 1; + } else { + ssBl0PingPongFlag_ = cacheProcB_ % DB_FACTOR == 0 ? 
0 : 1; + } + + if (ssBl0PingPongFlag_) { + if constexpr (IsSameType::value) { + if constexpr (!intraBlockPartSum) { + l0b = l0b[L0BUF_SIZE / sizeof(B_T)]; + } + } else { + if constexpr (!intraBlockPartSum) { + l0b = l0b[L0BUF_SIZE / DB_FACTOR / sizeof(B_T)]; + } + } + } + + if (cacheProcB_ == 0 && posB == 0) { + ++cacheProcB_; + return LoadL12L0B(k_inner, usedK, l1BMatrix, l0b); + } + if (!hitCachePing && !hitCachePong) { + if constexpr (!intraBlockPartSum) { + LoadL12L0B(k_inner, usedK, l1BMatrix, l0b); + } else { + l0b = l0b[offsetb]; + } + ++cacheProcB_; + if (ssBl0PingPongFlag_ == 0) { + cachePosB_ = posB; + } + } +} + +template +template +inline __aicore__ void MacroMatmul>::Compute( + const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, const LocalTensor &cMatrix, + const LocalTensor &bias, int64_t offsetb, uint8_t subIdx, uint16_t sMadMStep, uint16_t sMadNStep, + uint32_t posA, uint32_t posB) +{ + uint16_t madKC0 = CeilDiv(sMadK_, GetHwK0()); + uint32_t nFraC0 = CeilDiv(sMadN_, HW_N0); + uint64_t kC0 = sMad0K_ / GetHwK0(); + uint64_t kLoop; + if constexpr (noTail) { + kLoop = 1; + } else { + kLoop = sMadK_ / sMad0K_; // loop times of sMad0K_ + } + uint64_t kC0Tail = madKC0 - kLoop * kC0; // tail block loop times, unit is 16 + uint64_t kTail; + if constexpr (noTail) { + kTail = 0; + } else { + kTail = sMadK_ - kLoop * sMad0K_; + } + + // m db + if constexpr (scheduleType == ScheduleType::OUTER_PRODUCT && iterateOrder == IterateOrder::ORDER_N) { + ComputeWithMdb(l1AMatrix, l1BMatrix, cMatrix, bias, kC0Tail, kTail, sMadMStep); + return; + } + // n db + if constexpr (scheduleType == ScheduleType::OUTER_PRODUCT && iterateOrder == IterateOrder::ORDER_M) { + ComputeWithNdb(l1AMatrix, l1BMatrix, cMatrix, bias, kC0Tail, kTail, sMadNStep); + return; + } + + if (ssAmatrixTranspose_ > 0) { + if (mode_ == F322F32) { + kDirectionAlign_ = 1; + } + uint16_t wAlign = CeilAlign(sAL1K_, HW_M0); + Load3DSetFMatrixCal(sFmH_, wAlign, padList); + } else { + // fmatrix w should be 16 aligned + uint16_t wAlign = CeilAlign(sAL1M_, HW_M0); + Load3DSetFMatrixCal(sFmH_, wAlign, padList); + } + + if (ssBmatrixTranspose_ < 1) { + uint16_t wAlign = CeilAlign(sBL1K_, HW_M0); + Load3DSetFMatrixBCal(sFmH_, wAlign, padList); + } + + if constexpr (!noBias) { + if ((biasType_) && (sL0cInit_) && (ssBiasFull_ == 0)) { + WaitFlag(2); + uint16_t lenBurst = (sMadN_ * biasType_ * 2 + 63) / 64; + LocalTensor biasC2; + biasC2 = biasBuf_.Get(); + DataCopy(biasC2, bias[sL1BiasOffset_ * biasType_ * 2], {1, lenBurst, 0, 0}); + SetFlag(2); + WaitFlag(2); + } + } + + LocalTensor l0a; + LocalTensor l0b; + for (uint64_t k_inner = 0; k_inner < kLoop; k_inner++) { + l0a = l0aBuf_.Get(); + l0b = l0bBuf_.Get(); + if constexpr(intraBlockPartSum) { + if ((subIdx) != 0) { + l0b = l0b[L0BUF_SIZE / 2 / sizeof(B_T)]; + } + } + // load L0A + uint64_t aPoskPtr = k_inner * kC0 * GetHwK0() + sAL1KOffset_; + LoadL12L0ACache(posA, k_inner, aPoskPtr, sMad0K_, l1AMatrix, l0a); + ++posA; + // load L0B + LoadL12L0BCache(posB, k_inner, offsetb, sMad0K_, l1BMatrix, l0b); + ++posB; + SetFlag(ssAl0PingPongFlag_); + WaitFlag(ssAl0PingPongFlag_); + // MAD + bool l0c_initial = (k_inner == 0) && (sL0cInit_); + uint8_t unitFlag = 0; + if constexpr (UNIFLAG_EN) { + if constexpr (intraBlockPartSum) { + if (subIdx == 1) { + unitFlag = ((k_inner == (kLoop - 1)) && (sL0cLast_) && (kTail == 0)) ? 3 : 2; + } + } else { + unitFlag = ((k_inner == (kLoop - 1)) && (sL0cLast_) && (kTail == 0)) ? 
3 : 2; + } + } + MmadMacro(l0a, l0b, cMatrix, sMad0K_, unitFlag, l0c_initial); + SetFlag(ssAl0PingPongFlag_); + if constexpr (!noBias) { + if ((biasType_) && (l0c_initial) && (ssBiasFull_ == 0)) { + SetFlag(2); + } + } + } + // k tail + if constexpr (!noTail) { + if (kTail != 0) { + + l0a = l0aBuf_.Get(); + l0b = l0bBuf_.Get(); + uint16_t tailK = kC0Tail * GetHwK0(); + uint64_t aPoskPtr = kLoop * kC0 * GetHwK0() + sAL1KOffset_; + // load L0A + LoadL12L0ACache(posA, kLoop, aPoskPtr, tailK, l1AMatrix, l0a); + // load L0B + LoadL12L0BCache(posB, kLoop, offsetb, tailK, l1BMatrix, l0b); + SetFlag(EVENT_ID0); + WaitFlag(EVENT_ID0); + // MAD + bool l0c_initial = (kLoop == 0) && (sL0cInit_); + uint8_t unitFlag = 0; + if constexpr (UNIFLAG_EN) { + if constexpr (intraBlockPartSum) { + if (subIdx == 1) { + unitFlag = ((sL0cLast_)) ? 3 : 2; + } + } else { + unitFlag = sL0cLast_ ? 3 : 2; + } + } + MmadMacro(l0a, l0b, cMatrix, kTail, unitFlag, l0c_initial); + SetFlag(ssAl0PingPongFlag_); + if constexpr (!noBias) { + if ((biasType_) && (l0c_initial) && (ssBiasFull_ == 0)) { + SetFlag(2); + } + } + } + } + if constexpr (!noBias) { + if ((biasType_) && (sL0cLast_)) { + ssBiasPingPongFlag_ += 1 - ssBiasFull_; + } + } + +} + +template +template +inline __aicore__ void MacroMatmul>::ComputeWithMdb( + const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, const LocalTensor &cMatrix, + const LocalTensor &bias, uint64_t kC0Tail, uint64_t kTail, uint16_t sMadMStep) +{ + if (ssAmatrixTranspose_ > 0) { + if (mode_ == F322F32) { + kDirectionAlign_ = 1; + } + uint16_t wAlign = CeilAlign(sAL1K_, HW_M0); + Load3DSetFMatrixCal(sFmH_, wAlign, padList); + } else { + // fmatrix w should be 16 aligned + uint16_t wAlign = CeilAlign(sAL1M_, HW_M0); + Load3DSetFMatrixCal(sFmH_, wAlign, padList); + } + + if (ssBmatrixTranspose_ < 1) { + uint16_t wAlign = CeilAlign(sBL1K_, HW_M0); + Load3DSetFMatrixBCal(sFmH_, wAlign, padList); + } + + uint16_t usedK = sMad0K_; + uint16_t mmadK = sMad0K_; + // tail k + if (kTail != 0) { + usedK = kC0Tail * GetHwK0(); + mmadK = kTail; + } + + if constexpr (!noBias) { + if ((biasType_) && (sL0cInit_) && (ssBiasFull_ == 0)) { + WaitFlag(2); + uint16_t lenBurst = (sMadN_ * biasType_ * 2 + 63) / 64; + LocalTensor biasC2; + biasC2 = biasBuf_.Get(); + DataCopy(biasC2, bias[sL1BiasOffset_ * biasType_ * 2], {1, lenBurst, 0, 0}); + SetFlag(2); + WaitFlag(2); + } + } + + LocalTensor l0a; + LocalTensor l0b; + l0b = l0bBuf_.Get(); + // load L0B + LoadL12L0B(0, usedK, l1BMatrix, l0b); + + uint64_t mLoop = sMadMStep / sMadM_; + uint64_t mTail = sMadMStep - mLoop * sMadM_; + // m tail -> mLoop = 1 + mLoop = (mTail == 0) ? mLoop : 1; + + uint32_t l0cOffset = 0; + for (uint64_t m_inner = 0; m_inner < mLoop; m_inner++) { + l0a = l0aBuf_.Get(); + if (mLoop > 1 && (ssAl0PingPongFlag_ & 0x1) != 0) { + if constexpr (IsSameType::value) { + l0a = l0a[L0AUF_SIZE / sizeof(A_T)]; + } else { + l0a = l0a[L0AUF_SIZE / 2 / sizeof(A_T)]; + } + sAL1MOffset_ += sMadM_; + l0cOffset = CeilAlign(sMadM_, HW_M0) * CeilAlign(sMadN_, HW_N0); + } + WaitFlag(ssAl0PingPongFlag_ & 0x1); + // load L0A + LoadL12L0A(0, sAL1KOffset_, usedK, l1AMatrix, l0a); + SetFlag(ssAl0PingPongFlag_ & 0x1); + WaitFlag(ssAl0PingPongFlag_ & 0x1); + // MAD + bool l0c_initial = sL0cInit_; + uint8_t unitFlag = 0; + if (UNIFLAG_EN) { + unitFlag = (sL0cLast_) ? 
3 : 2; + } + MmadMacro(l0a, l0b, cMatrix[l0cOffset], mmadK, unitFlag, l0c_initial); + SetFlag(ssAl0PingPongFlag_ & 0x1); + if constexpr (!noBias) { + if ((biasType_) && (l0c_initial) && (ssBiasFull_ == 0)) { + SetFlag(2); + } + } + // update pingpong flag + ssAl0PingPongFlag_ += useL0PingPong_; + ssBl0PingPongFlag_ += useL0PingPong_; + } + if constexpr (!noBias) { + if ((biasType_) && (sL0cLast_)) { + ssBiasPingPongFlag_ += 1 - ssBiasFull_; + } + } +} + +template +template +inline __aicore__ void MacroMatmul>::ComputeWithNdb( + const LocalTensor &l1AMatrix, const LocalTensor &l1BMatrix, const LocalTensor &cMatrix, + const LocalTensor &bias, uint64_t kC0Tail, uint64_t kTail, uint16_t sMadNStep) +{ + if (ssAmatrixTranspose_ > 0) { + if (mode_ == F322F32) { + kDirectionAlign_ = 1; + } + uint16_t wAlign = CeilAlign(sAL1K_, HW_M0); + Load3DSetFMatrixCal(sFmH_, wAlign, padList); + } else { + // fmatrix w should be 16 aligned + uint16_t wAlign = CeilAlign(sAL1M_, HW_M0); + Load3DSetFMatrixCal(sFmH_, wAlign, padList); + } + + if (ssBmatrixTranspose_ < 1) { + uint16_t wAlign = CeilAlign(sBL1K_, HW_M0); + Load3DSetFMatrixBCal(sFmH_, wAlign, padList); + } + + uint16_t usedK = sMad0K_; + uint16_t mmadK = sMad0K_; + // tail k + if (kTail != 0) { + usedK = kC0Tail * GetHwK0(); + mmadK = kTail; + } + + LocalTensor l0a; + LocalTensor l0b; + l0a = l0aBuf_.Get(); + // load L0A + LoadL12L0A(0, sAL1KOffset_, usedK, l1AMatrix, l0a); + + uint64_t nLoop = sMadNStep / sMadN_; + uint64_t nTail = sMadNStep - nLoop * sMadN_; + // n tail -> nLoop = 1 + nLoop = (nTail == 0) ? nLoop : 1; + + uint32_t l0cOffset = 0; + uint32_t biasOffset = 0; + for (uint64_t n_inner = 0; n_inner < nLoop; n_inner++) { + l0b = l0bBuf_.Get(); + if (nLoop > 1 && (ssAl0PingPongFlag_ & 0x1) != 0) { + if constexpr (IsSameType::value) { + l0b = l0b[L0BUF_SIZE / sizeof(B_T)]; + } else { + l0b = l0b[L0BUF_SIZE / 2 / sizeof(B_T)]; + } + sBL1NOffset_ += sMadN_; + biasOffset += sMadN_; + l0cOffset = CeilAlign(sMadM_, HW_M0) * CeilAlign(sMadN_, HW_N0); + } + + WaitFlag(ssAl0PingPongFlag_ & 0x1); + // load L0B + LoadL12L0B(0, usedK, l1BMatrix, l0b); + SetFlag(ssAl0PingPongFlag_ & 0x1); + WaitFlag(ssAl0PingPongFlag_ & 0x1); + // MAD + bool l0c_initial = sL0cInit_; + // load bias + if constexpr (!noBias) { + if ((biasType_) && (sL0cInit_) && (ssBiasFull_ == 0)) { + WaitFlag(2); + uint16_t lenBurst = (sMadN_ * biasType_ * 2 + 63) / 64; + LocalTensor biasC2; + biasC2 = biasBuf_.Get(); + DataCopy(biasC2, bias[sL1BiasOffset_ * biasType_ * 2 + biasOffset], {1, lenBurst, 0, 0}); + SetFlag(2); + WaitFlag(2); + } + } + + uint8_t unitFlag = 0; + if (UNIFLAG_EN) { + unitFlag = sL0cLast_ ? 
3 : 2; + } + MmadMacro(l0a, l0b, cMatrix[l0cOffset], mmadK, unitFlag, l0c_initial); + SetFlag(ssAl0PingPongFlag_ & 0x1); + if constexpr (!noBias) { + if ((biasType_) && (l0c_initial) && (ssBiasFull_ == 0)) { + SetFlag(2); + } + } + // update pingpong flag + ssAl0PingPongFlag_ += useL0PingPong_; + ssBl0PingPongFlag_ += useL0PingPong_; + } + if constexpr (!noBias) { + if ((biasType_) && (sL0cLast_)) { + ssBiasPingPongFlag_ += 1 - ssBiasFull_; + } + } +} + +} // namespace Gemm +#endif \ No newline at end of file diff --git a/impl/matmul/matmul_server.h b/impl/matmul/matmul_server.h index ccd6aea08c5c5417139a3c811e657dff3941c0a5..36b4a06ad0e974b51676741a8d1203069e10adf0 100644 --- a/impl/matmul/matmul_server.h +++ b/impl/matmul/matmul_server.h @@ -18,7 +18,7 @@ #include "../../lib/matmul/matmul.h" #include "kernel_operator.h" -namespace matmul { +namespace Gemm { constexpr uint16_t WORKSPACE_SYNC_ID = 15; using namespace AscendC; template struct IBShareCache { @@ -34,7 +34,7 @@ struct IBShareCache { template <> struct IBShareCache { __aicore__ inline IBShareCache() {}; - using ShareCache = GlobalCache; + using ShareCache = Impl::Detail::GlobalCache; }; template __aicore__ constexpr bool IsIBShare() { @@ -889,11 +889,11 @@ private: template __aicore__ inline constexpr bool IsSharedMatmul() { - return !matmul::ToMatmulConfig(MM_CFG).enableInit; + return !Gemm::ToMatmulConfig(MM_CFG).enableInit; } template , - MATMUL_POLICY_DEFAULT_OF(matmul::MatmulPolicy)> + const auto& MM_CFG = CFG_NORM, class MM_CB = Gemm::MatmulCallBackFunc, + MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> struct MatmulInstBase { __aicore__ inline MatmulInstBase(){}; }; @@ -901,13 +901,13 @@ template struct MatmulInstShared : MatmulInstBase { __aicore__ inline MatmulInstShared(){}; - matmul::MatmulService cubeObj[1]; + Gemm::MatmulService cubeObj[1]; }; template struct MatmulInst : MatmulInstBase { __aicore__ inline MatmulInst(){}; - matmul::MatmulService cubeObj[MIX_NUM]; + Gemm::MatmulService cubeObj[MIX_NUM]; }; template , template, MATMUL_POLICY_DEFAULT_OF(matmul::MatmulPolicy)> +class MM_CB = Gemm::MatmulCallBackFunc, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> class MatmulServiceAuxBase { using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; @@ -1062,5 +1062,9 @@ class MatmulServiceAux>>>>>>> 5b5aaa4 (!216 rename matmul namespace) #endif // __MATMUL_SERVER_H__ \ No newline at end of file diff --git a/impl/matmul/matmul_utils.h b/impl/matmul/matmul_utils.h index e46937b600716df88a86abc86b4ba5b5d8021d3c..209591cc0df2e71abeee582a64729d4de608fb0c 100644 --- a/impl/matmul/matmul_utils.h +++ b/impl/matmul/matmul_utils.h @@ -11,8 +11,7 @@ #ifndef IMPL_MATMUL_MATMUL_UTILS_H #define IMPL_MATMUL_MATMUL_UTILS_H - -namespace matmul { +namespace Gemm { using namespace AscendC; template __aicore__ inline constexpr int32_t GetC0Size() @@ -678,5 +677,6 @@ __aicore__ inline T CeilAlign(T num1, T num2) ASSERT(num2 > 0); return Ceil(num1, num2) * num2; } -} // namespace matmul -#endif \ No newline at end of file + +} // namespace Gemm +#endif diff --git a/impl/matmul/modules/context/context.h b/impl/matmul/modules/context/context.h index b0ae83aa72267564734d659e8c31cd63d022a566..bc4ae71b0ced2511b8a572d0644c6244040627ff 100644 --- a/impl/matmul/modules/context/context.h +++ b/impl/matmul/modules/context/context.h @@ -1,30 +1,39 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
- * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file context.h - * \brief - */ - - -#ifndef IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H -#define IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H - -namespace matmul { -template -class MatmulContext -{ -public: - __aicore__ inline MatmulContext() = default; - __aicore__ inline ~MatmulContext() = default; -}; - -} +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file context.h + * \brief + */ + + +#ifndef IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H +#define IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + MatmulContext is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee stability. + MatmulContext is for internal use only and does not support extension or customized specialization! 
+*/ +template +class MatmulContext +{ +public: + __aicore__ inline MatmulContext() = default; + __aicore__ inline ~MatmulContext() = default; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif //IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H \ No newline at end of file diff --git a/impl/matmul/modules/dfx/dfx_config.h b/impl/matmul/modules/dfx/dfx_config.h index 2d77bc8282933b0f4b6392e17c75325a5629ef5a..33b0efdc10298a237cccb15c698fe63cde06dde5 100644 --- a/impl/matmul/modules/dfx/dfx_config.h +++ b/impl/matmul/modules/dfx/dfx_config.h @@ -19,10 +19,14 @@ #include "handlers/dfx_chain_handler.h" #include "dfx_func_info.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { struct DfxConfig { static constexpr bool ENABLE = false; using EnabledHandlers = DfxChainHandler <>; }; -} +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif diff --git a/impl/matmul/modules/dfx/dfx_func_info.h b/impl/matmul/modules/dfx/dfx_func_info.h index be4e9c273b629f7bf93e4e2a315beaee831e29fe..55cc0e852bd242d2686482c06b4445a516f03cd6 100644 --- a/impl/matmul/modules/dfx/dfx_func_info.h +++ b/impl/matmul/modules/dfx/dfx_func_info.h @@ -16,7 +16,9 @@ #ifndef MATMUL_DFX_FUNC_INFO_H #define MATMUL_DFX_FUNC_INFO_H -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { struct DfxFuncInfo { __aicore__ inline DfxFuncInfo(__gm__ const char* module, __gm__ const char* func, uint32_t funcId) :module(module), func(func), funcId(funcId) { @@ -25,5 +27,7 @@ struct DfxFuncInfo { __gm__ const char* func; uint32_t funcId; }; -} +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif diff --git a/impl/matmul/modules/dfx/dfx_handler.h b/impl/matmul/modules/dfx/dfx_handler.h index 81772575226d8c8b73d375196229c110b952bf23..92ac062b108c5e848546d0bd8b6c30423c39cf03 100644 --- a/impl/matmul/modules/dfx/dfx_handler.h +++ b/impl/matmul/modules/dfx/dfx_handler.h @@ -18,7 +18,9 @@ #include "dfx_config.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { struct DfxHandler { template @@ -36,6 +38,7 @@ struct DfxHandler { } }; -} - +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif diff --git a/impl/matmul/modules/dfx/dfx_proxy.h b/impl/matmul/modules/dfx/dfx_proxy.h index 860a5bd5a5d8f8d72b14e452e1f10092b432fcbf..855da1b8b6385c7a7cad6dd0ab718385215f148e 100644 --- a/impl/matmul/modules/dfx/dfx_proxy.h +++ b/impl/matmul/modules/dfx/dfx_proxy.h @@ -19,11 +19,14 @@ #include #include "dfx_handler.h" -namespace matmul { +namespace Gemm { template using enable_if_t = typename std::enable_if::type; +namespace Impl { +namespace Detail { + template constexpr bool is_void_v = std::is_void::value; @@ -167,6 +170,7 @@ private: \ FuncProxy proxy; \ } -} - +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif diff --git a/impl/matmul/modules/dfx/dfx_registry.h b/impl/matmul/modules/dfx/dfx_registry.h index 1f8b9a3f62354798ac9e08db841b51297633e062..9cea2a72b30f81d36c70a1eeea3d87dfcdfc3a65 100644 --- a/impl/matmul/modules/dfx/dfx_registry.h +++ b/impl/matmul/modules/dfx/dfx_registry.h @@ -19,8 +19,11 @@ #include "dfx_proxy.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { MATMUL_DFX_PROXY_REGISTER(InputL1Cache, ClearAL1Cache, ClearBL1Cache); -} - +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif diff --git a/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h b/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h index 
1ac51fa1f5b5c34aacafb374139dc9a177111bd0..6be4698bca2d0da859d3b353317814b54439eed3 100644 --- a/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h +++ b/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h @@ -16,7 +16,9 @@ #ifndef MATMUL_DFX_CHAIN_HANDLER_H #define MATMUL_DFX_CHAIN_HANDLER_H -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { struct DfxFuncInfo; @@ -37,6 +39,7 @@ struct DfxChainHandler { } }; -} - +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif diff --git a/impl/matmul/modules/feature_trait/matmul_chip_cap.h b/impl/matmul/modules/feature_trait/matmul_chip_cap.h index f54df544d87c4ac22bef38f9f414481edc5adf40..038c2296a7d27e09747fdee5e32cbd9a319920b9 100644 --- a/impl/matmul/modules/feature_trait/matmul_chip_cap.h +++ b/impl/matmul/modules/feature_trait/matmul_chip_cap.h @@ -1,87 +1,84 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! -* \file matmul_chip_cap.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H -#define IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H - -namespace matmul { - -enum class CubeOutType: int8_t { - DATACOPY, - FIXPIPE, - NONE -}; - -enum class FixpipeParamsType: int8_t { - V220, - V300, - V310, - NONE -}; - -class MatmulChipCap -{ -public: - struct Feature { - bool supportUnitFlag; - bool ifNeedUB; - bool ifSupportUBToL1; - CubeOutType cubeOutType; - FixpipeParamsType fixpipeParamsType; - }; - - __aicore__ constexpr static const Feature& GetFeatures() - { - return features[GetChipType()]; - } - -private: - enum { - CHIP_TYPE_100, - CHIP_TYPE_200, - CHIP_TYPE_220, - CHIP_TYPE_300, - CHIP_TYPE_310, - CHIP_TYPE_MAX, - }; - - __aicore__ inline constexpr static uint8_t GetChipType() - { - #if __CCE_AICORE__ == 100 - return CHIP_TYPE_100; - #elif __CCE_AICORE__ == 200 - return CHIP_TYPE_200; - #elif __CCE_AICORE__ == 220 - return CHIP_TYPE_220; - #elif __CCE_AICORE__ == 300 - return CHIP_TYPE_300; - #elif __CCE_AICORE__ == 310 - return CHIP_TYPE_310; - #else - return CHIP_TYPE_MAX; - #endif - } - -private: - constexpr static Feature features[CHIP_TYPE_MAX] = { - /* supportUnitFlag, ifNeedUB, ifSupportUBToL1, cubeOutType, fixpipeParamsType */ - /*100*/ {false, true, true, CubeOutType::DATACOPY, FixpipeParamsType::NONE}, - /*200*/ {false, true, true, CubeOutType::DATACOPY, FixpipeParamsType::NONE}, - /*220*/ {true, false, false, CubeOutType::FIXPIPE, FixpipeParamsType::V220}, - /*300*/ {true, false, true, CubeOutType::FIXPIPE, FixpipeParamsType::V300}, - /*310*/ {true, false, false, CubeOutType::FIXPIPE, FixpipeParamsType::V310}}; -}; - -} -#endif // _MATMUL_CHIP_CAP_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! +* \file matmul_chip_cap.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H +#define IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H + +namespace Gemm { +namespace Impl { +namespace Detail { + +enum class FixpipeParamsType: int8_t { + V220, + V300, + V310, + NONE +}; + +class MatmulChipCap +{ +public: + struct Feature { + bool supportUnitFlag; + bool ifNeedUB; + bool ifSupportUBToL1; + FixpipeParamsType fixpipeParamsType; + }; + + __aicore__ constexpr static const Feature& GetFeatures() + { + return features[GetChipType()]; + } + +private: + enum { + CHIP_TYPE_100, + CHIP_TYPE_200, + CHIP_TYPE_220, + CHIP_TYPE_300, + CHIP_TYPE_310, + CHIP_TYPE_MAX, + }; + + __aicore__ inline constexpr static uint8_t GetChipType() + { + #if __CCE_AICORE__ == 100 + return CHIP_TYPE_100; + #elif __CCE_AICORE__ == 200 + return CHIP_TYPE_200; + #elif __CCE_AICORE__ == 220 + return CHIP_TYPE_220; + #elif __CCE_AICORE__ == 300 + return CHIP_TYPE_300; + #elif __CCE_AICORE__ == 310 + return CHIP_TYPE_310; + #else + return CHIP_TYPE_MAX; + #endif + } + +private: + constexpr static Feature features[CHIP_TYPE_MAX] = { + /* supportUnitFlag, ifNeedUB, ifSupportUBToL1, fixpipeParamsType */ + /*100*/ {false, true, true, FixpipeParamsType::NONE}, + /*200*/ {false, true, true, FixpipeParamsType::NONE}, + /*220*/ {true, false, false, FixpipeParamsType::V220}, + /*300*/ {true, false, true, FixpipeParamsType::V300}, + /*310*/ {true, false, false, FixpipeParamsType::V310}}; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _MATMUL_CHIP_CAP_H_ diff --git a/impl/matmul/modules/feature_trait/matmul_feature_trait.h b/impl/matmul/modules/feature_trait/matmul_feature_trait.h index 4220b2e7a83cfa621519d29da31d4027bb9fb8aa..25a274389fabcb0b143b5764ee53370ec9d9faea 100644 --- a/impl/matmul/modules/feature_trait/matmul_feature_trait.h +++ b/impl/matmul/modules/feature_trait/matmul_feature_trait.h @@ -1,60 +1,59 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! 
-* \file matmul_feature.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H -#define IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H - -#include "../../matmul_utils.h" -#include "matmul_chip_cap.h" -#include "matmul_iter_ctrl_cfg.h" - -namespace matmul { - -template -class MatmulFeatureTrait { -public: - static constexpr MatmulIterCtrlCfg iterCtrlCfg { - .isFixedStep = DoMatmulSpecialBasicBlock(MM_CFG), - .stepM = ToMatmulConfig(MM_CFG).stepM, - .stepN = ToMatmulConfig(MM_CFG).stepN, - .iterOrder = IterateOrder::UNDEF, - }; - - __aicore__ inline constexpr static bool IsUnitFlagEnabled() - { - return EnUnitFlag(MM_CFG) && MatmulChipCap::GetFeatures().supportUnitFlag; - } - - __aicore__ inline constexpr static bool IsNeedUB() - { - return MatmulChipCap::GetFeatures().ifNeedUB; - } - - __aicore__ inline constexpr static bool IsSupportUBToL1() - { - return MatmulChipCap::GetFeatures().ifSupportUBToL1; - } - - __aicore__ inline constexpr static CubeOutType GetCubeOutType() - { - return MatmulChipCap::GetFeatures().cubeOutType; - } - - __aicore__ inline constexpr static FixpipeParamsType GetFixpipeParamsType() - { - return MatmulChipCap::GetFeatures().fixpipeParamsType; - } -}; -} -#endif // _MATMUL_FEATURE_TRAIT_H_ \ No newline at end of file +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_feature.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H +#define IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H + +#include "../../matmul_utils.h" +#include "matmul_chip_cap.h" +#include "matmul_iter_ctrl_cfg.h" + +namespace Gemm { +namespace Impl { +namespace Detail { + +template +class MatmulFeatureTrait { +public: + static constexpr MatmulIterCtrlCfg iterCtrlCfg { + .isFixedStep = DoMatmulSpecialBasicBlock(MM_CFG), + .stepM = ToMatmulConfig(MM_CFG).stepM, + .stepN = ToMatmulConfig(MM_CFG).stepN, + .iterOrder = IterateOrder::UNDEF, + }; + + __aicore__ inline constexpr static bool IsUnitFlagEnabled() + { + return EnUnitFlag(MM_CFG) && MatmulChipCap::GetFeatures().supportUnitFlag; + } + + __aicore__ inline constexpr static bool IsNeedUB() + { + return MatmulChipCap::GetFeatures().ifNeedUB; + } + + __aicore__ inline constexpr static bool IsSupportUBToL1() + { + return MatmulChipCap::GetFeatures().ifSupportUBToL1; + } + + __aicore__ inline constexpr static FixpipeParamsType GetFixpipeParamsType() + { + return MatmulChipCap::GetFeatures().fixpipeParamsType; + } +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _MATMUL_FEATURE_TRAIT_H_ diff --git a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h index e7856fc9cfa81394a5c510ddaa3b51ec7424fac4..25680363198491039e2af16ab038adaf61a8c603 100644 --- a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h +++ b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h @@ -1,31 +1,35 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. 
-* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! -* \file matmul_iter_ctrl_cfg.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H -#define IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H - -#include "../../../../lib/matmul/tiling.h" -#include "../../../../lib/matmul/constant_tiling.h" - -namespace matmul { - -struct MatmulIterCtrlCfg { - bool isFixedStep; - int32_t stepM; - int32_t stepN; - IterateOrder iterOrder; -}; - -} +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_iter_ctrl_cfg.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H +#define IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H + +#include "../../../../lib/matmul/tiling.h" +#include "../../../../lib/matmul/constant_tiling.h" + +namespace Gemm { +namespace Impl { +namespace Detail { + +struct MatmulIterCtrlCfg { + bool isFixedStep; + int32_t stepM; + int32_t stepN; + IterateOrder iterOrder; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // _MATMUL_ITER_CTRL_CFG_H_ \ No newline at end of file diff --git a/impl/matmul/modules/iterator/matmul_iterate_controller.h b/impl/matmul/modules/iterator/matmul_iterate_controller.h index 58ba896cf2f0c8640636fd8964a5707fc8d92b78..da4cd7c7fe43f42a1d53c997135fae1de3908846 100644 --- a/impl/matmul/modules/iterator/matmul_iterate_controller.h +++ b/impl/matmul/modules/iterator/matmul_iterate_controller.h @@ -20,7 +20,9 @@ #include "../../matmul_utils.h" #include "../feature_trait/matmul_iter_ctrl_cfg.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { template class MatmulIterateController @@ -146,6 +148,7 @@ private: } }; -} - +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif diff --git a/impl/matmul/modules/matmul_local_workspace.h b/impl/matmul/modules/matmul_local_workspace.h new file mode 100644 index 0000000000000000000000000000000000000000..c680ffc89bc67cc4fad26a7dc67cad4feb40c6a8 --- /dev/null +++ b/impl/matmul/modules/matmul_local_workspace.h @@ -0,0 +1,297 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. 
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_local_workspace.h +* \brief matmul local workspace manager +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_LOCAL_WORKSPACE_H +#define IMPL_MATMUL_MODULES_MATMUL_LOCAL_WORKSPACE_H + +#include "matmul_module.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +constexpr int32_t ENVEC_UBUNREUSE_COEFF = 2; +constexpr int32_t ENVEC_NZ2ND_COEFF = 3; +constexpr int32_t ENABLE_TRANS_COEFF = 2; +constexpr int32_t UNABLE_TRANS_COEFF = 4; +template +class MatmulLocalWorkspace { + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + +public: + __aicore__ inline MatmulLocalWorkspace() {} + __aicore__ inline ~MatmulLocalWorkspace() {} + + /** + * @description: Init of MatmulLocalWorkspace + * @param: localBuffer: Local address input through SetLocalWorkspace + * @return: void + */ + __aicore__ inline void Init(const LocalTensor& localBuffer) + { + SetWorkspace(localBuffer); + } + + /** + * @description: Get workspace with offset when enVecND2NZ is enable + * @param: offset: the offset of localworkspace + * @return: LocalWorkspace + */ + __aicore__ inline LocalTensor GetND2NZWorkspace(int32_t offset = 0) const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + return localWorkspace_[offset]; + } + + /** + * @description: Get transLength or 3 * transLength workspace when enVecND2NZ is enable + * @return: LocalWorkspace + */ + __aicore__ inline LocalTensor GetNZ2NDWorkspace() const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { + return localWorkspace_[MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() * ENVEC_NZ2ND_COEFF]; + } else { + return localWorkspace_[MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()]; + } + } + + /** + * @description: Get 0 or 2 * transLength workspace when enVecND2NZ is enable + * @return: LocalWorkspace + */ + __aicore__ inline LocalTensor GetCopy2Co2Workspace() const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { + return localWorkspace_[MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() * ENVEC_UBUNREUSE_COEFF]; + } else { + return localWorkspace_[0]; + } + } + + /** + * @description: Get 0 or 2 * transLength workspace with offset when enVecND2NZ is enable + * @param: offset: the offset of localworkspace + * @param: enableUBReuse: the flag of enable UB reuse + * @return: LocalWorkspace + */ + template + __aicore__ inline LocalTensor GetWorkspaceWithOffset(int32_t offset = 0) const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + if constexpr (!enableUBReuse) { + return 
localWorkspace_[MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() * ENVEC_UBUNREUSE_COEFF + offset]; + } else { + return localWorkspace_[offset]; + } + } + +private: + __aicore__ inline void SetWorkspace(const LocalTensor& localBuffer) + { + localWorkspace_ = localBuffer; + __ubuf__ uint8_t *addr = (__ubuf__ uint8_t *)localBuffer.GetPhyAddr(); + ASCENDC_ASSERT((addr != nullptr), { KERNEL_LOG(KERNEL_ERROR, "addr can not be nullptr"); }); + + int32_t totalTransLen = 0; + if constexpr (A_TYPE::format == CubeFormat::ND || B_TYPE::format == CubeFormat::ND || !PhyPosIsUB(C_TYPE::pos)) { + if constexpr (ToMatmulConfig(MM_CFG).enableUBReuse) { + totalTransLen = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() * ENABLE_TRANS_COEFF; + } else { + totalTransLen = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() * UNABLE_TRANS_COEFF; + } + } + + int32_t biasLen = 0; + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias() && BIAS_TYPE::pos != TPosition::VECCALC) { + biasLen = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * sizeof(BiasT); + } + + ASSERT(localBuffer.GetSize() >= (totalTransLen > biasLen ? totalTransLen : biasLen)); + } + +private: + LocalTensor localWorkspace_; +}; + +template +class MatmulLocalWorkspace> { + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + +public: + __aicore__ inline MatmulLocalWorkspace() {} + __aicore__ inline ~MatmulLocalWorkspace() {} + + /** + * @description: Init of MatmulLocalWorkspace + * @param: localBuffer: Local address input through SetLocalWorkspace + * @return: void + */ + __aicore__ inline void Init(const LocalTensor& localBuffer) + { + SetWorkspace(localBuffer); + } + + /** + * @description: Get workspace with nd2nzOffset + offset when enVecND2NZ is unable + * @param: offset: the offset of localworkspace + * @param: enableUBReuse: the flag of enable UB reuse + * @return: LocalWorkspace + */ + template + __aicore__ inline LocalTensor GetND2NZWorkspace(int32_t offset = 0) const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + return localWorkspace_[nd2nzOffset_ + offset]; + } + + /** + * @description: Get workspace with nz2ndOffset when enVecND2NZ is unable + * @return: LocalWorkspace + */ + __aicore__ inline LocalTensor GetNZ2NDWorkspace() const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + return localWorkspace_[nz2ndOffset_]; + } + + /** + * @description: Get workspace with co2Offset when enVecND2NZ is unable + * @return: LocalWorkspace + */ + __aicore__ inline LocalTensor GetCopy2Co2Workspace() const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + return localWorkspace_[co2Offset_]; + } + + /** + * @description: Get workspace with offset when enVecND2NZ is unable + * @param: offset: the offset of localworkspace + * @param: enableUBReuse: the flag of enable UB reuse + * @return: LocalWorkspace + */ + template + __aicore__ inline LocalTensor GetWorkspaceWithOffset(int32_t offset = 0) const + { + ASCENDC_ASSERT(((__ubuf__ uint8_t *)localWorkspace_.GetPhyAddr() != nullptr), + { 
KERNEL_LOG(KERNEL_ERROR, "Ub workspace is nullptr, which should be given."); }); + return localWorkspace_[offset]; + } + +private: + __aicore__ inline int32_t GetAUsedSpace() + { + constexpr int32_t c0Size = AuxGetC0Size(); + if constexpr (A_TYPE::pos != TPosition::TSCM) { + if (!MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA() && (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK() % c0Size != 0)) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * ONE_BLK_SIZE; + } else if (MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA() && (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreM() % c0Size != 0)) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK() * ONE_BLK_SIZE; + } + } + return 0; + } + + __aicore__ inline int32_t GetBUsedSpace() + { + constexpr int32_t c0Size = AuxGetC0Size(); + constexpr int32_t bUsedSize = 2; + if constexpr (B_TYPE::pos != TPosition::TSCM) { + if (IsSameType::value && IsSameType::value && + !B_TYPE::isTrans && B_TYPE::format == CubeFormat::ND) { + if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG)) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * bUsedSize; + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * bUsedSize; + } + } else { + if (!MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB() && (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN() % c0Size != 0)) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK() * ONE_BLK_SIZE; + } else if (MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB() && (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK() % c0Size != 0)) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * ONE_BLK_SIZE; + } + } + } + return 0; + } + + __aicore__ inline void SetWorkspace(const LocalTensor& localBuffer) + { + localWorkspace_ = localBuffer; + __ubuf__ uint8_t *addr = (__ubuf__ uint8_t *)localBuffer.GetPhyAddr(); + ASCENDC_ASSERT((addr != nullptr), { KERNEL_LOG(KERNEL_ERROR, "addr can not be nullptr"); }); + + int32_t len = 0; + + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().IsBias() && BIAS_TYPE::pos != TPosition::VECCALC) { + len += MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * sizeof(BiasT); + } + + if constexpr (C_TYPE::pos == TPosition::GM || + (C_TYPE::pos == TPosition::VECCALC && C_TYPE::format != CubeFormat::NZ)) { + co2Offset_ = len; + len += MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * sizeof(DstT); + } + if constexpr (C_TYPE::pos == TPosition::GM) { + constexpr int32_t blockCount = ONE_BLK_SIZE / sizeof(DstT); + if (C_TYPE::format == CubeFormat::ND && MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN() % blockCount != 0) { + nz2ndOffset_ = len; + len += ONE_BLK_SIZE; + } + } + + if constexpr (A_TYPE::format == CubeFormat::ND || B_TYPE::format == CubeFormat::ND) { + nd2nzOffset_ = len; + int32_t aTmp = GetAUsedSpace(); + int32_t bTmp = GetBUsedSpace(); + len += (aTmp >= bTmp) ? 
aTmp : bTmp; + } + + ASSERT(localBuffer.GetSize() >= len); + } + +private: + LocalTensor localWorkspace_; + int32_t nd2nzOffset_ = 0; + int32_t nz2ndOffset_ = 0; + int32_t co2Offset_ = 0; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_MATMUL_LOCAL_WORKSPACE_H \ No newline at end of file diff --git a/impl/matmul/modules/matmul_module.h b/impl/matmul/modules/matmul_module.h index bc708020a3b2544b169140302b76313130dbcc95..05fe771e3dfff1c76cf013ef7da18e8a42f65013 100644 --- a/impl/matmul/modules/matmul_module.h +++ b/impl/matmul/modules/matmul_module.h @@ -19,12 +19,14 @@ #include "dfx/dfx_config.h" /* MatmulModuleBase */ -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { template using void_t = void; // if user define self-implement module, but inherited from base module implemented in matmul, -// child module shoud declare : using BASE_MODULE = matmul::XXXModuleName; +// child module shoud declare : using BASE_MODULE = Gemm::XXXModuleName; struct MatmulNullBase {}; template @@ -37,9 +39,10 @@ struct MatmulModuleBase> { using type = typename MODULE::BASE_MODULE; }; -} - -/* MatmulImpl */ +} // namespace Detail +} // namespace Impl +} // namespace Gemm +/* MatmulImplBase */ #define MATMUL_IMPL__ IMPL #define MATMUL_POLICY__ POLICY @@ -53,23 +56,23 @@ struct MatmulModuleBase> { (static_cast(MATMUL_CAST_TO_CONST_IMPL())) #define MATMUL_CAST_TO_PROXY_OF(NAME) \ -typename matmul::DfxProxy (*MATMUL_CAST_TO_IMPL_OF(NAME)) +typename Gemm::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_IMPL_OF(NAME)) #define MATMUL_CAST_TO_CONST_PROXY_OF(NAME) \ -typename matmul::DfxProxy (*MATMUL_CAST_TO_CONST_IMPL_OF(NAME)) +typename Gemm::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_CONST_IMPL_OF(NAME)) #define MATMUL_MODULE(NAME) cast_to_##NAME() #define MATMUL_USE_MODULE(NAME) \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) { \ - if constexpr (DfxConfig::ENABLE) { \ + if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_PROXY_OF(NAME); \ } else { \ return MATMUL_CAST_TO_IMPL_OF(NAME); \ } \ } \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ - if constexpr (DfxConfig::ENABLE) { \ + if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_CONST_PROXY_OF(NAME); \ } else { \ return MATMUL_CAST_TO_CONST_IMPL_OF(NAME); \ @@ -78,14 +81,14 @@ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ #define MATMUL_USE_MODULE_ON(NAME, ...) 
\ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) { \ - if constexpr (DfxConfig::ENABLE) { \ + if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_PROXY_OF(template NAME<__VA_ARGS__>); \ } else { \ return MATMUL_CAST_TO_IMPL_OF(template NAME<__VA_ARGS__>); \ } \ } \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ - if constexpr (DfxConfig::ENABLE) { \ + if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_CONST_PROXY_OF(template NAME<__VA_ARGS__>);\ } else { \ return MATMUL_CAST_TO_CONST_IMPL_OF(template NAME<__VA_ARGS__>); \ @@ -97,7 +100,7 @@ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ #define MATMUL_POLICY_DEFAULT_OF(DEFAULT) \ template \ - class MATMUL_POLICY = DEFAULT + class MATMUL_POLICY = Gemm::Impl::Detail::DEFAULT #define MATMUL_POLICY_TEMPLATE_OF(NAME) \ template class NAME @@ -115,14 +118,14 @@ MATMUL_POLICY_TEMPLATE::type; \ +friend typename Gemm::Impl::Detail::MatmulModuleBase::type; \ friend NAME #define MATMUL_ALLOW_USING_TEMPLATE(NAME, ...) \ using NAME = typename MATMUL_MODULE_IN_POLICY(template NAME<__VA_ARGS__>) /* Matmul Private Module */ -#define MATMUL_PRIVATE_TEMPLATE MatmulPrivateModules +#define MATMUL_PRIVATE_TEMPLATE Gemm::Impl::Detail::MatmulPrivateModules #define MATMUL_MODULE_IN_PRIVATE(...) \ MATMUL_PRIVATE_TEMPLATE::__VA_ARGS__ diff --git a/impl/matmul/modules/matmul_param.h b/impl/matmul/modules/matmul_param.h index 9357667665d3ce451ec6d26abdd0b03c394f7571..64ccff8c3ad3dbf39c0fbfa5f8be4a9f284e22d1 100644 --- a/impl/matmul/modules/matmul_param.h +++ b/impl/matmul/modules/matmul_param.h @@ -24,7 +24,9 @@ #include "matmul_type_def.h" #include "resource/cube_in_buffer/global_cache.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { /* ************************************************************************************************** * MatmulParamsBase * * ************************************************************************************************* */ @@ -466,6 +468,7 @@ struct MatmulParams; }; -} - -#endif \ No newline at end of file +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif diff --git a/impl/matmul/modules/matmul_policy.h b/impl/matmul/modules/matmul_policy.h index 994ae8c4759da90fbcbef4e8cedd5abe9a56247e..139dfb82f82ff57f9b2b72d77fc3c931b1c38f57 100644 --- a/impl/matmul/modules/matmul_policy.h +++ b/impl/matmul/modules/matmul_policy.h @@ -21,22 +21,28 @@ #include "stage/copy_cube_in/copy_cube_in.h" #include "context/context.h" -namespace matmul { - -template +namespace Gemm { +namespace Impl { +namespace Detail { +/* + MatmulPolicy is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + MatmulPolicy is only for internal usage, does not support extension or customized specialization! 
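    As an illustrative sketch only (MatmulImpl and the aType/bType aliases below are assumed from
    the public front end, not defined in this header), callers never name MatmulPolicy directly;
    it arrives as the default MATMUL_POLICY template argument:
        using aType    = Gemm::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half>;
        using bType    = Gemm::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half>;
        using cType    = Gemm::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float>;
        using biasType = Gemm::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float>;
        Gemm::MatmulImpl<aType, bType, cType, biasType> mm;  // MATMUL_POLICY defaults to MatmulPolicy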
+*/ +template struct MatmulPolicy { public: using L0cT = typename GetDstType::Type; - using Context = matmul::MatmulContext; - using CubeOutBuffer = matmul::CubeOutBuffer; - using CopyCubeOut = matmul::CopyCubeOut::GetCubeOutType()>; - using CopyCubeInA = CopyCubeIn, MM_CFG>; + using Context = MatmulContext; + using CubeOutBuffer = Gemm::Impl::Detail::CubeOutBuffer; + using CopyCubeOut = Gemm::Impl::Detail::CopyCubeOut; + using CopyCubeInA = Gemm::Impl::Detail::CopyCubeIn, MM_CFG>; using CopyCubeInB = CopyCubeIn, MM_CFG>; using CubeInBufferA = CubeInBuffer, MM_CFG>; using CubeInBufferB = CubeInBuffer, MM_CFG>; }; -} -#endif // _MATMUL_POLICY_H_ \ No newline at end of file +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _MATMUL_POLICY_H_ diff --git a/impl/matmul/modules/matmul_private_modules.h b/impl/matmul/modules/matmul_private_modules.h index 1ce6000cac6138cd48868b06c0714cf7277401d9..63e8e512d4e59cbfaad216c2492194b6492ec075 100644 --- a/impl/matmul/modules/matmul_private_modules.h +++ b/impl/matmul/modules/matmul_private_modules.h @@ -29,7 +29,9 @@ #include "matmul_antiquant_processor.h" #include "iterator/matmul_iterate_controller.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { template @@ -43,10 +45,10 @@ struct MatmulPrivateModules { using MatmulVarC = MatmulVar>; using MatmulTensorInfoA = MatmulTensorInfo>; using MatmulTensorInfoB = MatmulTensorInfo>; - using MatmulSubBlockInfo = matmul::MatmulSubBlockInfo; - using MatmulShapeTilingA = MatmulShapeTiling>; - using MatmulShapeTilingB = MatmulShapeTiling>; - using MatmulShapeTilingC = MatmulShapeTiling>; + using MatmulSubBlockInfo = Gemm::Impl::Detail::MatmulSubBlockInfo; + using MatmulShapeTilingA = Gemm::Impl::Detail::MatmulShapeTiling>; + using MatmulShapeTilingB = Gemm::Impl::Detail::MatmulShapeTiling>; + using MatmulShapeTilingC = Gemm::Impl::Detail::MatmulShapeTiling>; using MatmulAntiQuantProcessor = matmul::MatmulAntiQuantProcessor>; using DataCopyUtilsA = DataCopyWrapper>; using DataCopyUtilsB = DataCopyWrapper>; @@ -58,10 +60,10 @@ struct MatmulPrivateModules { using BatchCopyCubeInA = BatchCopyCubeIn, MM_CFG>; using BatchCopyCubeInB = BatchCopyCubeIn, MM_CFG>; using IterateController = - matmul::MatmulIterateController::iterCtrlCfg>; + Gemm::Impl::Detail::::MatmulIterateController::iterCtrlCfg>; using MatmulShapeInfoA = MatmulShapeInfo>; using MatmulShapeInfoB = MatmulShapeInfo>; using MatmulShapeInfoC = MatmulShapeInfo>; }; } -#endif // _MATMUL_PRIVATE_MODULES_H_ \ No newline at end of file +#endif // _MATMUL_PRIVATE_MODULES_H_ diff --git a/impl/matmul/modules/matmul_subblock_info.h b/impl/matmul/modules/matmul_subblock_info.h index 4922616b690a6dff46063826bb9bd24e2072b25c..39937c9e606ca32106e5447d9aded6202244177b 100644 --- a/impl/matmul/modules/matmul_subblock_info.h +++ b/impl/matmul/modules/matmul_subblock_info.h @@ -18,7 +18,9 @@ #include "matmul_module.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { template class MatmulSubBlockInfo { public: @@ -32,5 +34,7 @@ public: return MATMUL_CONST_INTRA_BLOCK.fakeMsg || MATMUL_CONST_PARAM_VAR.subBlockIdx_ == 0; } }; -} // namespace matmul -#endif // IMPL_MATMUL_MODULES_MATMUL_SUBBLOCK_INFO_H \ No newline at end of file +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SUBBLOCK_INFO_H diff --git a/impl/matmul/modules/matmul_type_def.h b/impl/matmul/modules/matmul_type_def.h index 
ac26dc7072c1e52d331806fb9170667f2560f5b7..8957c61aaa1f78ee80f47a43de7eb55eb1ffb666 100644 --- a/impl/matmul/modules/matmul_type_def.h +++ b/impl/matmul/modules/matmul_type_def.h @@ -1,55 +1,55 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! -* \file matmul_type_def.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H -#define IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H - -#include "lib/matmul/tiling.h" - -namespace matmul { -enum class InputTypeTag : uint8_t { - A = 0, - B = 1, - C = 2, -}; -template -struct MatmulType { - constexpr static TPosition pos = POSITION; - constexpr static CubeFormat format = FORMAT; - using T = TYPE; - constexpr static bool isTrans = ISTRANS; - constexpr static LayoutMode layout = LAYOUT; - constexpr static bool ibShare = IBSHARE; -}; - -template -struct MatmulInputAType : INPUT_TYPE { - using TRANS_T = TRANS_TYPE; - constexpr static InputTypeTag TAG = InputTypeTag::A; -}; - -template -struct MatmulInputBType : INPUT_TYPE { - using TRANS_T = TRANS_TYPE; - constexpr static InputTypeTag TAG = InputTypeTag::B; -}; - -template -struct MatmulInputCType : INPUT_TYPE { - using TRANS_T = TRANS_TYPE; - constexpr static InputTypeTag TAG = InputTypeTag::C; -}; -} +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! 
+* \file matmul_type_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H +#define IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H + +#include "lib/matmul/tiling.h" + +namespace Gemm { +enum class InputTypeTag : uint8_t { + A = 0, + B = 1, + C = 2, +}; +template +struct MatmulType { + constexpr static TPosition pos = POSITION; + constexpr static CubeFormat format = FORMAT; + using T = TYPE; + constexpr static bool isTrans = ISTRANS; + constexpr static LayoutMode layout = LAYOUT; + constexpr static bool ibShare = IBSHARE; +}; + +template +struct MatmulInputAType : INPUT_TYPE { + using TRANS_T = TRANS_TYPE; + constexpr static InputTypeTag TAG = InputTypeTag::A; +}; + +template +struct MatmulInputBType : INPUT_TYPE { + using TRANS_T = TRANS_TYPE; + constexpr static InputTypeTag TAG = InputTypeTag::B; +}; + +template +struct MatmulInputCType : INPUT_TYPE { + using TRANS_T = TRANS_TYPE; + constexpr static InputTypeTag TAG = InputTypeTag::C; +}; +} // namespace Gemm #endif // _MATMUL_TYPE_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/modules/param/matmul_shape_info.h b/impl/matmul/modules/param/matmul_shape_info.h new file mode 100644 index 0000000000000000000000000000000000000000..91ae26d0567d06afad7b49b8d7a9187d7148fba5 --- /dev/null +++ b/impl/matmul/modules/param/matmul_shape_info.h @@ -0,0 +1,145 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_shape_info.h + * \brief matmul shape info manager + */ + +#ifndef IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_INFO_H +#define IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_INFO_H + +#include "../matmul_module.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +template +class MatmulShapeInfo { +public: + template + __aicore__ inline bool IsTransposeA() const + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.isTransposeA; + } else { + return MATMUL_CONST_PARAM_VAR.isTransposeA_; + } + } + + template + __aicore__ inline bool IsTransposeB() const + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.isTransposeB; + } else { + return MATMUL_CONST_PARAM_VAR.isTransposeB_; + } + } + + template + __aicore__ inline uint32_t GetOrgM() + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.M; + } else { + return MATMUL_CAST_TO_IMPL()->M_; + } + } + + template + __aicore__ inline uint32_t GetOrgN() + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.N; + } else { + return MATMUL_CAST_TO_IMPL()->N_; + } + } + + template + __aicore__ inline uint32_t GetOrgKa() + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.Ka; + } else { + return MATMUL_CAST_TO_IMPL()->Ka_; + } + } + + template + __aicore__ inline uint32_t GetOrgKb() + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.Kb; + } else { + return MATMUL_CAST_TO_IMPL()->Kb_; + } + } + + template + __aicore__ inline uint32_t GetOrgKc() + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.Kc; + } else { + return MATMUL_CAST_TO_IMPL()->Kc_; + } + } + + template + __aicore__ inline int32_t GetSingleCoreM() const + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.singleCoreM; + } else { + return MATMUL_CONST_PARAM_VAR.singleCoreM_; + } + } + + template + __aicore__ inline int32_t GetSingleCoreN() const + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.singleCoreN; + } else { + return MATMUL_CONST_PARAM_VAR.singleCoreN_; + } + } + + template + __aicore__ inline int32_t GetSingleCoreK() const + { + if constexpr (IS_INTRA_BLOCK) { + return MATMUL_CONST_INTRA_BLOCK.singleCoreK; + } else { + return MATMUL_CONST_PARAM_VAR.singleCoreK_; + } + } + + __aicore__ inline uint32_t GetMIter() const + { + return MATMUL_CONST_PARAM_VAR.mIter_; + } + + __aicore__ inline uint32_t GetNIter() const + { + return MATMUL_CONST_PARAM_VAR.nIter_; + } + + __aicore__ inline uint32_t GetKIter() const + { + return MATMUL_CONST_PARAM_VAR.kIter_; + } +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_INFO_H diff --git a/impl/matmul/modules/param/matmul_shape_tiling.h b/impl/matmul/modules/param/matmul_shape_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..66d2474b16254b04cd943a6d5e82d789cdf6a580 --- /dev/null +++ b/impl/matmul/modules/param/matmul_shape_tiling.h @@ -0,0 +1,35 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_shape_tiling.h + * \brief matmul variable manager + */ + +#ifndef IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_TILING_H +#define IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_TILING_H + +#include "../matmul_module.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +template +class MatmulShapeTiling { +public: + __aicore__ inline MatmulTiling GetTiling() const + { + return MATMUL_CONST_PARAM_VAR.tiling_; + } +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_TILING_H diff --git a/impl/matmul/modules/param/matmul_tensor_info.h b/impl/matmul/modules/param/matmul_tensor_info.h new file mode 100644 index 0000000000000000000000000000000000000000..af9b1dfc77eef8e574271589daf6669233975373 --- /dev/null +++ b/impl/matmul/modules/param/matmul_tensor_info.h @@ -0,0 +1,110 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_tensor_info.h + * \brief matmul variable manager + */ + +#ifndef IMPL_MATMUL_MODULES_PARAM_MATMUL_TENSOR_INFO_H +#define IMPL_MATMUL_MODULES_PARAM_MATMUL_TENSOR_INFO_H + +#include "../matmul_module.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +template +class MatmulTensorInfo { + using SrcT = typename INPUT_TYPE::T; +public: + template + __aicore__ inline GlobalTensor GetGlobalTensor() const + { + GlobalTensor globalMatrix; + if constexpr (IS_INTRA_BLOCK) { + globalMatrix.SetGlobalBuffer(MATMUL_CONST_INTRA_BLOCK.aGlobal); + } else { + globalMatrix.SetGlobalBuffer(MATMUL_CONST_PARAM_VAR.aGlobal_); + } + return globalMatrix; + } + + __aicore__ inline LocalTensor GetLocalTensor() const + { + LocalTensor localMatrix; + localMatrix.SetAddr(MATMUL_CONST_PARAM_VAR.leftMatrix_); + return localMatrix; + } + + template + __aicore__ inline void SetGlobalTensor(const GlobalTensor& globalMatrix, bool isTranspose) + { + if constexpr (IS_INTRA_BLOCK) { + MATMUL_INTRA_BLOCK.aGlobal = globalMatrix.address_; + MATMUL_INTRA_BLOCK.isTransposeA = isTranspose; + } else { + MATMUL_PARAM_VAR.aGlobal_ = globalMatrix.address_; + MATMUL_PARAM_VAR.isTransposeA_ = isTranspose; + } + } + + __aicore__ inline void SetLocalTensor(const LocalTensor& localMatrix, bool isTranspose) + { + MATMUL_PARAM_VAR.leftMatrix_ = localMatrix.address_; + MATMUL_PARAM_VAR.isTransposeA_ = isTranspose; + } +}; + +template +class MatmulTensorInfo> { + using SrcT = typename INPUT_TYPE::T; +public: + template + __aicore__ inline GlobalTensor GetGlobalTensor() const + { + GlobalTensor globalMatrix; + if constexpr (IS_INTRA_BLOCK) { + globalMatrix.SetGlobalBuffer(MATMUL_CONST_INTRA_BLOCK.bGlobal); + } else { + globalMatrix.SetGlobalBuffer(MATMUL_CONST_PARAM_VAR.bGlobal_); + } + return globalMatrix; + } + + __aicore__ inline LocalTensor GetLocalTensor() const + { + LocalTensor localMatrix; + localMatrix.SetAddr(MATMUL_CONST_PARAM_VAR.rightMatrix_); + return 
localMatrix; + } + + template + __aicore__ inline void SetGlobalTensor(const GlobalTensor& globalMatrix, bool isTranspose) + { + if constexpr (IS_INTRA_BLOCK) { + MATMUL_INTRA_BLOCK.bGlobal = globalMatrix.address_; + MATMUL_INTRA_BLOCK.isTransposeB = isTranspose; + } else { + MATMUL_PARAM_VAR.bGlobal_ = globalMatrix.address_; + MATMUL_PARAM_VAR.isTransposeB_ = isTranspose; + } + } + + __aicore__ inline void SetLocalTensor(const LocalTensor& localMatrix, bool isTranspose) + { + MATMUL_PARAM_VAR.rightMatrix_ = localMatrix.address_; + MATMUL_PARAM_VAR.isTransposeB_ = isTranspose; + } +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_TENSOR_INFO_H diff --git a/impl/matmul/modules/param/matmul_usr_define_info.h b/impl/matmul/modules/param/matmul_usr_define_info.h new file mode 100644 index 0000000000000000000000000000000000000000..ef6610d5514b1fedd6fd0b5bdf31fbe33b2a9b2b --- /dev/null +++ b/impl/matmul/modules/param/matmul_usr_define_info.h @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! +* \file matmul_usr_define_info.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ +#define IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ + +namespace Gemm { +namespace Impl { +namespace Detail { +template +class MatmulUserDefineInfo { +public: + __aicore__ inline uint64_t GetSelfDefineData() const + { + return MATMUL_CONST_PARAM_VAR.dataPtr_; + } + + __aicore__ inline uint64_t GetUserDefineInfo() const + { + return MATMUL_CONST_PARAM_VAR.tilingPtr_; + } +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h index edc612ca7b26f3568e3561ff27a5fc7c76cf9a46..2a4eb1f93d5242a8c74dcbd4ca73c8c0f0ff7587 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h @@ -1,153 +1,249 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! 
-* \file cube_in_buffer_double_buffer.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_BUFFER_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_BUFFER_H - -#include "cube_in_buffer_intf.h" - -namespace matmul { - -template -class CubeInBuffer() == CubeInBufferType::DOUBLE_BUFFER>> { - MATMUL_USE_MODULE_ON(CubeInBufferParams, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; -public: - __aicore__ inline CubeInBuffer() {} - __aicore__ inline ~CubeInBuffer() {} - __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) - { - int32_t matrixByteSize = baseBlockSize * GetBitSize() / ONE_BYTE_BIT_SIZE; - int32_t stepSize = MATMUL_MODULE(CubeInBufferParams)->GetTotalCacheNum(); - cacheFactor_ = (stepSize==0) ? 0 : ((cacheNum / stepSize - 1) & 1); - int32_t queDepth = cacheFactor_ == 0 ? SINGLE_QUE : DOUBLE_QUE; - GetTPipePtr()->InitBuffer(qid_, queDepth, matrixByteSize * stepSize + - MATMUL_MODULE(CubeInBufferParams)->GetBankConflictSize()); - } - - __aicore__ inline void Destroy() - { - isCachingPing_ = false; - isCachingPong_ = false; - qid_.FreeAllEvent(); - } - - __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) - { - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for only db version should not be -1."); }); - LocalTensor tensor = qid_.template AllocTensor(); - int32_t cachePos = bufferPos & cacheFactor_; - SetCache(cachePos, tensor); - SetBufferCaching(cachePos, true); - return tensor; - } - - __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) - { - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in FreeTensor for only db version should not be -1."); }); - int32_t cachePos = bufferPos & cacheFactor_; - if (IsBufferCaching(cachePos)) { - qid_.FreeBuffer(GetCache(cachePos)); - SetBufferCaching(cachePos, false); - } - } - - __aicore__ inline void Reset() - { - if (IsBufferCaching(0)) { - qid_.FreeBuffer(GetCache(0)); - SetBufferCaching(0, false); - } - if (IsBufferCaching(1)) { - qid_.FreeBuffer(GetCache(1)); - SetBufferCaching(1, false); - } - } - - __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) - { - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in Hit for only db version should not be -1."); }); - return iterIndex != 0 || IsBufferCaching(bufferPos & cacheFactor_); - } - - __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) - { - (void) iterIndex; - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in Hit for only db version should not be -1."); }); - LocalTensor tensor; - tensor.SetAddr(qid_.GetBufferAddr(GetCache(bufferPos & cacheFactor_))); - return tensor; - } - - __aicore__ inline void EnQue(LocalTensor& tensor) - { - qid_.EnQue(tensor); - } - - __aicore__ inline void DeQue() - { - (void) qid_.DeQue(); - } - - __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) - { - return MATMUL_MODULE(CubeInBufferParams)->GetCurKPos(curRow, curCol) % - MATMUL_MODULE(CubeInBufferParams)->GetMajorCacheNum(); - } - -private: - __aicore__ inline auto& GetCache(bool isPong) - { - return isPong ? cachePong_ : cachePing_; - } - - __aicore__ inline bool IsBufferCaching(bool isPong) - { - return isPong ? 
isCachingPong_ : isCachingPing_; - } - - __aicore__ inline void SetCache(bool isPong, const LocalTensor& cacheTensor) - { - if (isPong) { - cachePong_ = cacheTensor.GetBufferHandle(); - } else { - cachePing_ = cacheTensor.GetBufferHandle(); - } - } - - __aicore__ inline void SetBufferCaching(bool isPong, bool isCaching) - { - if (isPong) { - isCachingPong_ = isCaching; - } else { - isCachingPing_ = isCaching; - } - } - -private: - typename CubeInQueType::QUE qid_; - TBufHandle cachePing_; - TBufHandle cachePong_; - int32_t cacheFactor_; - bool isCachingPing_ { false }; - bool isCachingPong_ { false }; -}; - -} -#endif // _CUBE_IN_BUFFER_DOUBLE_BUFFER_H_ \ No newline at end of file +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file cube_in_buffer_double_buffer.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_BUFFER_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_BUFFER_H + +#include "cube_in_buffer_intf.h" +#include "../../param/matmul_shape_tiling.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +constexpr int32_t BANK_CONFLICT_SIZE = 512; +/* + CubeInBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeInBuffer is only for internal usage, does not support extension or customized specialization! +*/ +template +class CubeInBuffer() == CubeInBufferType::DOUBLE_BUFFER>> { + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + using TransT = typename INPUT_TYPE::TRANS_T; +public: + __aicore__ inline CubeInBuffer() {} + __aicore__ inline ~CubeInBuffer() {} + __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) + { + int32_t matrixByteSize = baseBlockSize * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t stepSize = GetTotalCacheNum(); + cacheFactor_ = (cacheNum / stepSize - 1) & 1; + int32_t queDepth = cacheFactor_ == 0 ? 
SINGLE_QUE : DOUBLE_QUE; + GetTPipePtr()->InitBuffer(qid_, queDepth, matrixByteSize * stepSize + GetBankConflictSize()); +#if __CCE_AICORE__ == 200 + if (IsFromUB()) { + eventIDMte3ToMte1_ = static_cast(GetTPipePtr()->AllocEventID()); + } +#endif + } + + __aicore__ inline void Destroy() + { + isCachingPing_ = false; + isCachingPong_ = false; + qid_.FreeAllEvent(); +#if __CCE_AICORE__ == 200 + if (IsFromUB()) { + GetTPipePtr()->ReleaseEventID(eventIDMte3ToMte1_); + } +#endif + } + + __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) + { + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for only db version should not be -1."); }); + LocalTensor tensor = qid_.template AllocTensor(); + int32_t cachePos = bufferPos & cacheFactor_; + SetCache(cachePos, tensor); + SetBufferCaching(cachePos, true); + return tensor; + } + + __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) + { + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in FreeTensor for only db version should not be -1."); }); + int32_t cachePos = bufferPos & cacheFactor_; + if (IsBufferCaching(cachePos)) { + qid_.FreeBuffer(GetCache(cachePos)); + SetBufferCaching(cachePos, false); + } + } + + __aicore__ inline void Reset() + { + if (IsBufferCaching(0)) { + qid_.FreeBuffer(GetCache(0)); + SetBufferCaching(0, false); + } + if (IsBufferCaching(1)) { + qid_.FreeBuffer(GetCache(1)); + SetBufferCaching(1, false); + } + } + + __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) + { + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in Hit for only db version should not be -1."); }); + return iterIndex != 0 || IsBufferCaching(bufferPos & cacheFactor_); + } + + __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) + { + (void) iterIndex; + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in Hit for only db version should not be -1."); }); + LocalTensor tensor; + tensor.SetAddr(qid_.GetBufferAddr(GetCache(bufferPos & cacheFactor_))); + return tensor; + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + { +#if __CCE_AICORE__ == 200 + if (IsFromUB()) { + SetFlag(eventIDMte3ToMte1_); + } else { + qid_.EnQue(tensor); + } +#else + qid_.EnQue(tensor); +#endif + } + + __aicore__ inline void DeQue() + { +#if __CCE_AICORE__ == 200 + if (IsFromUB()) { + WaitFlag(eventIDMte3ToMte1_); + } else { + (void) qid_.DeQue(); + } +#else + (void) qid_.DeQue(); +#endif + } + +private: + __aicore__ inline int32_t GetTotalCacheNum() + { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); + } + } + + __aicore__ inline int32_t GetMajorCacheNum() + { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb(); + } + } + + __aicore__ inline int32_t GetBankConflictSize() + { + if constexpr (MatmulFeatureTrait::IsNeedUB() && ToMatmulConfig(MM_CFG).enVecND2NZ) { + if constexpr (INPUT_TYPE::format == CubeFormat::ND) { + constexpr int32_t c0Size = AuxGetC0Size(); + if constexpr (INPUT_TYPE::isTrans) { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + bool 
isBankConflict = Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), c0Size) * + ONE_BLK_SIZE % BANK_CONFLICT_SIZE == 0; + return isBankConflict ? MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK() * c0Size * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * sizeof(typename INPUT_TYPE::TRANS_T) : 0; + } else { + bool isBankConflict = Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(), c0Size) * + ONE_BLK_SIZE % BANK_CONFLICT_SIZE == 0; + return isBankConflict ? MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * c0Size * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * sizeof(typename INPUT_TYPE::TRANS_T) : 0; + } + } else { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + bool isBankConflict = Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(), c0Size) * + ONE_BLK_SIZE % BANK_CONFLICT_SIZE == 0; + return isBankConflict ? MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * c0Size * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * sizeof(typename INPUT_TYPE::TRANS_T) : 0; + } else { + bool isBankConflict = Ceil(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), c0Size) * + ONE_BLK_SIZE % BANK_CONFLICT_SIZE == 0; + return isBankConflict ? MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK() * c0Size * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * sizeof(typename INPUT_TYPE::TRANS_T) : 0; + } + } + } else { + return 0; + } + } else { + return 0; + } + } + + __aicore__ inline auto& GetCache(bool isPong) + { + return isPong ? cachePong_ : cachePing_; + } + + __aicore__ inline bool IsBufferCaching(bool isPong) + { + return isPong ? isCachingPong_ : isCachingPing_; + } + + __aicore__ inline void SetCache(bool isPong, const LocalTensor& cacheTensor) + { + if (isPong) { + cachePong_ = cacheTensor.GetBufferHandle(); + } else { + cachePing_ = cacheTensor.GetBufferHandle(); + } + } + + __aicore__ inline void SetBufferCaching(bool isPong, bool isCaching) + { + if (isPong) { + isCachingPong_ = isCaching; + } else { + isCachingPing_ = isCaching; + } + } + + __aicore__ inline bool IsFromUB() + { + return IsSameType::value && + ((INPUT_TYPE::TAG == InputTypeTag::A && MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA()) || + (INPUT_TYPE::TAG == InputTypeTag::B && !MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB())); + } + +private: + typename CubeInQueType::QUE qid_; + TBufHandle cachePing_; + TBufHandle cachePong_; + int32_t cacheFactor_; + bool isCachingPing_ { false }; + bool isCachingPong_ { false }; +#if __CCE_AICORE__ == 200 + event_t eventIDMte3ToMte1_; +#endif +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _CUBE_IN_BUFFER_DOUBLE_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h index 9accfd9f3599ed6847b9d33c408ea9a1e4eea770..1b8c4b2b49d6f5d27c992ef59a854d6fead6f205 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h @@ -1,138 +1,154 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. 
-* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! -* \file cube_in_buffer_double_global_buffer.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H - -#include "cube_in_buffer_intf.h" -#include "global_cache.h" - -namespace matmul { - -template -class CubeInBuffer() == CubeInBufferType::DOUBLE_GLOBAL_BUFFER>> { - MATMUL_USE_MODULE_ON(CubeInBufferParams, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; -public: - __aicore__ inline CubeInBuffer() {} - __aicore__ inline ~CubeInBuffer() {} - __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) - { - baseBlockSize_ = baseBlockSize; - groupCache0_.Init(); - groupCache1_.Init(); - int32_t matrixByteSize = baseBlockSize_ * GetBitSize() / ONE_BYTE_BIT_SIZE; - groupCache0_.InitBuffer(matrixByteSize * cacheNum); - groupCache1_.InitBuffer(matrixByteSize * cacheNum); - } - - __aicore__ inline void Destroy() {} - - __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) - { - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for global que version should not be -1."); }); - if (isCache0SameAddr_) { - return groupCache0_.template GetCacheHead()[bufferPos * baseBlockSize_]; - } else if (isCache1SameAddr_) { - return groupCache1_.template GetCacheHead()[bufferPos * baseBlockSize_]; - } else { - GlobalCache* curGroupCache = isCache0_ ? &groupCache0_ : &groupCache1_; - curGroupCache->template SetOrgAddr(inputAddr_); - isCache0_ = !isCache0_; - return curGroupCache->template AllocTensor(); - } - } - - __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) - { - (void) bufferPos; - (void) tensor; - } - - __aicore__ inline void Reset() {} - - __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) - { - (void) bufferPos; - isCache0SameAddr_ = groupCache0_.template Hit(inputAddr_); - isCache1SameAddr_ = groupCache1_.template Hit(inputAddr_); - return (isCache0SameAddr_ && (iterIndex + 1 <= groupCache0_.GetCacheSize())) || - (isCache1SameAddr_ && (iterIndex + 1 <= groupCache1_.GetCacheSize())); - } - - __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) - { - (void) bufferPos; - if (isCache0SameAddr_) { - return groupCache0_.template GetCacheHead()[bufferPos * baseBlockSize_]; - } else if (isCache1SameAddr_) { - return groupCache1_.template GetCacheHead()[bufferPos * baseBlockSize_]; - } else { - ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Please call GetBuffer only when Hit is true."); }); - return NULL_TENSOR; - } - } - - __aicore__ inline void SetOrgAddr(__gm__ TransT* inputAddr) - { - inputAddr_ = inputAddr; - if (!groupCache0_.template Hit(inputAddr_) && !groupCache1_.template Hit(inputAddr_)) { - GlobalCache* curGroupCache = isCache0_ ? 
&groupCache0_ : &groupCache1_; - curGroupCache->template ClearCache(); - } - } - - __aicore__ inline void EnQue(LocalTensor& tensor) - { - if (isCache0SameAddr_) { - groupCache0_.template EnQue(tensor); - if (MATMUL_MODULE(CubeInBufferParams)->IsTailBlock()) { - groupCache0_.ReduceCacheSize(); - } - } else if (isCache1SameAddr_) { - groupCache1_.template EnQue(tensor); - if (MATMUL_MODULE(CubeInBufferParams)->IsTailBlock()) { - groupCache1_.ReduceCacheSize(); - } - } - } - - __aicore__ inline void DeQue() - { - if (isCache0SameAddr_) { - groupCache0_.template DeQue(); - } else if (isCache1SameAddr_) { - groupCache1_.template DeQue(); - } - } - - __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) - { - return MATMUL_MODULE(CubeInBufferParams)->GetIterIndex(curRow, curCol); - } - -private: - GlobalCache groupCache0_; - GlobalCache groupCache1_; - __gm__ TransT* inputAddr_; - int32_t baseBlockSize_; - bool isCache0_ { true }; - bool isCache0SameAddr_ { false }; - bool isCache1SameAddr_ { false }; -}; - -} -#endif // _CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H_ \ No newline at end of file +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file cube_in_buffer_double_global_buffer.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H + +#include "cube_in_buffer_intf.h" +#include "global_cache.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CubeInBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeInBuffer is only for internal usage, does not support extension or customized specialization! +*/ +template +class CubeInBuffer() == CubeInBufferType::DOUBLE_GLOBAL_BUFFER>> { + MATMUL_USE_MODULE(MatmulShapeTiling); + using TransT = typename INPUT_TYPE::TRANS_T; +public: + __aicore__ inline CubeInBuffer() {} + __aicore__ inline ~CubeInBuffer() {} + __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) + { + baseBlockSize_ = baseBlockSize; + groupCache0_.Init(); + groupCache1_.Init(); + int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + groupCache0_.InitBuffer(matrixByteSize * cacheNum); + groupCache1_.InitBuffer(matrixByteSize * cacheNum); + } + + __aicore__ inline void Destroy() {} + + __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) + { + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for global que version should not be -1."); }); + if (isCache0SameAddr_) { + return groupCache0_.template GetCacheHead()[bufferPos * baseBlockSize_]; + } else if (isCache1SameAddr_) { + return groupCache1_.template GetCacheHead()[bufferPos * baseBlockSize_]; + } else { + GlobalCache* curGroupCache = isCache0_ ? 
&groupCache0_ : &groupCache1_; + GlobalTensor inputTensor; + inputTensor.SetGlobalBuffer(inputAddr_); + curGroupCache->template SetOrgTensor(inputTensor); + isCache0_ = !isCache0_; + return curGroupCache->template AllocTensor(); + } + } + + __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) + { + (void) bufferPos; + (void) tensor; + } + + __aicore__ inline void Reset() {} + + __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) + { + (void) bufferPos; + GlobalTensor inputTensor; + inputTensor.SetGlobalBuffer(inputAddr_); + isCache0SameAddr_ = groupCache0_.template Hit(inputTensor); + isCache1SameAddr_ = groupCache1_.template Hit(inputTensor); + return (isCache0SameAddr_ && (iterIndex + 1 <= groupCache0_.GetCacheSize())) || + (isCache1SameAddr_ && (iterIndex + 1 <= groupCache1_.GetCacheSize())); + } + + __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) + { + (void) bufferPos; + if (isCache0SameAddr_) { + return groupCache0_.template GetCacheHead()[bufferPos * baseBlockSize_]; + } else if (isCache1SameAddr_) { + return groupCache1_.template GetCacheHead()[bufferPos * baseBlockSize_]; + } else { + ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Please call GetBuffer only when Hit is true."); }); + return NULL_TENSOR; + } + } + + __aicore__ inline void SetOrgTensor(GlobalTensor globalMatrix) + { + inputAddr_ = globalMatrix.address_; + if (!groupCache0_.template Hit(globalMatrix) && !groupCache1_.template Hit(globalMatrix)) { + GlobalCache* curGroupCache = isCache0_ ? &groupCache0_ : &groupCache1_; + curGroupCache->template ClearCache(); + } + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + { + if (isCache0SameAddr_) { + groupCache0_.template EnQue(tensor); + if (IsTailBlock()) { + groupCache0_.ReduceCacheSize(); + } + } else if (isCache1SameAddr_) { + groupCache1_.template EnQue(tensor); + if (IsTailBlock()) { + groupCache1_.ReduceCacheSize(); + } + } + } + + __aicore__ inline void DeQue() + { + if (isCache0SameAddr_) { + groupCache0_.template DeQue(); + } else if (isCache1SameAddr_) { + groupCache1_.template DeQue(); + } + } + +private: + __aicore__ inline bool IsTailBlock() + { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + return (MATMUL_PARAM_VAR.baseUseM_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) || (MATMUL_PARAM_VAR.baseUseK_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK()); + } else { + return (MATMUL_PARAM_VAR.baseUseN_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()) || (MATMUL_PARAM_VAR.baseUseK_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK()); + } + } + + GlobalCache groupCache0_; + GlobalCache groupCache1_; + __gm__ TransT* inputAddr_; + int32_t baseBlockSize_; + bool isCache0_ { true }; + bool isCache0SameAddr_ { false }; + bool isCache1SameAddr_ { false }; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h index 954abfdb812c33e8a4c95abc70c5c0fb14fa6794..40ac8e1b42c3dec070157afbcf1393dccd3c94cb 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h @@ -1,114 +1,122 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. 
-* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! -* \file cube_in_buffer_intf.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_INTF_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_INTF_H - -#include "../../matmul_module.h" -#include "../../../matmul_utils.h" -#include "cube_in_buffer_utils.h" -#include "cube_in_buffer_params.h" - -namespace matmul { -/* -CubeInBuffer: responsible for L1 buffer management. -This module provides ablities to allocate or free one l1 buffer block, and pipeline syncronization. -*/ -template -class CubeInBuffer { - using TransT = typename INPUT_TYPE::TRANS_T; -public: - __aicore__ inline CubeInBuffer() {} - __aicore__ inline ~CubeInBuffer() {} - /** - * @description: Init of buffer, should be called when matmul is inited. - * @param: baseBlockSize: element nums of basic block when loading to L1 - * @param: cacheNum: describe the nums of basic block when loading to L1 - * @return: void - */ - __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) {} - /** - * @description: Reset all should be called when matmul end - * @param: void - * @return: void - */ - __aicore__ inline void Destroy() {} - /** - * @description: Get current index of iteration - * @param: curRow: current row index - * @param: curCol: current col index - * @return: current index of iteration - */ - __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) - { - return 0; - } - /** - * @description: Judge if data of current iteration is already in buffer - * @param: iterIndex: current index of iteration, can be fetch by calling GetIterIndex - * @param: bufferPos: current buffer position - * @return: true if already in buffer, else false - */ - __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) - { - return false; - } - /** - * @description: Get buffer only when hit - * @param: iterIndex: current index of iteration, can be fetch by calling GetIterIndex - * @param: bufferPos: current buffer position - * @return: tensor on L1 - */ - __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) - { - return NULL_TENSOR; - } - /** - * @description: Allocate one block of buffer, should be called only when current iterindex does not hit - * @param: bufferPos: current buffer position - * @return: void - */ - __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) - { - return NULL_TENSOR; - } - /** - * @description: Free tensor, should be called after AllocTensor - * @param: bufferPos: current buffer position - * @param: tensor: tensor allocated by AllocTensor or NULL_TENSOR - * @return: void - */ - __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) {} - /** - * @description: Reset the status of que in CubeInBuffer - * @return: void - */ - __aicore__ inline void Reset() {} - /** - * @description: Put tensor to buffer que - * @param: tensor: target tensor on L1 - * @param: iterIndex: current index of iteration, can be fetch by calling GetIterIndex - * @return: void - */ - 
__aicore__ inline void EnQue(LocalTensor& tensor) {} - /** - * @description: Fetch tensor from que - * @param: void - * @return: void - */ - __aicore__ inline void DeQue() {} -}; - -} -#endif // _CUBE_IN_BUFFER_INTF_H_ \ No newline at end of file +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file cube_in_buffer_intf.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_INTF_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_INTF_H + +#include "../../matmul_module.h" +#include "../../../matmul_utils.h" +#include "cube_in_buffer_utils.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CubeInBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeInBuffer is only for internal usage, does not support extension or customized specialization! +*/ +/* +CubeInBuffer: responsible for L1 buffer management. +This module provides the ability to allocate or free one L1 buffer block, and handles pipeline synchronization. +*/ +template +class CubeInBuffer { + using TransT = typename INPUT_TYPE::TRANS_T; +public: + __aicore__ inline CubeInBuffer() {} + __aicore__ inline ~CubeInBuffer() {} + /** + * @description: Init of buffer, should be called when matmul is initialized. 
+ * @param: baseBlockSize: number of elements of the basic block when loading to L1 + * @param: cacheNum: number of basic blocks to cache when loading to L1 + * @return: void + */ + __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) {} + /** + * @description: Reset all; should be called when matmul ends + * @param: void + * @return: void + */ + __aicore__ inline void Destroy() {} + /** + * @description: Get current index of iteration + * @param: curRow: current row index + * @param: curCol: current col index + * @return: current index of iteration + */ + __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) + { + return 0; + } + /** + * @description: Check whether the data of the current iteration is already in the buffer + * @param: iterIndex: current index of iteration, can be fetched by calling GetIterIndex + * @param: bufferPos: current buffer position + * @return: true if already in buffer, else false + */ + __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) + { + return false; + } + /** + * @description: Get the buffer, should be called only when Hit returns true + * @param: iterIndex: current index of iteration, can be fetched by calling GetIterIndex + * @param: bufferPos: current buffer position + * @return: tensor on L1 + */ + __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) + { + return NULL_TENSOR; + } + /** + * @description: Allocate one buffer block, should be called only when the current iterIndex does not hit + * @param: bufferPos: current buffer position + * @return: allocated tensor on L1 + */ + __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) + { + return NULL_TENSOR; + } + /** + * @description: Free the tensor, should be called after AllocTensor + * @param: bufferPos: current buffer position + * @param: tensor: tensor allocated by AllocTensor or NULL_TENSOR + * @return: void + */ + __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) {} + /** + * @description: Reset the status of the queue in CubeInBuffer + * @return: void + */ + __aicore__ inline void Reset() {} + /** + * @description: Put the tensor into the buffer queue + * @param: tensor: target tensor on L1 + * @param: iterIndex: current index of iteration, can be fetched by calling GetIterIndex + * @return: void + */ + __aicore__ inline void EnQue(LocalTensor& tensor) {} + /** + * @description: Fetch the tensor from the queue + * @param: void + * @return: void + */ + __aicore__ inline void DeQue() {} +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _CUBE_IN_BUFFER_INTF_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h index 8e40792cb9b53d5df58b8f230c546bbbce26315b..713e441a724ba521eb5657944497c33d6bd37ae4 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h @@ -1,160 +1,183 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
- * See LICENSE in the root of the software repository for the full text of the License. - */ -/*! -* \file cube_in_buffer_normal.h -* \brief -*/ - -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_NORMAL_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_NORMAL_H - -#include "cube_in_buffer_intf.h" - -namespace matmul { -template -class CubeInBuffer() == CubeInBufferType::NORMAL>> { - MATMUL_USE_MODULE_ON(CubeInBufferParams, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; -public: - __aicore__ inline CubeInBuffer() {} - __aicore__ inline ~CubeInBuffer() {} - __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) - { - baseBlockSize_ = baseBlockSize; - int32_t matrixByteSize = baseBlockSize_ * GetBitSize() / ONE_BYTE_BIT_SIZE; - int32_t reduceAxisCnt = MATMUL_MODULE(CubeInBufferParams)->GetInnerLoopCnt(); - auto tpipePtr = GetTPipePtr(); - if (cacheNum > DB_FACTOR) { - if (cacheNum < reduceAxisCnt * MATMUL_MODULE(CubeInBufferParams)->GetMajorCacheNum()) { - // k not full load - cacheSize_ = cacheNum - DB_FACTOR; - tpipePtr->InitBuffer(qidCache_, SINGLE_QUE, cacheSize_ * matrixByteSize); - tpipePtr->InitBuffer(qid_, DB_FACTOR, matrixByteSize); - } else { - // k full load - cacheSize_ = cacheNum; - tpipePtr->InitBuffer(qidCache_, SINGLE_QUE, cacheSize_ * matrixByteSize); - } - } else { - if (cacheNum < reduceAxisCnt * MATMUL_MODULE(CubeInBufferParams)->GetMajorCacheNum()) { - // k not full load - cacheSize_ = 0; - tpipePtr->InitBuffer(qid_, cacheNum, matrixByteSize); - } else if (reduceAxisCnt == 1 && cacheNum == DOUBLE_QUE) { - // k full load, db on m axis - cacheSize_ = 0; - tpipePtr->InitBuffer(qid_, DOUBLE_QUE, matrixByteSize); - } else { - // k full load - cacheSize_ = cacheNum; - tpipePtr->InitBuffer(qidCache_, SINGLE_QUE, cacheSize_ * matrixByteSize); - } - } - } - - __aicore__ inline void Destroy() - { - if (cacheProc_ > 0) { - ASCENDC_ASSERT((qidCache_.GetState(cacheHead_) != TBufState::FREE), - { KERNEL_LOG(KERNEL_ERROR, "cacheHead_ state can not be TBufState::FREE"); }); - qidCache_.FreeTensor(cacheHead_); - cacheProc_ = 0; - } - qid_.FreeAllEvent(); - qidCache_.FreeAllEvent(); - cacheAlloc_ = false; - } - - __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) - { - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for normal version should not be -1."); }); - if (bufferPos >= cacheSize_) { - cacheAlloc_ = false; - return qid_.template AllocTensor(); - } else if (cacheProc_ == 0) { - cacheHead_ = qidCache_.template AllocTensor(); // To use que to insert events - } else if (cacheProc_ >= cacheSize_) { - ASCENDC_ASSERT((false), { // Logically, it shouldn't be entered. 
- KERNEL_LOG(KERNEL_ERROR, "illegal branch"); - }); - qidCache_.FreeTensor(cacheHead_); - cacheHead_ = qidCache_.template AllocTensor(); // To use que to insert events - } - ++cacheProc_; - cacheAlloc_ = true; - return cacheHead_[bufferPos * baseBlockSize_]; - } - - __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) - { - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in FreeTensor for normal version should not be -1."); }); - if (bufferPos >= cacheSize_) { - qid_.FreeTensor(const_cast&>(tensor)); - } - } - - __aicore__ inline void Reset() - { - if (cacheProc_ > 0) { - qidCache_.FreeTensor(cacheHead_); - cacheProc_ = 0; - } - } - - __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) - { - (void) bufferPos; - return (iterIndex < cacheSize_ && iterIndex < cacheProc_); - } - - __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) - { - (void) bufferPos; - return cacheHead_[iterIndex * baseBlockSize_]; - } - - __aicore__ inline void EnQue(LocalTensor& tensor) - { - if (cacheAlloc_) { - qidCache_.EnQue(tensor); - } else { - qid_.EnQue(tensor); - } - } - - __aicore__ inline void DeQue() - { - if (cacheAlloc_) { - (void) qidCache_.DeQue(); - } else { - (void) qid_.DeQue(); - } - } - - __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) - { - return MATMUL_MODULE(CubeInBufferParams)->GetIterIndex(curRow, curCol); - } - -private: - typename CubeInQueType::QUE qid_; - typename CubeInQueType::QUE qidCache_; - LocalTensor cacheHead_; // Allocate and release using qidCache_ - int32_t baseBlockSize_; - int32_t cacheSize_; - int32_t cacheProc_ { 0 }; - bool cacheAlloc_ { false }; -}; - -} -#endif // _CUBE_IN_BUFFER_NORMAL_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! +* \file cube_in_buffer_normal.h +* \brief +*/ + +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_NORMAL_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_NORMAL_H + +#include "cube_in_buffer_intf.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CubeInBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeInBuffer is only for internal usage, does not support extension or customized specialization! 
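+
+ A minimal usage sketch of this NORMAL specialization, illustrative only: buf, iterIndex,
+ pos and tensor are hypothetical caller-side names, and template arguments are omitted
+ for brevity:
+     LocalTensor tensor;                          // element type TransT
+     if (buf.Hit(iterIndex, pos)) {
+         tensor = buf.GetBuffer(iterIndex, pos);  // reuse the block already cached in L1
+     } else {
+         tensor = buf.AllocTensor(pos);
+         // ... copy the base block from GM into tensor ...
+         buf.EnQue(tensor);
+         buf.DeQue();
+     }
+     // ... load tensor into L0A / L0B ...
+     buf.FreeTensor(pos, tensor);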
+*/ +template +class CubeInBuffer() == CubeInBufferType::NORMAL>> { + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulShapeTiling); + using TransT = typename INPUT_TYPE::TRANS_T; +public: + __aicore__ inline CubeInBuffer() {} + __aicore__ inline ~CubeInBuffer() {} + __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) + { + baseBlockSize_ = baseBlockSize; + int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t reduceAxisCnt = MATMUL_MODULE(MatmulShapeInfo)->GetKIter(); + auto tpipePtr = GetTPipePtr(); + if (cacheNum > DB_FACTOR) { + if (cacheNum < reduceAxisCnt * GetMajorCacheNum()) { + // k not full load + cacheSize_ = cacheNum - DB_FACTOR; + tpipePtr->InitBuffer(qidCache_, SINGLE_QUE, cacheSize_ * matrixByteSize); + tpipePtr->InitBuffer(qid_, DB_FACTOR, matrixByteSize); + } else { + // k full load + cacheSize_ = cacheNum; + tpipePtr->InitBuffer(qidCache_, SINGLE_QUE, cacheSize_ * matrixByteSize); + } + } else { + if (cacheNum < reduceAxisCnt * GetMajorCacheNum()) { + // k not full load + cacheSize_ = 0; + tpipePtr->InitBuffer(qid_, cacheNum, matrixByteSize); + } else if (reduceAxisCnt == 1 && cacheNum == DOUBLE_QUE) { + // k full load, db on m axis + cacheSize_ = 0; + tpipePtr->InitBuffer(qid_, DOUBLE_QUE, matrixByteSize); + } else { + // k full load + cacheSize_ = cacheNum; + tpipePtr->InitBuffer(qidCache_, SINGLE_QUE, cacheSize_ * matrixByteSize); + } + } + } + + __aicore__ inline void Destroy() + { + if (cacheProc_ > 0) { + ASCENDC_ASSERT((qidCache_.GetState(cacheHead_) != TBufState::FREE), + { KERNEL_LOG(KERNEL_ERROR, "cacheHead_ state can not be TBufState::FREE"); }); + qidCache_.FreeTensor(cacheHead_); + cacheProc_ = 0; + } + qid_.FreeAllEvent(); + qidCache_.FreeAllEvent(); + cacheAlloc_ = false; + } + + __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) + { + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for normal version should not be -1."); }); + if (bufferPos >= cacheSize_) { + cacheAlloc_ = false; + return qid_.template AllocTensor(); + } else if (cacheProc_ == 0) { + cacheHead_ = qidCache_.template AllocTensor(); // To use que to insert events + } else if (cacheProc_ >= cacheSize_) { + ASCENDC_ASSERT((false), { // Logically, it shouldn't be entered. 
+ KERNEL_LOG(KERNEL_ERROR, "illegal branch"); + }); + qidCache_.FreeTensor(cacheHead_); + cacheHead_ = qidCache_.template AllocTensor(); // To use que to insert events + } + ++cacheProc_; + cacheAlloc_ = true; + return cacheHead_[bufferPos * baseBlockSize_]; + } + + __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) + { + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in FreeTensor for normal version should not be -1."); }); + if (bufferPos >= cacheSize_) { + qid_.FreeTensor(const_cast&>(tensor)); + } + } + + __aicore__ inline void Reset() + { + if (cacheProc_ > 0) { + qidCache_.FreeTensor(cacheHead_); + cacheProc_ = 0; + } + } + + __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) + { + (void) bufferPos; + return (iterIndex < cacheSize_ && iterIndex < cacheProc_); + } + + __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) + { + (void) bufferPos; + return cacheHead_[iterIndex * baseBlockSize_]; + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + { + if (cacheAlloc_) { + qidCache_.EnQue(tensor); + } else { + qid_.EnQue(tensor); + } + } + + __aicore__ inline void DeQue() + { + if (cacheAlloc_) { + (void) qidCache_.DeQue(); + } else { + (void) qid_.DeQue(); + } + } + +private: + + __aicore__ inline int32_t GetMajorCacheNum() + { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + if constexpr (DoMatmulSpecialBasicBlock(MM_CFG)) { + return ToMatmulConfig(MM_CFG).stepM; + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM(); + } + } else { + if constexpr (DoMatmulSpecialBasicBlock(MM_CFG)) { + return ToMatmulConfig(MM_CFG).stepN; + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); + } + } + } + + typename CubeInQueType::QUE qid_; + typename CubeInQueType::QUE qidCache_; + LocalTensor cacheHead_; // Allocate and release using qidCache_ + int32_t baseBlockSize_; + int32_t cacheSize_; + int32_t cacheProc_ { 0 }; + bool cacheAlloc_ { false }; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _CUBE_IN_BUFFER_NORMAL_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h index ca61b1b663834e9209a4fa62371c0cba9a046657..e4acdfa5c01f92d14a4bcaa16dfd0236649b0ca1 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h @@ -1,103 +1,109 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -/*! 
-* \file cube_in_buffer_single_buffer.h -* \brief -*/ - -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_BUFFER_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_BUFFER_H - -#include "cube_in_buffer_intf.h" - -namespace matmul { - -template -class CubeInBuffer() == CubeInBufferType::SINGLE_BUFFER>> { - MATMUL_USE_MODULE_ON(CubeInBufferParams, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; -public: - __aicore__ inline CubeInBuffer() {} - __aicore__ inline ~CubeInBuffer() {} - __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) - { - (void) cacheNum; - int32_t matrixByteSize = baseBlockSize * GetBitSize() / ONE_BYTE_BIT_SIZE; - GetTPipePtr()->InitBuffer(qid_, SINGLE_QUE, - matrixByteSize * MATMUL_MODULE(CubeInBufferParams)->GetTotalCacheNum()); - } - - __aicore__ inline void Destroy() - { - if constexpr (INPUT_TYPE::layout == LayoutMode::NONE) { - if (cacheProc_) { - qid_.FreeTensor(cacheHead_); - cacheProc_ = 0; - } - } - qid_.FreeAllEvent(); - } - - __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) - { - cacheHead_ = qid_.template AllocTensor(); - cacheProc_ = 1; - return cacheHead_[0]; - } - - __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) - { - if (cacheProc_ > 0) { - cacheProc_ = 0; - qid_.FreeTensor(cacheHead_); - } - } - - __aicore__ inline void Reset() - { - if constexpr (INPUT_TYPE::layout == LayoutMode::NONE) { - FreeTensor(); - } - } - - __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) - { - return false; - } - - __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) - { - return NULL_TENSOR; - } - - __aicore__ inline void EnQue(LocalTensor& tensor) - { - qid_.EnQue(tensor); - } - - __aicore__ inline void DeQue() - { - (void) qid_.DeQue(); - } - - __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) - { - return 0; - } - -private: - typename CubeInQueType::QUE qid_; - LocalTensor cacheHead_; - int32_t cacheProc_ { 0 }; -}; - -} -#endif // _CUBE_IN_BUFFER_SINGLE_BUFFER_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! +* \file cube_in_buffer_single_buffer.h +* \brief +*/ + +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_BUFFER_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_BUFFER_H + +#include "cube_in_buffer_intf.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CubeInBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeInBuffer is only for internal usage, does not support extension or customized specialization! 
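+
+ Behavioural note for this SINGLE_BUFFER specialization: Init() ignores cacheNum and
+ reserves a single base block, Hit() always returns false and GetBuffer() returns
+ NULL_TENSOR, so callers are expected to go through AllocTensor() -> EnQue() -> DeQue()
+ -> FreeTensor() on every iteration instead of reusing a cached block.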
+*/ +template +class CubeInBuffer() == CubeInBufferType::SINGLE_BUFFER>> { + using TransT = typename INPUT_TYPE::TRANS_T; +public: + __aicore__ inline CubeInBuffer() {} + __aicore__ inline ~CubeInBuffer() {} + __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) + { + (void) cacheNum; + int32_t matrixByteSize = baseBlockSize * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + GetTPipePtr()->InitBuffer(qid_, SINGLE_QUE, matrixByteSize); + } + + __aicore__ inline void Destroy() + { + if constexpr (INPUT_TYPE::layout == LayoutMode::NONE) { + if (cacheProc_) { + qid_.FreeTensor(cacheHead_); + cacheProc_ = 0; + } + } + qid_.FreeAllEvent(); + } + + __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) + { + cacheHead_ = qid_.template AllocTensor(); + cacheProc_ = 1; + return cacheHead_[0]; + } + + __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) + { + if (cacheProc_ > 0) { + cacheProc_ = 0; + qid_.FreeTensor(cacheHead_); + } + } + + __aicore__ inline void Reset() + { + if constexpr (INPUT_TYPE::layout == LayoutMode::NONE) { + FreeTensor(); + } + } + + __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) + { + return false; + } + + __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) + { + return NULL_TENSOR; + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + { + qid_.EnQue(tensor); + } + + __aicore__ inline void DeQue() + { + (void) qid_.DeQue(); + } + + __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) + { + return 0; + } + +private: + typename CubeInQueType::QUE qid_; + LocalTensor cacheHead_; + int32_t cacheProc_ { 0 }; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _CUBE_IN_BUFFER_SINGLE_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h index 63e93a6c6e69427cb88e9be5597e0f163605ca26..b094afb683d7a9d2024990d35e5c47f570dfdb1f 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h @@ -1,105 +1,132 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ - -/*! 
-* \file cube_in_buffer_single_global_buffer.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H - -#include "cube_in_buffer_intf.h" -#include "global_cache.h" - -namespace matmul { -template -class CubeInBuffer() == CubeInBufferType::SINGLE_GLOBAL_BUFFER>> { - MATMUL_USE_MODULE_ON(CubeInBufferParams, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; -public: - __aicore__ inline CubeInBuffer() {} - __aicore__ inline ~CubeInBuffer() {} - __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) - { - baseBlockSize_ = baseBlockSize; - int32_t matrixByteSize = baseBlockSize_ * GetBitSize() / ONE_BYTE_BIT_SIZE; - GetGlobalCachePtr()->InitBuffer(matrixByteSize * cacheNum); - } - - __aicore__ inline void Destroy() {} - - __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) - { - ASCENDC_ASSERT(bufferPos != -1, - { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for global que version should not be -1."); }); - if (GetGlobalCachePtr()->template Hit(inputAddr_)) { - return GetGlobalCachePtr()->template GetCacheHead()[bufferPos * baseBlockSize_]; - } else { - GetGlobalCachePtr()->template SetOrgAddr(inputAddr_); - return GetGlobalCachePtr()->template AllocTensor(); - } - } - - __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) - { - (void) bufferPos; - (void) tensor; - } - - __aicore__ inline void Reset() {} - - __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) - { - (void) bufferPos; - return GetGlobalCachePtr()->template Hit(inputAddr_) && - (iterIndex + 1 <= GetGlobalCachePtr()->GetCacheSize()) && - MATMUL_MODULE(CubeInBufferParams)->IsDataAmountEqual(); - } - - __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) - { - (void) bufferPos; - return GetGlobalCachePtr()->template GetCacheHead()[iterIndex * baseBlockSize_]; - } - - __aicore__ inline void SetOrgAddr(__gm__ TransT* inputAddr) - { - inputAddr_ = inputAddr; - if (!GetGlobalCachePtr()->template Hit(inputAddr_)) { - GetGlobalCachePtr()->template ClearCache(); - } - } - - __aicore__ inline void EnQue(LocalTensor& tensor) - { - GetGlobalCachePtr()->template EnQue(tensor); - if (MATMUL_MODULE(CubeInBufferParams)->IsTailBlock()) { - GetGlobalCachePtr()->ReduceCacheSize(); - } - } - - __aicore__ inline void DeQue() - { - GetGlobalCachePtr()->template DeQue(); - } - - __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) - { - return MATMUL_MODULE(CubeInBufferParams)->GetIterIndex(curRow, curCol); - } - -private: - int32_t baseBlockSize_; - __gm__ TransT* inputAddr_; -}; - -} -#endif // _CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H_ \ No newline at end of file +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! 
+* \file cube_in_buffer_single_global_buffer.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H + +#include "cube_in_buffer_intf.h" +#include "global_cache.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CubeInBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeInBuffer is only for internal usage, does not support extension or customized specialization! +*/ +template +class CubeInBuffer() == CubeInBufferType::SINGLE_GLOBAL_BUFFER>> { + MATMUL_USE_MODULE(MatmulShapeTiling); + using TransT = typename INPUT_TYPE::TRANS_T; +public: + __aicore__ inline CubeInBuffer() {} + __aicore__ inline ~CubeInBuffer() {} + __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) + { + baseBlockSize_ = baseBlockSize; + int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + GetGlobalCachePtr()->InitBuffer(matrixByteSize * cacheNum); + } + + __aicore__ inline void Destroy() {} + + __aicore__ inline LocalTensor AllocTensor(int32_t bufferPos = -1) + { + ASCENDC_ASSERT(bufferPos != -1, + { KERNEL_LOG(KERNEL_ERROR, "bufferPos in AllocTensor for global que version should not be -1."); }); + GlobalTensor inputTensor; + inputTensor.SetGlobalBuffer(inputAddr_); + if (GetGlobalCachePtr()->template Hit(inputTensor)) { + return GetGlobalCachePtr()->template GetCacheHead()[bufferPos * baseBlockSize_]; + } else { + GetGlobalCachePtr()->template SetOrgTensor(inputTensor); + return GetGlobalCachePtr()->template AllocTensor(); + } + } + + __aicore__ inline void FreeTensor(int32_t bufferPos = -1, const LocalTensor& tensor = NULL_TENSOR) + { + (void) bufferPos; + (void) tensor; + } + + __aicore__ inline void Reset() {} + + __aicore__ inline bool Hit(int32_t iterIndex, int32_t bufferPos = -1) + { + (void) bufferPos; + GlobalTensor inputTensor; + inputTensor.SetGlobalBuffer(inputAddr_); + return GetGlobalCachePtr()->template Hit(inputTensor) && + (iterIndex + 1 <= GetGlobalCachePtr()->GetCacheSize()) && IsDataAmountEqual(); + } + + __aicore__ inline LocalTensor GetBuffer(int32_t iterIndex, int32_t bufferPos = -1) + { + (void) bufferPos; + return GetGlobalCachePtr()->template GetCacheHead()[iterIndex * baseBlockSize_]; + } + + __aicore__ inline void SetOrgTensor(const GlobalTensor& globalMatrix) + { + inputAddr_ = globalMatrix.address_; + if (!GetGlobalCachePtr()->template Hit(globalMatrix)) { + GetGlobalCachePtr()->template ClearCache(); + } + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + { + GetGlobalCachePtr()->template EnQue(tensor); + if (IsTailBlock()) { + GetGlobalCachePtr()->ReduceCacheSize(); + } + } + + __aicore__ inline void DeQue() + { + GetGlobalCachePtr()->template DeQue(); + } + +private: + __aicore__ inline bool IsTailBlock() + { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + return (MATMUL_PARAM_VAR.baseUseM_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) || (MATMUL_PARAM_VAR.baseUseK_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK()); + } else { + return (MATMUL_PARAM_VAR.baseUseN_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()) || (MATMUL_PARAM_VAR.baseUseK_ != MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK()); + } + } + + __aicore__ inline bool IsDataAmountEqual() + { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + return 
(MATMUL_PARAM_VAR.baseUseM_ == MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) && + (MATMUL_PARAM_VAR.baseUseK_ == MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK()); + } else { + return (MATMUL_PARAM_VAR.baseUseK_ == MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK()) && + (MATMUL_PARAM_VAR.baseUseN_ == MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + } + } + + int32_t baseBlockSize_; + __gm__ TransT* inputAddr_; +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h index 62bf24eff93620aac81d914c282a49ee85590be7..e2d8d579ec8d9cab30dd32e83a321419eb5b51aa 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h @@ -1,92 +1,96 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ -/*! -* \file cube_in_buffer_utils.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_UTILS_H_ -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_UTILS_H_ - -#include "../../matmul_type_def.h" - -namespace matmul { - -template -struct CubeInQueType { -#if __CCE_AICORE__ == 220 - using QUE = TQueBind; -#else - using QUE = TQueBind; -#endif -}; - -constexpr int32_t DOUBLE_QUE = 2; -constexpr int32_t SINGLE_QUE = 1; - -enum class CubeInBufferType : uint8_t { - NONE = 0, - NORMAL = 1, - SINGLE_BUFFER, - DOUBLE_BUFFER, - SINGLE_GLOBAL_BUFFER, - DOUBLE_GLOBAL_BUFFER, -}; - -template -__aicore__ inline constexpr bool IsSetSingleGlobalQue() -{ - return INPUT_TYPE::ibShare && !ToMatmulConfig(MM_CFG).enableDoubleCache; -} - -template -__aicore__ inline constexpr bool IsSetDoubleGlobalQue() -{ - return INPUT_TYPE::ibShare && ToMatmulConfig(MM_CFG).enableDoubleCache; -} - -template -__aicore__ inline constexpr bool IsSetNoDB() -{ - return IsBasic(MM_CFG) || (INPUT_TYPE::TAG == InputTypeTag::B && ToMatmulConfig(MM_CFG).intraBlockPartSum) || - (INPUT_TYPE::layout != LayoutMode::NONE && ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1); -} - -template -__aicore__ inline constexpr CubeInBufferType GetCubeInBufferType() -{ - if constexpr (PhyPosIsL1(INPUT_TYPE::pos)) { - return CubeInBufferType::NONE; - } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) { - if constexpr (IsSetDoubleGlobalQue()) { - return CubeInBufferType::DOUBLE_GLOBAL_BUFFER; - } else if (IsSetSingleGlobalQue()) { - return CubeInBufferType::SINGLE_GLOBAL_BUFFER; - } else { - return CubeInBufferType::NORMAL; - } - } else if constexpr (DoMatmulNorm(MM_CFG)) { - if constexpr (IsSetNoDB()) { - return CubeInBufferType::SINGLE_BUFFER; - } else { - return CubeInBufferType::NORMAL; - } - } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { - return CubeInBufferType::DOUBLE_BUFFER; - } else if 
constexpr (DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) { - return CubeInBufferType::NORMAL; - } else { - return CubeInBufferType::NONE; - } -} - -} +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! +* \file cube_in_buffer_utils.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_UTILS_H_ +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_IN_BUFFER_CUBE_IN_BUFFER_UTILS_H_ + +#include "../../matmul_type_def.h" + +namespace Gemm { +namespace Impl { +namespace Detail { + +template +struct CubeInQueType { +#if __CCE_AICORE__ == 220 + using QUE = TQueBind; +#else + using QUE = TQueBind; +#endif +}; + +constexpr int32_t DOUBLE_QUE = 2; +constexpr int32_t SINGLE_QUE = 1; + +enum class CubeInBufferType : uint8_t { + NONE = 0, + NORMAL = 1, + SINGLE_BUFFER, + DOUBLE_BUFFER, + SINGLE_GLOBAL_BUFFER, + DOUBLE_GLOBAL_BUFFER, +}; + +template +__aicore__ inline constexpr bool IsSetSingleGlobalQue() +{ + return INPUT_TYPE::ibShare && !ToMatmulConfig(MM_CFG).enableDoubleCache; +} + +template +__aicore__ inline constexpr bool IsSetDoubleGlobalQue() +{ + return INPUT_TYPE::ibShare && ToMatmulConfig(MM_CFG).enableDoubleCache; +} + +template +__aicore__ inline constexpr bool IsSetNoDB() +{ + return IsBasic(MM_CFG) || (INPUT_TYPE::TAG == InputTypeTag::B && ToMatmulConfig(MM_CFG).intraBlockPartSum) || + (INPUT_TYPE::layout != LayoutMode::NONE && ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1); +} + +template +__aicore__ inline constexpr CubeInBufferType GetCubeInBufferType() +{ + if constexpr (PhyPosIsL1(INPUT_TYPE::pos)) { + return CubeInBufferType::NONE; + } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) { + if constexpr (IsSetDoubleGlobalQue()) { + return CubeInBufferType::DOUBLE_GLOBAL_BUFFER; + } else if (IsSetSingleGlobalQue()) { + return CubeInBufferType::SINGLE_GLOBAL_BUFFER; + } else { + return CubeInBufferType::NORMAL; + } + } else if constexpr (DoMatmulNorm(MM_CFG)) { + if constexpr (IsSetNoDB()) { + return CubeInBufferType::SINGLE_BUFFER; + } else { + return CubeInBufferType::NORMAL; + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + return CubeInBufferType::DOUBLE_BUFFER; + } else if constexpr (DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) { + return CubeInBufferType::NORMAL; + } else { + return CubeInBufferType::NONE; + } +} + +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // _CUBE_IN_BUFFER_UTILS_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_in_buffer/global_cache.h b/impl/matmul/modules/resource/cube_in_buffer/global_cache.h index dd74925eadac8b80fdecc87e97de22f9f1cf851e..0b049e70b3656182557623101a61767898c86f04 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/global_cache.h +++ b/impl/matmul/modules/resource/cube_in_buffer/global_cache.h @@ -14,16 +14,22 @@ #ifndef IMPL_MATMUL_MODULES_GLOBAL_CACHE_H_ #define IMPL_MATMUL_MODULES_GLOBAL_CACHE_H_ -namespace matmul { 
+namespace Gemm { +namespace Impl { +namespace Detail { class GlobalCache; -} -__BLOCK_LOCAL__ __inline__ matmul::GlobalCache* gL1Cache; -__aicore__ inline matmul::GlobalCache* GetGlobalCachePtr() +} // namespace Detail +} // namespace Impl +} // namespace Gemm +__BLOCK_LOCAL__ __inline__ Gemm::Impl::Detail::GlobalCache* gL1Cache; +__aicore__ inline Gemm::Impl::Detail::GlobalCache* GetGlobalCachePtr() { return gL1Cache; } -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { class GlobalCache { public: @@ -36,45 +42,6 @@ public: alloc_ = false; } - template - __aicore__ inline void InitBuffer(const TCubeTiling* __restrict cubeTiling, TPipe* tpipe) - { - using SrcT = typename A_TYPE::T; - constexpr int32_t c0Size_ = GetC0Size(); - int32_t sizeMatrix; - uint16_t alignedDepthB1 = cubeTiling->depthB1; - if constexpr(B_TYPE::ibShare) { - int baseKN; - // float input case, k_l1_b will be aligned to 16, b matrix L1 size will be larger than expected - if constexpr (IsSameType::value) { - uint16_t alignedBaseK = ConstCeil(cubeTiling->baseK, BLOCK_CUBE) * BLOCK_CUBE; - baseKN = alignedBaseK * cubeTiling->baseN; - ASCENDC_ASSERT((baseKN > 0), - { KERNEL_LOG(KERNEL_ERROR, "baseKN_ is %d, which should be larger than 0", baseKN); }); - // check L1 size after using aligned kb - if ((baseKN * cubeTiling->depthA1 + baseKN * alignedDepthB1) * sizeof(float) > TOTAL_L1_SIZE) { - // exceeding L1 size, decrease depth b1 - alignedDepthB1 = cubeTiling->baseK * cubeTiling->baseN * alignedDepthB1 / baseKN; - } - ASCENDC_ASSERT((alignedDepthB1 > 0), { - KERNEL_LOG(KERNEL_ERROR, "alignedDepthB1 is %d, which should be larger than 0", alignedDepthB1); - }); - } else if constexpr (IsSameType::value) { - baseKN = ConstCeil(cubeTiling->baseK, c0Size_) * c0Size_ * - ConstCeil(cubeTiling->baseN, c0Size_) * c0Size_; - } else { - baseKN = cubeTiling->baseK * cubeTiling->baseN; - } - sizeMatrix = alignedDepthB1 * baseKN * sizeof(SrcT); - } else if constexpr (A_TYPE::ibShare) { - int baseMK = cubeTiling->baseM * cubeTiling->baseK; - sizeMatrix = cubeTiling->depthA1 * baseMK * sizeof(SrcT); - } else { - return; - } - tpipe->InitBuffer(cacheQue_, 1, sizeMatrix); - } - __aicore__ inline void InitBuffer(int32_t baseBlockSize) { if (!isInited_) { @@ -84,9 +51,9 @@ public: } template - __aicore__ inline bool Hit(__gm__ SrcT* gmAddr) + __aicore__ inline bool Hit(const GlobalTensor& globalMatrix) { - return (alloc_ && (reinterpret_cast(gmAddr) == srcAddr_)); + return (alloc_ && (reinterpret_cast(globalMatrix.address_) == srcAddr_)); } template @@ -145,20 +112,23 @@ public: } template - __aicore__ inline void SetCacheHead(LocalTensor& cacheHead) + __aicore__ inline void SetCacheHead(LocalTensor& cacheHead) { cacheHead_ = cacheHead.address_; } template - __aicore__ inline void SetOrgAddr(__gm__ SrcT* gmAddr) + __aicore__ inline void SetOrgTensor(const GlobalTensor& globalMatrix) { - srcAddr_ = reinterpret_cast(gmAddr); + srcAddr_ = reinterpret_cast(globalMatrix.address_); } - __aicore__ inline GM_ADDR GetOrgAddr() + template + __aicore__ inline GlobalTensor GetOrgTensor() { - return srcAddr_; + GlobalTensor globalMatrix; + globalMatrix.SetGlobalBuffer(srcAddr_); + return globalMatrix; } __aicore__ inline void FreeAllEvent() @@ -186,5 +156,7 @@ public: bool isInited_ { false }; }; -} +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // _GLOBAL_CACHE_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h 
b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h index 5fb807700d9e08d057499f06bcf782b4f9812c9d..a85f717b299076331496e128216e0287bf36cf73 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h @@ -1,44 +1,47 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ -/*! -* \file cube_out_buffer.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H - -namespace matmul { -enum class UNIT_FLAG_CTRL : uint8_t { - DISABLE, - RESERVED, - CHECK, - SET, -}; - -// L0cType -template -struct L0cType { - __aicore__ inline L0cType() {}; -}; - -template <> -struct L0cType { - __aicore__ inline L0cType() {}; - using BUFFER = TBuf; -}; - -template <> -struct L0cType { - __aicore__ inline L0cType() {}; - using BUFFER = TQue; -}; -} - +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! +* \file cube_out_buffer.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H + +namespace Gemm { +namespace Impl { +namespace Detail { +enum class UNIT_FLAG_CTRL : uint8_t { + DISABLE, + RESERVED, + CHECK, + SET, +}; + +// L0cType +template +struct L0cType { + __aicore__ inline L0cType() {}; +}; + +template <> +struct L0cType { + __aicore__ inline L0cType() {}; + using BUFFER = TBuf; +}; + +template <> +struct L0cType { + __aicore__ inline L0cType() {}; + using BUFFER = TQue; +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h index 90d4b379777a9e8af95909c73e447c095e70f7ae..2c7637adb76adcec3fe67639c50e986d945b20bb 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h @@ -1,81 +1,89 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
-* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ -/*! -* \file cube_out_buffer_no_unit_flag.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_NO_UNIT_FLAG_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_NO_UNIT_FLAG_H - -#include "cube_out_buffer_base.h" -#include "../../../matmul_utils.h" -#include "lib/matmul/tiling.h" - -namespace matmul { - -template -class CubeOutBuffer -{ -public: - __aicore__ inline CubeOutBuffer() {}; - __aicore__ inline ~CubeOutBuffer() {}; - __aicore__ inline void Init(int32_t cacheSize = 1, uint32_t lenFactor = 1) - { - constexpr int32_t DB_NUM = 2; - if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT || DoMatmulSpecialMDL(MM_CFG)) { - lenFactor = DB_NUM; - } - if (MATMUL_PARAM_VAR.tiling_.GetDbL0C() == DB_NUM) { - GetTPipePtr()->InitBuffer( - CO1_, DB_NUM, lenFactor * cacheSize * sizeof(L0cT)); - } else { - GetTPipePtr()->InitBuffer( - CO1_, 1, lenFactor * cacheSize * sizeof(L0cT)); - } - } - - __aicore__ inline LocalTensor AllocTensor() - { - cMatrix_ = CO1_.template AllocTensor(); - return cMatrix_; - } - - __aicore__ inline LocalTensor GetTensor() - { - return cMatrix_; - } - - __aicore__ inline void EnQue(LocalTensor& tensor) - { - CO1_.EnQue(tensor); - } - - __aicore__ inline LocalTensor DeQue() - { - return CO1_.template DeQue(); - } - - __aicore__ inline void FreeTensor(LocalTensor& co1Local) - { - CO1_.FreeTensor(co1Local); - } - - __aicore__ inline void Destroy() - { - CO1_.FreeAllEvent(); - } - -private: - typename L0cType::BUFFER CO1_; - LocalTensor cMatrix_; -}; -} - +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! +* \file cube_out_buffer_no_unit_flag.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_NO_UNIT_FLAG_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_NO_UNIT_FLAG_H + +#include "cube_out_buffer_base.h" +#include "../../../matmul_utils.h" +#include "lib/matmul/tiling.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CubeOutBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeOutBuffer is only for internal usage, does not support extension or customized specialization! 
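+
+  A typical lifecycle, sketched here for illustration only (the owning `cubeOut` object and the
+  cacheSize value are assumptions, e.g. one baseM * baseN L0C tile; they are not part of this file):
+      cubeOut.Init(baseM * baseN);        // reserve L0C space; double-buffered when tiling enables dbL0C
+      auto co1 = cubeOut.AllocTensor();   // acquire the L0C tile used for accumulation
+      // ... Mmad accumulation into co1 ...
+      cubeOut.EnQue(co1);                 // hand the tile over to the copy-out stage
+      auto out = cubeOut.DeQue();
+      cubeOut.FreeTensor(out);            // release once the result has been copied out
+      cubeOut.Destroy();                  // drain events when the iterate loop is finished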
+*/ +template +class CubeOutBuffer +{ + MATMUL_USE_MODULE(MatmulShapeTiling) +public: + __aicore__ inline CubeOutBuffer() {}; + __aicore__ inline ~CubeOutBuffer() {}; + __aicore__ inline void Init(int32_t cacheSize = 1, uint32_t lenFactor = 1) + { + constexpr int32_t DB_NUM = 2; + if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT || DoMatmulSpecialMDL(MM_CFG)) { + lenFactor = DB_NUM; + } + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDbL0C() == DB_NUM) { + GetTPipePtr()->InitBuffer( + CO1_, DB_NUM, lenFactor * cacheSize * sizeof(L0cT)); + } else { + GetTPipePtr()->InitBuffer( + CO1_, 1, lenFactor * cacheSize * sizeof(L0cT)); + } + } + + __aicore__ inline LocalTensor AllocTensor() + { + cMatrix_ = CO1_.template AllocTensor(); + return cMatrix_; + } + + __aicore__ inline LocalTensor GetTensor() + { + return cMatrix_; + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + { + CO1_.EnQue(tensor); + } + + __aicore__ inline LocalTensor DeQue() + { + return CO1_.template DeQue(); + } + + __aicore__ inline void FreeTensor(LocalTensor& co1Local) + { + CO1_.FreeTensor(co1Local); + } + + __aicore__ inline void Destroy() + { + CO1_.FreeAllEvent(); + } + +private: + typename L0cType::BUFFER CO1_; + LocalTensor cMatrix_; +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_NO_UNIT_FLAG_H \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h index 3032f5ed59216b8f158f5cdfa86842fe6437091c..6051bb610d004bbe7b55852bea73e9638c9321a1 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h @@ -1,75 +1,82 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ -/*! 
-* \file cube_out_buffer_unit_flag.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H -#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H - -#include "cube_out_buffer_base.h" -#include "../../../matmul_utils.h" -#include "lib/matmul/tiling.h" -#include "../../feature_trait/matmul_feature_trait.h" - -namespace matmul { - -template -class CubeOutBuffer::IsUnitFlagEnabled()>> -{ -public: - __aicore__ inline CubeOutBuffer() {}; - __aicore__ inline ~CubeOutBuffer() {}; - __aicore__ inline void Init(int32_t cacheSize = 1, uint32_t lenFactor = 1) - { - constexpr int32_t DB_NUM = 2; - if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT || DoMatmulSpecialMDL(MM_CFG)) { - lenFactor = DB_NUM; - } - GetTPipePtr()->InitBuffer(CO1_, lenFactor * cacheSize * sizeof(L0cT)); - } - - __aicore__ inline LocalTensor AllocTensor() - { - cMatrix_ = CO1_.template Get(); - return cMatrix_; - } - - __aicore__ inline LocalTensor GetTensor() - { - return cMatrix_; - } - - __aicore__ inline void EnQue(LocalTensor& tensor) - {} - - __aicore__ inline LocalTensor DeQue() - { - return cMatrix_; - } - - __aicore__ inline void FreeTensor(LocalTensor &co1Local) - {} - - __aicore__ inline void Destroy() - { - event_t eventIDFixToM = static_cast(GetTPipePtr()->FetchEventID(HardEvent::FIX_M)); - SetFlag(eventIDFixToM); - WaitFlag(eventIDFixToM); - } - -private: - typename L0cType::BUFFER CO1_; - LocalTensor cMatrix_; -}; -} - +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! +* \file cube_out_buffer_unit_flag.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H +#define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H + +#include "cube_out_buffer_base.h" +#include "../../../matmul_utils.h" +#include "lib/matmul/tiling.h" +#include "../../feature_trait/matmul_feature_trait.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CubeOutBuffer is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CubeOutBuffer is only for internal usage, does not support extension or customized specialization! 
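+
+  Note: in this unit-flag specialization the L0C hand-off is carried by UnitFlag rather than a
+  TQue, so EnQue()/DeQue()/FreeTensor() are intentionally empty and Destroy() only issues and
+  waits a FIX_M event; callers can keep the same call sequence as the non-unit-flag variant.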
+*/ +template +class CubeOutBuffer::IsUnitFlagEnabled()>> +{ +public: + __aicore__ inline CubeOutBuffer() {}; + __aicore__ inline ~CubeOutBuffer() {}; + __aicore__ inline void Init(int32_t cacheSize = 1, uint32_t lenFactor = 1) + { + constexpr int32_t DB_NUM = 2; + if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT || DoMatmulSpecialMDL(MM_CFG)) { + lenFactor = DB_NUM; + } + GetTPipePtr()->InitBuffer(CO1_, lenFactor * cacheSize * sizeof(L0cT)); + } + + __aicore__ inline LocalTensor AllocTensor() + { + cMatrix_ = CO1_.template Get(); + return cMatrix_; + } + + __aicore__ inline LocalTensor GetTensor() + { + return cMatrix_; + } + + __aicore__ inline void EnQue(LocalTensor& tensor) + {} + + __aicore__ inline LocalTensor DeQue() + { + return cMatrix_; + } + + __aicore__ inline void FreeTensor(LocalTensor &co1Local) + {} + + __aicore__ inline void Destroy() + { + event_t eventIDFixToM = static_cast(GetTPipePtr()->FetchEventID(HardEvent::FIX_M)); + SetFlag(eventIDFixToM); + WaitFlag(eventIDFixToM); + } + +private: + typename L0cType::BUFFER CO1_; + LocalTensor cMatrix_; +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/copy_cube_in/async_tensor.h b/impl/matmul/modules/stage/copy_cube_in/async_tensor.h index a8286998b0824ccfcaf762bf2a7cc0c6ba9e8eaf..0625a308a7f43478bb72aee7114526b303d2e385 100644 --- a/impl/matmul/modules/stage/copy_cube_in/async_tensor.h +++ b/impl/matmul/modules/stage/copy_cube_in/async_tensor.h @@ -1,65 +1,68 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file async_tensor.h - * \brief - */ - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H - -namespace matmul { -template -class AsyncTensor { -public: - __aicore__ inline AsyncTensor() = default; - __aicore__ inline AsyncTensor(const LocalTensor& tensor, bool needDeque) - { - tensor_ = tensor; - needDeque_ = needDeque; - } - - __aicore__ inline AsyncTensor operator = (const AsyncTensor& other) - { - if (this != &other) { - tensor_ = other.tensor_; - needDeque_ = other.needDeque_; - } - return *this; - } - - __aicore__ inline AsyncTensor (const AsyncTensor& other) - { - if (this != &other) { - tensor_ = other.tensor_; - needDeque_ = other.needDeque_; - } - } - - __aicore__ inline ~AsyncTensor() = default; - - __aicore__ inline LocalTensor Get() const - { - return tensor_; - } - - __aicore__ inline bool IsNeedDeQue() const - { - return needDeque_; - } - -private: - LocalTensor tensor_; - bool needDeque_; -}; -} // namespace matmul - -#endif //_ASYNC_TENSOR_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file async_tensor.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H + +namespace Gemm { +namespace Impl { +namespace Detail { +template +class AsyncTensor { +public: + __aicore__ inline AsyncTensor() = default; + __aicore__ inline AsyncTensor(const LocalTensor& tensor, bool needDeque) + { + tensor_ = tensor; + needDeque_ = needDeque; + } + + __aicore__ inline AsyncTensor operator = (const AsyncTensor& other) + { + if (this != &other) { + tensor_ = other.tensor_; + needDeque_ = other.needDeque_; + } + return *this; + } + + __aicore__ inline AsyncTensor (const AsyncTensor& other) + { + if (this != &other) { + tensor_ = other.tensor_; + needDeque_ = other.needDeque_; + } + } + + __aicore__ inline ~AsyncTensor() = default; + + __aicore__ inline LocalTensor Get() const + { + return tensor_; + } + + __aicore__ inline bool IsNeedDeQue() const + { + return needDeque_; + } + +private: + LocalTensor tensor_; + bool needDeque_; +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif //_ASYNC_TENSOR_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h index ab2cf64c66a4210c4bfd7fd1b545e0b47d1032f2..f308662c5049f8bf490c93d68566c49cb03bd694 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h @@ -1,297 +1,448 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -/*! 
- * \file batch_copy_cube_in.h - * \brief - */ - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H - -#include "batch_copy_cube_in_intf.h" -#include "batch_layout.h" -#include "../data_copy_wrapper.h" -#include "../../../resource/cube_in_buffer/cube_in_buffer.h" -#include "../copy_cube_in_params.h" -#include "../../../matmul_var.h" - -namespace matmul { -// Specialized Template Class of Batch Matmul CopyIn -// Batch Matmul ND Format Data CopyIn From GM/UB -template -class BatchCopyCubeIn::IsNeedUB() && - GetCopyCubeInType() == CopyCubeInType::BMM && - INPUT_TYPE::format == CubeFormat::ND>> -{ - MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulShapeInfo, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(BatchLayout, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulShapeTiling, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; - -public: - inline __aicore__ BatchCopyCubeIn() = default; - inline __aicore__ ~BatchCopyCubeIn() = default; - - __aicore__ inline void Init() - { - MATMUL_MODULE(CubeInBuffer) - ->Init(MATMUL_MODULE(MatmulShapeTiling)->GetBatchNum() * - MATMUL_MODULE(CopyCubeInParams)->template GetSingleSizeAlign(), - 1); - } - - __aicore__ inline void SetInput(__gm__ SrcT *srcGlobalAddr, bool isTranspose = false) - { - MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - - __aicore__ inline void SetInput(const TBuffAddr& address, bool isTranspose = false) - {} - - __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, - const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) - { - ASCENDC_ASSERT((MATMUL_MODULE(BatchLayout)->IsLayoutGValid()), { - KERNEL_LOG(KERNEL_ERROR, "multi batch calculation of multiple lines of S is not supported"); - }); - if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) { - return CopyBatchToCube < true, - INPUT_TYPE::TAG == InputTypeTag::A > (dstTensor, matrixStride, outerIdx, splitIdx, splitSize, - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight()); - } else { - return CopyBatchToCube < false, - INPUT_TYPE::TAG == InputTypeTag::B > (dstTensor, matrixStride, outerIdx, splitIdx, splitSize, - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth()); - } - } - - __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) - { - LocalTensor localTensor; - localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); - return localTensor; - } - - __aicore__ inline void BatchDestroy() - { - MATMUL_MODULE(CubeInBuffer)->FreeTensor(); - MATMUL_MODULE(CubeInBuffer)->Destroy(); - } - - __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) - { - return MATMUL_MODULE(CubeInBuffer)->AllocTensor(iterIndex); - } - - __aicore__ inline void 
ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, - int32_t curRow = 0, int32_t curCol = 0) - {} - - __aicore__ inline void Destroy() - { - MATMUL_MODULE(CubeInBuffer)->Destroy(); - } - - __aicore__ inline void Reset() - { - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - -private: - template - __aicore__ inline void CopyBatchToCube(LocalTensor& dstTensor, const uint32_t matrixStride, - const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize, - const int32_t height, int32_t width, int32_t varHeight, int32_t varWidth) - { - auto srcStride = MATMUL_MODULE(BatchLayout)->GetSrcStride(matrixStride, varHeight, varWidth); - auto srcDValue = MATMUL_MODULE(BatchLayout)->template GetSrcDValue(); - auto iterNum = MATMUL_MODULE(BatchLayout)->GetLoopNum(); - auto batchNum = MATMUL_MODULE(BatchLayout)->GetBatchNum() / splitSize; - auto baseSizeAlign = GetSingleSizeAlign(height, width); - auto batchSingleSize = batchNum * MATMUL_MODULE(CopyCubeInParams)->GetSingleSize(); - int64_t batchOffset = outerIdx * MATMUL_MODULE(MatmulShapeTiling)->GetBatchNum() * varHeight * varWidth; - int64_t iterOffset = 0; - uint64_t dstOffset = batchNum * splitIdx * baseSizeAlign; - uint64_t srcOffset = batchNum * splitIdx * srcStride; - for (int32_t idx = 0; idx < iterNum; ++idx) { - dstOffset += iterOffset; - GlobalTensor srcGlobal; - srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); - srcGlobal.SetAddr(iterOffset + batchOffset); - if (srcStride >= UINT16_MAX) { - for (int i = 0; i < batchNum; ++i) { - MATMUL_MODULE(DataCopyUtils) - ->BatchCopyND2NZ(dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, height, width, srcDValue); - dstOffset += baseSizeAlign; - srcOffset += srcStride; - } - } else { - MATMUL_MODULE(DataCopyUtils) - ->BatchCopyND2NZ(dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, height, width, - srcDValue, batchNum, srcStride, baseSizeAlign); - } - iterOffset += batchSingleSize; - } - } - - __aicore__ inline int32_t GetSingleSizeAlign(const int32_t height, const int32_t width) - { - return CeilAlign(height, BLOCK_CUBE) * CeilAlign(width, c0Size_); - } - -private: - constexpr static int32_t c0Size_ = AuxGetC0Size(); -}; - -// Specialized Template Class of Batch Matmul CopyIn -// Batch Matmul NZ Format Data CopyIn From GM/UB -template -class BatchCopyCubeIn::IsNeedUB()) && - GetCopyCubeInType() == CopyCubeInType::BMM && - (INPUT_TYPE::format == CubeFormat::NZ)>> -{ - MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulShapeInfo, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(BatchLayout, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulShapeTiling, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; - -public: - inline __aicore__ BatchCopyCubeIn() = default; - inline __aicore__ ~BatchCopyCubeIn() = default; - __aicore__ inline void Init() - { - MATMUL_MODULE(CubeInBuffer) - ->Init(MATMUL_MODULE(MatmulShapeTiling)->GetBatchNum() * - MATMUL_MODULE(CopyCubeInParams)->template GetSingleSizeAlign(), - 1); - } - - __aicore__ inline void SetInput(__gm__ SrcT *srcGlobalAddr, bool isTranspose = false) - { - MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - - __aicore__ inline void SetInput(const TBuffAddr& 
address, bool isTranspose = false) - {} - - __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, - const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) - { - if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) { - CopyBatchToCube( - dstTensor, outerIdx, splitIdx, splitSize, MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight()); - } else { - CopyBatchToCube( - dstTensor, outerIdx, splitIdx, splitSize, MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetSingleWidth()); - } - } - - __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) - { - LocalTensor localTensor; - localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); - return localTensor; - } - - __aicore__ inline void BatchDestroy() - { - MATMUL_MODULE(CubeInBuffer)->FreeTensor(); - MATMUL_MODULE(CubeInBuffer)->Destroy(); - } - - __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) - { - return MATMUL_MODULE(CubeInBuffer)->AllocTensor(iterIndex); - } - - __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, - int32_t curRow = 0, int32_t curCol = 0) - {} - - __aicore__ inline void Destroy() - { - MATMUL_MODULE(CubeInBuffer)->Destroy(); - } - - __aicore__ inline void Reset() - { - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - -private: - template - __aicore__ inline void CopyBatchToCube(LocalTensor& dstTensor, const int32_t outerIdx, - const int32_t splitIdx, const int32_t splitSize, const int32_t height, - const int32_t width, int32_t varHeight, int32_t varWidth) - { - auto batchNum = MATMUL_MODULE(MatmulShapeTiling)->GetBatchNum() / splitSize; - auto singleSizeAlign = GetSingleSizeAlign(height, width); - auto singleSize = GetSingleSizeAlign(varHeight, varWidth); - auto batchOffset = outerIdx * MATMUL_MODULE(MatmulShapeTiling)->GetBatchNum() * singleSize; - bool iskRowDirec = IS_KROW && IsSameTypeV; - - GlobalTensor srcGlobal; - srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); - srcGlobal.SetAddr(batchOffset); - - int32_t alignHeight = CeilAlign(height, BLOCK_CUBE); - int32_t alignWidth = CeilAlign(width, c0Size_); - auto singleSizeAlignDst = GetSingleSizeAlign(height, width); - uint64_t dstOffset = batchNum * splitIdx * singleSizeAlignDst; - uint64_t srcOffset = batchNum * splitIdx * singleSizeAlign; - MATMUL_MODULE(DataCopyUtils) - ->BatchCopyNZ2NZ(dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, alignHeight, alignWidth * batchNum, alignHeight, - iskRowDirec); - } - - template - __aicore__ inline int32_t GetSingleSizeAlign(const int32_t height, const int32_t width) - { - if constexpr (K_IS_ROW && IsSameTypeV && K_ALIGN_C0SIZE) { - return CeilAlign(height, c0Size_) * CeilAlign(width, c0Size_); - } else { - return CeilAlign(height, BLOCK_CUBE) * CeilAlign(width, c0Size_); - } - } - -private: - constexpr static int32_t c0Size_ = AuxGetC0Size(); -}; - -} // namespace matmul -#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! + * \file batch_copy_cube_in.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H + +#include "batch_copy_cube_in_intf.h" +#include "batch_copy_cube_in_params.h" +#include "../data_copy_wrapper.h" +#include "../../../resource/cube_in_buffer/cube_in_buffer.h" +#include "../copy_cube_in_params.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +// Specialized Template Class of Batch Matmul CopyIn +// Batch Matmul ND Format Data CopyIn From GM/UB +template +class BatchCopyCubeIn::IsNeedUB() && + GetCopyCubeInType() == CopyCubeInType::BMM && + INPUT_TYPE::format == CubeFormat::ND>> +{ + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchCopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchDataCopyUtils, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulShapeTiling); + + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + inline __aicore__ BatchCopyCubeIn() = default; + inline __aicore__ ~BatchCopyCubeIn() = default; + + __aicore__ inline void Init() + { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * GetSingleSizeAlign(), 1); + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose = false) + { + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose); + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose = false) + {} + + __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) + { + if (IsTranspose()) { + return CopyBatchToCube( + dstTensor, matrixStride, outerIdx, splitIdx, splitSize); + } else { + return CopyBatchToCube( + dstTensor, matrixStride, outerIdx, splitIdx, splitSize); + } + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor localTensor; + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); + return localTensor; + } + + __aicore__ inline void BatchDestroy() + { + MATMUL_MODULE(CubeInBuffer)->FreeTensor(); + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) + { + return MATMUL_MODULE(CubeInBuffer)->AllocTensor(iterIndex); + } + + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) + {} + + __aicore__ inline void Destroy() + { + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline void Reset() + { + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + +private: + 
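+    // Illustrative call order for this ND batch copy-in stage (sketch only; `copyIn`, the input
+    // GlobalTensor and the loop indices below are hypothetical names, not part of this file):
+    //     copyIn.Init();                                    // size CubeInBuffer for batchNum tiles
+    //     copyIn.SetInput(gmMatrix, /*isTranspose=*/false);
+    //     LocalTensor<TransT> l1 = copyIn.AllocTensor();
+    //     copyIn.BatchLoad(l1, /*matrixStride=*/0, outerIdx, splitIdx, splitSize);
+    //     ...                                               // iterate/compute on the L1 data
+    //     copyIn.BatchDestroy();                            // free the tensor and tear down the buffer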
template + __aicore__ inline void CopyBatchToCube(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize ) + { + // Calculate batch outer loop offset + // the parameter false means don't need to use constant parameters + int64_t batchOffset = outerIdx * GetSingleSize() * + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum(); + + // Calculate iter numbers by line of BSNGD layout + int32_t batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum(); // batchA_ or batchB_ + int32_t iterNum = 1; + UpdataBatchNum(batchNum, iterNum); + batchNum /= splitSize; + + // Calculate srcDValue for ND copy + auto srcDValue = MATMUL_MODULE(BatchCopyCubeInParams)->template GetBatchOrgWidth(); + + // Calculate src and dst stride of one step + // if user input matrixStride, use matrixStride as srcStride + auto srcStride = matrixStride != 0 ? matrixStride : GetSrcStride(); + auto dstStride = GetSingleSizeAlign(); + int64_t srcOffset = batchNum * splitIdx * srcStride; + int64_t dstOffset = batchNum * splitIdx * dstStride; + + // Calculate src and dst stride of one line + auto iterSrcStride = batchNum * GetSingleSize(); + auto iterDstStride = batchNum * GetSingleSize(); + + // Complete datacopy by line + GlobalTensor srcGlobal; + srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_); + srcGlobal.SetAddr(batchOffset); + for (int32_t idx = 0; idx < iterNum; ++idx) { + if (srcStride >= UINT16_MAX) { + for (int i = 0; i < batchNum; ++i) { + MATMUL_MODULE(BatchDataCopyUtils)->BatchCopyND2NZ( + dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, + GetSingleHeight(), GetSingleWidth(), srcDValue); + dstOffset += dstStride; + srcOffset += srcStride; + } + } else { + MATMUL_MODULE(BatchDataCopyUtils)->BatchCopyND2NZ( + dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, + GetSingleHeight(), GetSingleWidth(), + srcDValue, batchNum, srcStride, dstStride); + } + dstOffset += iterDstStride; + srcOffset += iterSrcStride; + } + } + + __aicore__ inline void UpdataBatchNum(int32_t &batchNum, int32_t &iterNum) + { + if constexpr (INPUT_TYPE::layout == LayoutMode::BSNGD) { + ASCENDC_ASSERT((IsLayoutGValid()), { + KERNEL_LOG(KERNEL_ERROR, "multi batch calculation of multiple lines of S is not supported"); + }); + // if batchNum > LayoutN * LayoutG, need copy by single line + if (batchNum > GetLayoutInfoNG()) { + // update batchnum to single line batch number + batchNum = GetLayoutInfoNG(); + iterNum = Ceil(MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum(), batchNum); + } + } + } + + __aicore__ inline int32_t GetLayoutInfoNG() + { + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoN() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoG(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoN() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoG(); + } + } + + template + __aicore__ inline int64_t GetSingleSize() const + { + return GetSingleWidth() * GetSingleHeight(); + } + + template + __aicore__ inline int64_t GetSingleSizeAlign() const + { + // ND format not support int8 + return CeilAlign(GetSingleWidth(), c0Size_) * + CeilAlign(GetSingleHeight(), BLOCK_CUBE); + } + + template + __aicore__ inline int32_t GetSingleHeight() const + { + if constexpr (NEED_BASIC && IsBasic(MM_CFG)) { + // false: not support intraBlock, true: is basic constantized scenario + return MATMUL_MODULE(CopyCubeInParams)->template 
GetSingleHeight(); + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(); + } + } + + template + __aicore__ inline int32_t GetSingleWidth() const + { + if constexpr (NEED_BASIC && IsBasic(MM_CFG)) { + // false: not support intraBlock, true: is basic constantized scenario + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(); + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(); + } + } + + // ND format, src data default don't need to use constant parameters + template + __aicore__ inline int64_t GetSrcStride() + { + if constexpr (INPUT_TYPE::layout == LayoutMode::BSNGD || INPUT_TYPE::layout == LayoutMode::SBNGD) { + // BSNGD/SBNGD layout memory is not contiguous + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + return CeilAlign(GetSingleWidth(), c0Size_); + } else { + return GetSingleWidth(); + } + } else { + // NORMAL/BNGS1S2 layout memory is contiguous + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + return GetSingleSizeAlign(); + } else { + return GetSingleSize(); + } + } + } + + __aicore__ inline bool IsTranspose() + { + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB(); + } + } + + __aicore__ inline bool IsLayoutGValid() + { + auto maxLayoutInfoG = MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoG() > + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoG() ? + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoG() : + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoG(); + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() <= + (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoN() * maxLayoutInfoG); + } else { + return MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() <= + (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoN() * maxLayoutInfoG); + } + } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); +}; + +// Specialized Template Class of Batch Matmul CopyIn +// Batch Matmul NZ Format Data CopyIn From GM/UB, only support LayoutMode NORMAL +template +class BatchCopyCubeIn::IsNeedUB()) && + GetCopyCubeInType() == CopyCubeInType::BMM && + INPUT_TYPE::format == CubeFormat::NZ && + INPUT_TYPE::layout == LayoutMode::NORMAL>> +{ + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchCopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchDataCopyUtils, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeInfo); + + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + inline __aicore__ BatchCopyCubeIn() = default; + inline __aicore__ ~BatchCopyCubeIn() = default; + + __aicore__ inline void Init() + { + if constexpr (INPUT_TYPE::isTrans) { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * + GetSingleSizeAlign(), 1); + } else { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * + GetSingleSizeAlign(), 1); + } + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose = false) + { + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose); + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + __aicore__ inline void SetInput(const LocalTensor& 
localMatrix, bool isTranspose = false) + {} + + __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) + { + if (IsTranspose()) { + CopyBatchToCube( + dstTensor, outerIdx, splitIdx, splitSize); + } else { + CopyBatchToCube( + dstTensor, outerIdx, splitIdx, splitSize); + } + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor localTensor; + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); + return localTensor; + } + + __aicore__ inline void BatchDestroy() + { + MATMUL_MODULE(CubeInBuffer)->FreeTensor(); + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) + { + return MATMUL_MODULE(CubeInBuffer)->AllocTensor(iterIndex); + } + + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) + {} + + __aicore__ inline void Destroy() + { + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline void Reset() + { + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + +private: + template + __aicore__ inline void CopyBatchToCube(LocalTensor& dstTensor, const int32_t outerIdx, + const int32_t splitIdx, const int32_t splitSize) + { + // 1. Calculate batch outer loop offset + // NZ does not support tail block scenarios,src also uses constantized data + auto alignHeight = CeilAlign(GetSingleHeight(), BLOCK_CUBE); + auto alignWidth = CeilAlign(GetSingleWidth(), c0Size_); + + // 2. Calculate src and dst stride of one step + auto batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() / splitSize; + int64_t srcStride = alignWidth * alignHeight; + int64_t dstStride = GetSingleSizeAlign(); + int64_t srcOffset = batchNum * splitIdx * srcStride; + int64_t dstOffset = batchNum * splitIdx * dstStride; + + // 3. 
loop copy NZ data by batch + bool iskRowDirec = IS_KROW && IsSameTypeV; + auto batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * srcStride; + GlobalTensor srcGlobal; + srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_); + srcGlobal.SetAddr(batchOffset); + for (int i = 0; i < batchNum; ++i) { + MATMUL_MODULE(BatchDataCopyUtils)->BatchCopyNZ2NZ( + dstTensor[dstOffset], srcGlobal[srcOffset], 0, 0, + alignHeight, alignWidth, alignHeight, iskRowDirec); + dstOffset += dstStride; + srcOffset += srcStride; + } + } + + template + __aicore__ inline int64_t GetSingleSizeAlign() + { + if constexpr (IS_KROW && IsSameTypeV) { + return CeilAlign(GetSingleHeight(), c0Size_) * + CeilAlign(GetSingleWidth(), c0Size_); + } else { + return CeilAlign(GetSingleHeight(), BLOCK_CUBE) * + CeilAlign(GetSingleWidth(), c0Size_); + } + } + + template + __aicore__ inline int32_t GetSingleHeight() const + { + if constexpr (NEED_BASIC && IsBasic(MM_CFG)) { + // false: not support intraBlock, true: is basic constantized scenario + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(); + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(); + } + } + + template + __aicore__ inline int32_t GetSingleWidth() const + { + if constexpr (NEED_BASIC && IsBasic(MM_CFG)) { + // false: not support intraBlock, true: is basic constantized scenario + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(); + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(); + } + } + + __aicore__ inline bool IsTranspose() + { + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB(); + } + } +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h index 396b4bd5678027704e3394dd9ec3af5338556c22..0cc2bc69643c534c98af5b27d9f23299ee379a48 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h @@ -1,89 +1,94 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -/*! 
-* \file batch_copy_cube_in_intf.h -* \brief -*/ - -#ifndef IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ -#define IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ - -namespace matmul { -using namespace AscendC; - -template -class BatchCopyCubeIn -{ - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; -public: - __aicore__ inline BatchCopyCubeIn() = default; - __aicore__ inline ~BatchCopyCubeIn() = default; - /** - * @description: Init of BatchCopyCubeIn - * @return: void - */ - __aicore__ inline void Init() {} - - /** - * @description: Set input global address - * @param: address: Global address input through SetTensorA or SetTensorB - * @param: srcGlobalAddr: true if input tensor is transposed - * @return: void - */ - __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) {} - - __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) {} - - __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, - const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) {} - - /** - * @description: Load input data to L1 - * @param: curRow: The row index of the matrixA/B to be loaded at current iterate - * @param: curCol: The column index of the matrixA/B to be loaded at current iterate - * @param: tileHeight: The height of the matrixA/B tiles to be loaded at current iterate - * @param: tileWidth: The width of the matrixA/B tiles to be loaded at current iterate - * @return: Tensor on L1 - */ - __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) { - ASCENDC_ASSERT((false), { - KERNEL_LOG(KERNEL_ERROR, "Matching error. This is an empty implementation."); - }); - return NULL_TENSOR; - } - - /** - * @description: Release tensor on l1 at one compute end - * @param: tensor: The tensor on l1 need to be released - * @param: curRow: The row index of the matrixA/B at current iterate - * @param: curCol: The column index of the matrixA/B at current iterate - * @return: void - */ - __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, - int32_t curRow = 0, int32_t curCol = 0) {} - - /* - * @description: Reset buffer status used in copy in - * @return: void - */ - __aicore__ inline void Reset() {} - - /** - * @description: Destory tensor on l1 at iterate end - * @return: void - */ - __aicore__ inline void Destroy() {} - - __aicore__ inline void BatchDestroy() {} -}; - -} -#endif // _BATCH_COPY_CUBE_IN_INTF_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! 
+* \file batch_copy_cube_in_intf.h +* \brief +*/ + +#ifndef IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ +#define IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ + +namespace Gemm { +namespace Impl { +namespace Detail { +using namespace AscendC; + +template +class BatchCopyCubeIn +{ + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; +public: + __aicore__ inline BatchCopyCubeIn() = default; + __aicore__ inline ~BatchCopyCubeIn() = default; + /** + * @description: Init of BatchCopyCubeIn + * @return: void + */ + __aicore__ inline void Init() {} + + /** + * @description: Set input global address + * @param: address: Global address input through SetTensorA or SetTensorB + * @param: srcGlobalAddr: true if input tensor is transposed + * @return: void + */ + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose) {} + + __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) {} + + __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) {} + + /** + * @description: Load input data to L1 + * @param: curRow: The row index of the matrixA/B to be loaded at current iterate + * @param: curCol: The column index of the matrixA/B to be loaded at current iterate + * @param: tileHeight: The height of the matrixA/B tiles to be loaded at current iterate + * @param: tileWidth: The width of the matrixA/B tiles to be loaded at current iterate + * @return: Tensor on L1 + */ + template + __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth, + const ScheduleContext& context = 0) + { + ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Matching error. This is an empty implementation."); }); + return NULL_TENSOR; + } + + /** + * @description: Release tensor on l1 at one compute end + * @param: tensor: The tensor on l1 need to be released + * @param: curRow: The row index of the matrixA/B at current iterate + * @param: curCol: The column index of the matrixA/B at current iterate + * @return: void + */ + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) {} + + /* + * @description: Reset buffer status used in copy in + * @return: void + */ + __aicore__ inline void Reset() {} + + /** + * @description: Destory tensor on l1 at iterate end + * @return: void + */ + __aicore__ inline void Destroy() {} + + __aicore__ inline void BatchDestroy() {} +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _BATCH_COPY_CUBE_IN_INTF_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h new file mode 100644 index 0000000000000000000000000000000000000000..674c28baaef914607c4e7daeab35427e6bdd8da5 --- /dev/null +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h @@ -0,0 +1,89 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file batch_copy_cube_in_params.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H + +namespace Gemm { +namespace Impl { +namespace Detail { +template +class BatchCopyCubeInParams { + using SrcT = typename INPUT_TYPE::T; + using TransT = typename INPUT_TYPE::TRANS_T; + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); +public: + __aicore__ inline uint32_t GetBatchNum() + { + return MATMUL_CAST_TO_IMPL()->batchA_; + } + + template + __aicore__ inline int32_t GetBatchOrgWidth() + { + // Get Head length of BSH or SBH layout + if constexpr (INPUT_TYPE::layout == LayoutMode::BSNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoD() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoN() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoG(); + } else if constexpr (INPUT_TYPE::layout == LayoutMode::SBNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoD() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoN() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoG() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetALayoutInfoB(); + } else { + // Some operators does not set LayoutInfoS/D parameters for NORMAL/BNGS1S2 layout + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(); + } + } +}; + +template +class BatchCopyCubeInParams> { + using SrcT = typename INPUT_TYPE::T; + using TransT = typename INPUT_TYPE::TRANS_T; + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); +public: + __aicore__ inline uint32_t GetBatchNum() + { + return MATMUL_CAST_TO_IMPL()->batchB_; + } + + template + __aicore__ inline int32_t GetBatchOrgWidth() + { + // Get Head length of BSH or SBH layout + if constexpr (INPUT_TYPE::layout == LayoutMode::BSNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoD() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoN() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoG(); + } else if constexpr (INPUT_TYPE::layout == LayoutMode::SBNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoD() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoN() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoG() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBLayoutInfoB(); + } else { + // Some operators does not set LayoutInfoS/D parameters for NORMAL/BNGS1S2 layout + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(); + } + } +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h new file mode 100644 index 0000000000000000000000000000000000000000..a8322e3cc4a723cc97192c0d6663d55997d47597 --- /dev/null +++ 
b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h @@ -0,0 +1,474 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! + * \file batch_copy_cube_in_using_ub.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_USING_UB_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_USING_UB_H + +#include "batch_copy_cube_in_intf.h" +#include "batch_copy_cube_in_params.h" +#include "batch_data_copy_wrapper.h" +#include "../../../resource/cube_in_buffer/cube_in_buffer.h" +#include "../copy_cube_in_params.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +// Specialized Template Class of Batch Matmul CopyIn +// Batch Matmul ND Format Data CopyIn From GM, only support NORMAL +template +class BatchCopyCubeIn::IsNeedUB()) && + GetCopyCubeInType() == CopyCubeInType::BMM && + (INPUT_TYPE::format == CubeFormat::ND) && + (INPUT_TYPE::layout == LayoutMode::NORMAL) && + PhyPosIsGM(INPUT_TYPE::pos)>> +{ +private: + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchCopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchDataCopyUtils, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + inline __aicore__ BatchCopyCubeIn() = default; + inline __aicore__ ~BatchCopyCubeIn() = default; + + __aicore__ inline void Init() + { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * + CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), c0Size_) * + CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), BLOCK_CUBE), 1); + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose = false) + { + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose); + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose = false) + {} + + __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) + { + if (IsTranspose()) { + return CopyBatchToCube( + dstTensor, matrixStride, outerIdx, splitIdx, splitSize); + } else { + return CopyBatchToCube( + dstTensor, matrixStride, outerIdx, splitIdx, splitSize); + } + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor localTensor; + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); + return localTensor; + } + + __aicore__ inline void BatchDestroy() + { + MATMUL_MODULE(CubeInBuffer)->FreeTensor(); + 
MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) + { + return MATMUL_MODULE(CubeInBuffer)->AllocTensor(iterIndex); + } + + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) + {} + + __aicore__ inline void Destroy() + { + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline void Reset() + { + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + +private: + template + __aicore__ inline void CopyBatchToCube(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) + { + // 1. calculate src stride and dst stride by db split loop index + auto batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() / splitSize; + auto alignWidth = CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), c0Size_); + auto alignHeight = CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), BLOCK_CUBE); + auto srcStride = GetSingleSize(); + auto dstStride = alignWidth * alignHeight; + uint64_t srcOffset = batchNum * splitIdx * srcStride; + uint64_t dstOffset = batchNum * splitIdx * dstStride; + + // 2. copy batch matrix in + int64_t batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * srcStride; + GlobalTensor srcGlobal; + srcGlobal.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_); + srcGlobal.SetAddr(batchOffset + srcOffset); + if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) { + CopyND2NZThroughVec( + dstTensor[dstOffset], srcGlobal, batchNum, outerIdx, splitIdx, alignHeight, alignWidth); + } else { + if constexpr (isKRow) { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZOnTheFly( + dstTensor[dstOffset], srcGlobal, 0, 0, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), + batchNum * alignWidth, batchNum * alignWidth); + } else { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZOnTheFly( + dstTensor[dstOffset], srcGlobal, 0, 0, batchNum * alignHeight, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth()); + } + } + } + + template + __aicore__ inline enable_if_t + CopyND2NZThroughVec(const LocalTensor& dstTensor, const GlobalTensor& srcTensor, int32_t batchNum, + int32_t batchOuterIdx, int32_t splitOuterIdx, int32_t alignHeight, int32_t alignWidth) + { + auto srcStride = GetSingleSize(); + int64_t srcOffset = 0; + int64_t dstOffset = 0; + + bool ubEnough = MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight() * c0Size_ <= + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() ? true : false; + if (ubEnough) { + event_t eventIDMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); + auto gCol = isTrans ? 
MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth() : + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth(); + int colNum = Ceil(MATMUL_MODULE(CopyCubeInParams)->template GetTotalCol() * + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), c0Size_); + for (auto iterBatch = 0; iterBatch < batchNum; ++iterBatch) { + for (auto i = 0; i < colNum; ++i) { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZ( + dstTensor[dstOffset], srcTensor[srcOffset], 0, i * c0Size_, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), c0Size_, gCol); + dstOffset += alignHeight * c0Size_; + SetFlag(eventIDMte3ToMte2); + WaitFlag(eventIDMte3ToMte2); + } + srcOffset += srcStride; + } + } else { + if constexpr (isTrans) { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZOnTheFly( + dstTensor, srcTensor, 0, 0, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), + batchNum * alignWidth, batchNum * alignWidth); + } else { + int tail = MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth() % c0Size_; + if (tail == 0) { + for (int iterBatch = 0; iterBatch < batchNum; ++iterBatch) { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZOnTheFly( + dstTensor[dstOffset], srcTensor[srcOffset], 0, 0, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth()); + dstOffset += MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight() * alignWidth; + srcOffset += srcStride; + } + } else { + for (int iterBatch = 0; iterBatch < batchNum; ++iterBatch) { + int64_t innerSrcOffset = srcOffset; + for (auto i = 0; i < MATMUL_MODULE(CopyCubeInParams)->template GetTotalRow(); ++i) { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZOnTheFly( + dstTensor[dstOffset], srcTensor[innerSrcOffset], 0, 0, + GetBaseUseHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth()); + dstOffset += GetBaseUseHeight() * alignWidth; + innerSrcOffset += GetBaseUseHeight() * + MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(); + } + srcOffset += srcStride; + } + } + } + } + } + + template + __aicore__ inline enable_if_t + CopyND2NZThroughVec(const LocalTensor& dstTensor, const GlobalTensor& srcTensor, int32_t batchNum, + int32_t batchOuterIdx, int32_t splitOuterIdx, int32_t alignHeight, int32_t alignWidth) + { + auto srcStride = GetSingleSize(); + int64_t srcOffset = 0; + int64_t dstOffset = 0; + + bool ubEnough = MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight() * c0Size_ <= + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() ? 
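+            // ubEnough: one c0Size_-wide column strip of the tile (singleHeight * c0Size_
+            // elements) has to fit into the UB workspace reported by GetTransLength(); when it
+            // does not, the on-the-fly ND2NZ path in the else branch is taken instead.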
true : false; + if (ubEnough) { + + auto colNum = Ceil(MATMUL_MODULE(CopyCubeInParams)->template GetTotalCol() * + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), c0Size_); + event_t eventIDMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); + for (auto iterBatch = 0; iterBatch < batchNum; ++iterBatch) { + for (auto i = 0; i < colNum; ++i) { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZ( + dstTensor[dstOffset], srcTensor[srcOffset], 0, i * c0Size_, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), c0Size_, + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth()); + dstOffset += alignHeight * c0Size_; + SetFlag(eventIDMte3ToMte2); + WaitFlag(eventIDMte3ToMte2); + } + if constexpr (isTrans) { + auto tail = MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth() % c0Size_; + if (tail != 0) { + MATMUL_MODULE(BatchDataCopyUtils)->BatchCopyND2NZ( + dstTensor[dstOffset], srcTensor[srcOffset], 0, colNum * c0Size_, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), tail, + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth()); + dstOffset += alignHeight * c0Size_; + SetFlag(eventIDMte3ToMte2); + WaitFlag(eventIDMte3ToMte2); + } + } + srcOffset += srcStride; + } + } else { + if constexpr (isTrans) { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZOnTheFly( + dstTensor, srcTensor, 0, 0, batchNum * alignHeight, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth()); + } else { + MATMUL_MODULE(BatchDataCopyUtils)->CopyND2NZOnTheFly( + dstTensor, srcTensor, 0, 0, + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), + batchNum * alignWidth, batchNum * alignWidth); + } + } + } + + __aicore__ inline bool IsTranspose() + { + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB(); + } + } + + __aicore__ inline int32_t GetSingleSize() const + { + // not support constantization + return MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth() * + MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(); + } + + template + __aicore__ constexpr enable_if_t GetBaseUseHeight() const + { + if constexpr (isTrans) { + return MATMUL_CONST_PARAM_VAR.baseUseK_; + } else { + return MATMUL_CONST_PARAM_VAR.baseUseM_; + } + } + + template + __aicore__ constexpr enable_if_t GetBaseUseHeight() const + { + if constexpr (IS_TRANS) { + return MATMUL_CONST_PARAM_VAR.baseUseN_; + } else { + return MATMUL_CONST_PARAM_VAR.baseUseK_; + } + } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); +}; + +// Specialized Template Class of Batch Matmul CopyIn +// Batch Matmul NZ Format Data CopyIn From GM/UB, only support NORMAL +template +class BatchCopyCubeIn::IsNeedUB()) && + GetCopyCubeInType() == CopyCubeInType::BMM && + (INPUT_TYPE::format == CubeFormat::NZ) && + (INPUT_TYPE::layout == LayoutMode::NORMAL) && + (PhyPosIsUB(INPUT_TYPE::pos) || PhyPosIsGM(INPUT_TYPE::pos))>> +{ + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchCopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(BatchDataCopyUtils, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeInfo); + + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + inline __aicore__ BatchCopyCubeIn() = default; + inline 
__aicore__ ~BatchCopyCubeIn() = default; + + __aicore__ inline void Init() + { + if constexpr (INPUT_TYPE::isTrans) { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * + GetSingleSizeAlign(), 1); + } else { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * + GetSingleSizeAlign(), 1); + } + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose = false) + { + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose); + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose = false) + {} + + __aicore__ inline void BatchLoad(LocalTensor& dstTensor, const uint32_t matrixStride, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) + { + if (IsTranspose()) { + CopyBatchToCube( + dstTensor, outerIdx, splitIdx, splitSize); + } else { + CopyBatchToCube( + dstTensor, outerIdx, splitIdx, splitSize); + } + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor localTensor; + localTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); + return localTensor; + } + + __aicore__ inline void BatchDestroy() + { + MATMUL_MODULE(CubeInBuffer)->FreeTensor(); + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline LocalTensor AllocTensor(int32_t iterIndex = 0) + { + return MATMUL_MODULE(CubeInBuffer)->AllocTensor(iterIndex); + } + + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) + {} + + __aicore__ inline void Destroy() + { + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + + __aicore__ inline void Reset() + { + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + +private: + template + __aicore__ inline void CopyBatchToCube(LocalTensor& dstTensor, + const int32_t outerIdx, const int32_t splitIdx, const int32_t splitSize) + { + // 1. Calculate batch outer loop offset + auto alignHeight = CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), BLOCK_CUBE); + auto alignWidth = CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), c0Size_); + auto batchNum = MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() / splitSize; + bool iskRowDirec = isKRow && IsSameTypeV; + + // 2. Calculate src and dst stride of one step + auto srcStride = alignWidth * alignHeight; + auto dstStride = GetSingleSizeAlign(); + int64_t srcOffset = batchNum * splitIdx * srcStride; + int64_t dstOffset = batchNum * splitIdx * dstStride; + + // 3. set input srctensor addr + auto batchOffset = outerIdx * MATMUL_MODULE(BatchCopyCubeInParams)->GetBatchNum() * srcStride; + using TensorType = + typename AscendC::Conditional, LocalTensor>::type; + TensorType srcTensor; + if constexpr (PhyPosIsGM(INPUT_TYPE::pos)) { + srcTensor.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor().address_); + srcTensor.SetAddr(batchOffset); + } else { + srcTensor.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); + srcTensor = srcTensor[batchOffset]; + } + + // 4. 
loop copy NZ data by batch + for (auto i = 0; i < batchNum; ++i) { + MATMUL_MODULE(BatchDataCopyUtils)->BatchCopyNZ2NZ( + dstTensor[dstOffset], srcTensor[srcOffset], 0, 0, + alignHeight, alignWidth, alignHeight, iskRowDirec); + dstOffset += dstStride; + srcOffset += srcStride; + } + } + + template + __aicore__ inline int32_t GetSingleSizeAlign() + { + if constexpr (isKRow && IsSameTypeV) { + return CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), c0Size_) * + CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), c0Size_); + } else { + return CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleHeight(), BLOCK_CUBE) * + CeilAlign(MATMUL_MODULE(CopyCubeInParams)->template GetSingleWidth(), c0Size_); + } + } + + __aicore__ inline bool IsTranspose() + { + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB(); + } + } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_USING_UB_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..0515326ac069f748523cfcb0ed71493ee203fa08 --- /dev/null +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h @@ -0,0 +1,821 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +/*! 
+ * \file batch_data_copy_wrapper.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_BATCH_DATA_COPY_WRAPPER_H +#define IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_BATCH_DATA_COPY_WRAPPER_H + +#include "../../../matmul_module.h" +#include "../../../matmul_param.h" +#include "../copy_cube_in_utils.h" +#include "../copy_cube_in_params.h" + +namespace Gemm { +namespace Impl { +namespace Detail { + +using namespace AscendC; + +template +class BatchDataCopyWrapper +{ + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulUserDefineInfo); + MATMUL_USE_MODULE(LocalWorkspace); + + template + __aicore__ constexpr enable_if_t GetStaticTileHeight() const + { + if constexpr ((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreM(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(); + } + } + + template + __aicore__ constexpr enable_if_t GetStaticTileWidth() const + { + if constexpr ((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(); + } + } + + template + __aicore__ inline enable_if_t GetStaticTileHeight() const + { + if constexpr ((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(); + } + } + + template + __aicore__ inline enable_if_t GetStaticTileWidth() const + { + if constexpr 
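+            // Same three-way selection as the overloads above: batch layouts (other than
+            // BatchMode::SINGLE_LARGE_THAN_L1) use the single-core shape, the MDL variants use
+            // step * base, and everything else uses the base block from CopyCubeInParams.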
((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(); + } + } + +public: + __aicore__ inline BatchDataCopyWrapper() = default; + __aicore__ inline ~BatchDataCopyWrapper() = default; + + __aicore__ inline void BatchCopyND2NZ(const LocalTensor& dst, const GlobalTensor& src, const int row, + const int col, const int height, const int width, const int gCol, + const int ndNum = 1, const int srcNdMatrixStride = 0, + const int dstNzMatrixStride = 0, const bool kAlignToC0Size = false) + { +#ifdef ASCENDC_CPU_DEBUG + if (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); + } else if (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); +#else + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); + } else if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); +#endif + } else { +#if __CCE_AICORE__ >= 220 + CopyND2NZ(dst, src, row, col, height, width, gCol, ndNum, srcNdMatrixStride, dstNzMatrixStride, + kAlignToC0Size); +#endif + } + } + + __aicore__ inline void BatchCopyNZ2NZ(const LocalTensor& dst, const LocalTensor& src, int row, + int col, int height, int width, int gRow, bool kAlignToC0Size = false) + { + CopyNZ2NZ(dst, src, row, col, height, width, gRow); + } + + __aicore__ inline void BatchCopyNZ2NZ(const LocalTensor& dst, const GlobalTensor& src, + const int row, const int col, const int height, const int width, + const int gRow, const bool kAlignToC0Size = false) + { +#ifdef ASCENDC_CPU_DEBUG + if (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, 
col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); + } else if (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); +#else + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); + } else if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { + LocalTensor a1Tmp = dst.template ReinterpretCast(); + (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, + MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); +#endif + } else { + CopyNZ2NZ(dst, src, row, col, height, width, gRow, kAlignToC0Size); + } + } + +#if __CCE_AICORE__ < 220 + __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor& dst, const GlobalTensor& src, int row, + int col, int height, int width, int gCol) + { + ASSERT(gCol >= width && "Copy ND block gm->ub width larger than origin matrix width."); + int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero + int dstOffset = 0; + int64_t srcOffset = (static_cast(row) * static_cast(gCol) + static_cast(col)); + int calcWidthExr = Ceil(width, c0Size_); + int calcHeightExr = Ceil(height, BLOCK_CUBE); + +#if __CCE_AICORE__ == 200 + // set2d, pad tail zero + if (height % BLOCK_CUBE != 0) { + int64_t repeat = calcWidthExr * calcHeightExr; + if constexpr (IsSameType::value) { + LocalTensor tmp = dst.template ReinterpretCast(); + InitConstValueParams initConstValueParams; + initConstValueParams.repeatTimes = static_cast(repeat); + initConstValueParams.initValue = 0; + InitConstValue(tmp, initConstValueParams); + } else { + InitConstValueParams initConstValueParams; + initConstValueParams.repeatTimes = static_cast(repeat); + initConstValueParams.initValue = 0; + InitConstValue(dst, initConstValueParams); + } + PipeBarrier(); + } +#endif + + // gCol unaligned, can not use dma copy repeat stride + int tail = width % c0Size_; + if (tail) { + // tail elements that need to be pad zero + int blockLen = calcWidthExr * (c0Size_ * sizeof(SrcT) / DEFAULT_C0_SIZE); + + // gm->l1 + int src_gap = gCol * sizeof(SrcT) / ONE_BLK_SIZE - 1; + if (gCol % c0Size_ || src_gap >= UINT16_MAX) { + // each block len is only 32B + for (auto i = 0; i < calcWidth; i++) { + for (auto j = 0; j < height; j++) { + DataCopy(dst[dstOffset + i * calcHeightExr * BLOCK_CUBE * c0Size_ + j * c0Size_], + src[srcOffset + j * gCol + i * c0Size_], { 1, 1, 0, 0 }); + } + } + } else { + // data copy stride is aligned + for (auto i = 0; i < calcWidth; i++) { + DataCopy(dst[dstOffset], src[srcOffset], + { static_cast(height), 1, static_cast(src_gap), 0 }); + dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; + srcOffset += c0Size_; + } + } + + // tail gm->ub pad zero, and then ub->l1 + int32_t tileHeight; + if (IsTranspose()) { + tileHeight = GetStaticTileHeight(); + } else { + 
tileHeight = GetStaticTileHeight(); + } + auto size = tileHeight * ONE_BLK_SIZE / sizeof(SrcT); + + LocalTensor trans; + trans = MATMUL_MODULE(LocalWorkspace)->GetND2NZWorkspace(0).template ReinterpretCast(); + trans.SetSize(size); + + int64_t tailSrcoffset = (int64_t)row * (int64_t)gCol + (int64_t)col + (int64_t)calcWidth * (int64_t)c0Size_; + + // gm->ub + for (auto i = 0; i < height; ++i) { + DataCopy(trans[i * c0Size_], src[tailSrcoffset], { 1, 1, 0, 0 }); + tailSrcoffset += gCol; + } + + event_t eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); + SetFlag(eventIDMte2ToV); + WaitFlag(eventIDMte2ToV); + + // tail pad zero + uint64_t mask[2]; + if constexpr (IsSameType::value) { + tail = Ceil(tail, 2); + } + uint16_t mask_tail = ~((1 << tail) - 1); + uint64_t masktail = mask_tail; + mask[0] = masktail + (masktail << 16) + (masktail << 32) + (masktail << 48); + mask[1] = mask[0]; + if (masktail != 0) { + if constexpr (IsSameType::value) { + LocalTensor tmpTrans = trans.template ReinterpretCast(); + Duplicate(tmpTrans, static_cast(0), mask, Ceil(height, 8), 1, 8); + } else { + Duplicate(trans, static_cast(0), mask, Ceil(height, 8), 1, 8); + } + } + + event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMte3); + WaitFlag(eventIDVToMte3); + + // ub->l1 + int heightAlignBlock = Ceil(height, BLOCK_CUBE); + int tailDstOffset = heightAlignBlock * BLOCK_CUBE * c0Size_ * calcWidth; + DataCopy(dst[tailDstOffset], trans, { static_cast(height), 1, 0, 0 }); + } else { + int src_gap = gCol * sizeof(SrcT) / ONE_BLK_SIZE - 1; + if (gCol % c0Size_ != 0 || src_gap >= UINT16_MAX) { + int64_t oriSrcOffset = srcOffset; + int oriDstOffset = dstOffset; + // each block len is only 32B + for (int i = 0; i < calcWidth; i++) { + for (int j = 0; j < height; j++) { + DataCopy(dst[dstOffset], src[srcOffset], { 1, 1, 0, 0 }); + dstOffset += c0Size_; + srcOffset += gCol; + } + srcOffset = oriSrcOffset + (i + 1) * c0Size_; + dstOffset = oriDstOffset + (i + 1) * calcHeightExr * BLOCK_CUBE * c0Size_; + } + } else { + // data copy stride is aligned + if constexpr (INPUT_TYPE::layout == LayoutMode::NORMAL) { + int32_t loop = height / MAX_BLOCK_COUNT_SIZE; + int32_t loopTail = height % MAX_BLOCK_COUNT_SIZE; + for (int i = 0; i < calcWidth; i++) { + int32_t dstOffsetTmp = dstOffset; + int32_t srcOffsetTmp = srcOffset; + for (int i = 0; i < loop; ++i) { + DataCopy( + dst[dstOffsetTmp], src[srcOffsetTmp], + { static_cast(MAX_BLOCK_COUNT_SIZE), 1, static_cast(src_gap), 0 }); + dstOffsetTmp += MAX_BLOCK_COUNT_SIZE * c0Size_; + srcOffsetTmp += MAX_BLOCK_COUNT_SIZE * gCol; + } + if (loopTail) { + DataCopy(dst[dstOffsetTmp], src[srcOffsetTmp], + { static_cast(loopTail), 1, static_cast(src_gap), 0 }); + } + dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; + srcOffset += c0Size_; + } + } else { + for (int i = 0; i < calcWidth; i++) { + DataCopy(dst[dstOffset], src[srcOffset], + { static_cast(height), 1, static_cast(src_gap), 0 }); + dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; + srcOffset += c0Size_; + } + } + } + event_t eventIDMte2ToMte1 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1)); + SetFlag(eventIDMte2ToMte1); + WaitFlag(eventIDMte2ToMte1); + event_t eventIDMte1ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE2)); + SetFlag(eventIDMte1ToMte2); + WaitFlag(eventIDMte1ToMte2); + } + } + + __aicore__ inline void CopyND2NZ(const LocalTensor& dst, const GlobalTensor& src, int row, int col, + int height, int 
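+        // (width: valid columns of this ND tile; gCol: full row length of the source matrix in GM)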
width, int gCol, int ndNum = 1, bool kAlignToC0Size = false) + { + LocalTensor transTensor; + transTensor = MATMUL_MODULE(LocalWorkspace)->GetWorkspaceWithOffset(0).template ReinterpretCast(); + transTensor.SetSize(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()); + LocalTensor trans; + trans = MATMUL_MODULE(LocalWorkspace)->GetWorkspaceWithOffset( + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()).template ReinterpretCast(); + trans.SetSize(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()); + + auto srcOffset = ((int64_t)row * (int64_t)gCol + (int64_t)col); + + bool isBankConflict = Ceil(width, c0Size_) * 32 % 512 == 0 && Ceil(width, c0Size_) < 32 ? true : false; + + int calcHigh = Ceil(height, BLOCK_CUBE); + auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE2); + SetFlag(enQueEvtID); + WaitFlag(enQueEvtID); + int calcWidth = CopyNDBlock(transTensor, src, srcOffset, height, width, gCol, isBankConflict); + int padWidth = isBankConflict ? calcWidth + 1 : calcWidth; + int size = calcHigh * padWidth * BLOCK_CUBE * c0Size_ / AuxGetFactor(); + ; + + transTensor.SetSize(size); + trans.SetSize(size); + const_cast&>(dst).SetSize(size); + + NDPadZeros(transTensor, height, padWidth, gCol, width, isBankConflict); + NDTrans2NZ(trans, transTensor, calcHigh, calcWidth, isBankConflict); + + event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMte3); + WaitFlag(eventIDVToMte3); + DataCopy(dst, trans, size); + enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V); + SetFlag(enQueEvtID); + WaitFlag(enQueEvtID); + }; +#endif + +private: +#if __CCE_AICORE__ < 220 + int32_t orgHeight_; // or M + int32_t orgWidth_; // or K + int32_t baseHeight_; // or baseK + int32_t baseWidth_; // or baseM + int32_t stepCol_; +#endif + constexpr static int32_t c0Size_ = AuxGetC0Size(); + + __aicore__ inline bool IsTranspose() + { + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeA(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->IsTransposeB(); + } + } + + __aicore__ inline void CopyNZ2NZ(const LocalTensor& dst, const GlobalTensor& src, const int32_t row, + const int32_t col, const int32_t height, const int32_t width, const int32_t gRow, + const bool kAlignToC0Size = false) + { + ASCENDC_ASSERT((gRow >= height), { + KERNEL_LOG( + KERNEL_ERROR, + "NZ2NZ height larger than origin matrix height, gRow is %d, which should be no less than height %d.", + gRow, height); + }); + int32_t alignedGRow = Ceil(gRow, BLOCK_CUBE) * BLOCK_CUBE; + int64_t srcOffset = (int64_t)row * (int64_t)c0Size_ + (int64_t)col * (int64_t)alignedGRow; + // height direction need to be 16 aligned + auto alignHeight = Ceil(height, BLOCK_CUBE) * BLOCK_CUBE; + int32_t blockLen = alignHeight * c0Size_ * sizeof(TransT) / ONE_BLK_SIZE; + int32_t srcStride = (alignedGRow - alignHeight) * (c0Size_ * sizeof(TransT) / ONE_BLK_SIZE); + if constexpr (IsSameTypeV) { + blockLen /= INT4_TWO; + srcStride /= INT4_TWO; + } + if (srcStride >= UINT16_MAX) { + for (int32_t i = 0; i < Ceil(width, c0Size_); ++i) { + DataCopy(dst[i * alignHeight * c0Size_], src[srcOffset + i * gRow * c0Size_], + { 1, static_cast(blockLen), 0, 0 }); + } + } else { + uint16_t nburst = Ceil(width, c0Size_); + int32_t dstStride = 0; + if constexpr (IsSameTypeV) { + if (kAlignToC0Size) { + auto alignHeightC0Size = Ceil(height, c0Size_) * c0Size_; + dstStride = alignHeightC0Size - alignHeight; + } + } + DataCopy(dst, src[srcOffset], + 
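+                // one transfer moves nburst NZ column fractals: each burst is alignHeight rows
+                // of a fractal, and srcStride skips the (alignedGRow - alignHeight) rows that
+                // belong to the rest of the source column.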
{ nburst, static_cast(blockLen), static_cast(srcStride), + static_cast(dstStride) }); + } + }; + + __aicore__ inline void CopyNZ2NZ(const LocalTensor& dst, const LocalTensor& src, const int32_t row, + const int32_t col, const int32_t height, const int32_t width, const int32_t gRow) + { + ASCENDC_ASSERT((gRow >= height), { + KERNEL_LOG(KERNEL_ERROR, "gRow is %d, which should be no less than height %d.", gRow, height); + }); + int32_t srcOffset = row * c0Size_ + col * gRow; + // height direction need to be 16 aligned + auto alignHeight = (height + 15) / 16 * 16; + int32_t blockLen = alignHeight * c0Size_ * sizeof(TransT) / ONE_BLK_SIZE; + int32_t srcStride = (gRow - alignHeight) * (c0Size_ * sizeof(TransT) / ONE_BLK_SIZE); + + if (srcStride >= UINT16_MAX) { + for (int32_t i = 0; i < width / c0Size_; ++i) { + DataCopy(dst[i * alignHeight * c0Size_], src[srcOffset + i * gRow * c0Size_], + { 1, static_cast(blockLen), 0, 0 }); + } + } else { + DataCopy(dst, src[srcOffset], + { static_cast(width / c0Size_), static_cast(blockLen), + static_cast(srcStride), 0 }); + } + }; + +#if __CCE_AICORE__ < 220 + template + __aicore__ inline int CopyNDBlock(const LocalTensor& transTensor, const GlobalTensor& src, int64_t srcOffset, + int height, int width, int gCol, bool isBankConflict) + { + ASCENDC_ASSERT((gCol >= width), + { KERNEL_LOG(KERNEL_ERROR, "gCol is %d, which should be no less than %d.", gCol, width); }); + int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero + int c0Size = B16_C0SIZE; + if constexpr (sizeof(T) == sizeof(float)) { + c0Size = B32_C0SIZE; + } else if (sizeof(T) == sizeof(int8_t)) { + c0Size = B8_C0SIZE; + } + + // gCol unaligned + if (gCol % c0Size) { + calcWidth = Ceil(CeilAlign(width, c0Size), c0Size_); + int blockLen = CeilAlign(width, c0Size) * sizeof(T) / DEFAULT_C0_SIZE; + int dstOffset = 0; + int BankConflictPadSize = isBankConflict ? (32 / sizeof(T)) : 0; + + // data copy stride is unaligned, need to copy line by line + for (int i = 0; i < height; i++) { + DataCopy(transTensor[dstOffset], src[srcOffset], { 1, static_cast(blockLen), 0, 0 }); + dstOffset += (CeilAlign(width, c0Size) + BankConflictPadSize); + srcOffset += gCol; + } + + auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V); + SetFlag((event_t)enQueEvtID); + WaitFlag((event_t)enQueEvtID); + } else { + int srcStride = (gCol - width) * sizeof(T) / ONE_BLK_SIZE; + int blocklen = Ceil(width * sizeof(T), ONE_BLK_SIZE); + calcWidth = Ceil(CeilAlign(width, c0Size), c0Size_); + if (srcStride >= UINT16_MAX) { + int dstOffset = isBankConflict ? (width + c0Size) : width; + for (int i = 0; i < height; ++i) { + DataCopy(transTensor[i * dstOffset], src[srcOffset], { 1, static_cast(blocklen), 0, 0 }); + srcOffset += gCol; + } + } else { + uint16_t dstStride = isBankConflict ? 1 : 0; + int loopNum = Ceil(static_cast(height), MAX_BLOCK_COUNT_SIZE); + int tailCount = static_cast(height) % MAX_BLOCK_COUNT_SIZE; + for (int i = 0; i < loopNum; ++i) { + uint16_t blockCount = (i == loopNum - 1) ? 
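+                    // rows are copied in chunks of at most MAX_BLOCK_COUNT_SIZE bursts, with the
+                    // remainder on the last pass (note that tailCount is 0 here when height is an
+                    // exact multiple of MAX_BLOCK_COUNT_SIZE, unlike the separate loopTail
+                    // handling used in CopyND2NZOnTheFly).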
tailCount : MAX_BLOCK_COUNT_SIZE; + DataCopy( + transTensor[i * MAX_BLOCK_COUNT_SIZE * blocklen * ONE_BLK_SIZE / sizeof(T)], + src[srcOffset + i * MAX_BLOCK_COUNT_SIZE * blocklen * ONE_BLK_SIZE / sizeof(T)], + { blockCount, static_cast(blocklen), static_cast(srcStride), dstStride }); + } + } + auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V); + SetFlag((event_t)enQueEvtID); + WaitFlag((event_t)enQueEvtID); + } + return calcWidth; + } + + template + __aicore__ inline void NDPadZeros(LocalTensor& dst, int height, int calcWidth, int gCol, int width, + bool isBankConflict) + { + if (gCol % BLOCK_CUBE) { + int tail = width % c0Size_; + // tail pad zero + if (tail) { + auto offset = width / c0Size_ * c0Size_; + uint64_t mask[2]; + if constexpr (IsSameType::value) { + tail = Ceil(tail, 2); + offset /= 2; + } + uint16_t mask_tail = ~((1 << tail) - 1); + uint64_t masktail = mask_tail; + mask[0] = masktail + (masktail << 16) + (masktail << 32) + (masktail << 48); + mask[1] = mask[0]; + int stride = calcWidth * (c0Size_ * sizeof(T) / DEFAULT_C0_SIZE); + int32_t totalRep = Ceil(height, 8); + if (masktail != 0) { + if constexpr (IsSameType::value) { + LocalTensor tmpTransTensor = dst.template ReinterpretCast(); + if (stride < 32) { + if (totalRep <= MAX_REPEAT_TIMES) { + Duplicate(tmpTransTensor[offset], (int16_t)0, mask, Ceil(height, 8), stride, + 8 * stride); + } else { + int32_t highBlock = totalRep / MAX_REPEAT_TIMES; + int32_t highTail = totalRep % MAX_REPEAT_TIMES; + int64_t dstOffset = calcWidth * BLOCK_CUBE * 8 * MAX_REPEAT_TIMES; + for (int32_t idx = 0; idx < highBlock; ++idx) { + Duplicate(tmpTransTensor[offset], (int16_t)0, mask, MAX_REPEAT_TIMES, stride, + 8 * stride); + offset += dstOffset; + } + if (highTail) { + Duplicate(tmpTransTensor[offset], (int16_t)0, mask, highTail, stride, 8 * stride); + } + } + } else { + for (int32_t i = 0; i < totalRep; ++i) { + Duplicate(tmpTransTensor[offset], (int16_t)0, mask, 1, stride, 0); + offset += stride * BLOCK_CUBE; + } + } + } else { + Duplicate(dst[offset], (T)0, mask, totalRep, stride, 8 * stride); + } + PipeBarrier(); + } + } + } + // If the value of high is not an integer multiple of 16, add 0. + int tailHigh = height % BLOCK_CUBE; + if (tailHigh) { + auto dstOffset = height * calcWidth * BLOCK_CUBE; + if constexpr (IsSameType::value) { + LocalTensor tmpDst = dst.template ReinterpretCast(); + Duplicate(tmpDst[dstOffset], (int16_t)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE); + } else { + Duplicate(dst[dstOffset], (T)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE); + } + } + } + + __aicore__ inline void NDTrans2NZ(LocalTensor& dst, LocalTensor& src, int calcHigh, int calcWidth, + bool isBankConflict) + { + // Use Muls, convert to NZ format + if constexpr (IsSameType::value) { + struct UnaryRepeatParams intriParams; + uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) }; + int blkStride = isBankConflict ? 
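+            // one extra block per row when CopyNDBlock padded the UB staging buffer to avoid
+            // bank conflicts; the strided Muls(..., 1, ...) calls below then gather the ND rows
+            // into NZ-ordered fractals while copying.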
calcWidth + 1 : calcWidth; + intriParams.dstBlkStride = (c0Size_ * sizeof(SrcT) / DEFAULT_C0_SIZE); + intriParams.srcBlkStride = blkStride * (c0Size_ * sizeof(SrcT) / DEFAULT_C0_SIZE); + intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM; + intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM; + int dstOffset = 0; + int srcOffset = 0; + // ensure rep stride be less than 256 + constexpr int maxSrcBlkStride = 32; + LocalTensor tmpSrc = src.template ReinterpretCast(); + LocalTensor tmpDst = dst.template ReinterpretCast(); + if (intriParams.srcBlkStride >= maxSrcBlkStride) { + intriParams.dstBlkStride = 1; + intriParams.srcBlkStride = 1; + mask[0] = (1 << BLOCK_CUBE) - 1; + mask[1] = 0; + SetVectorMask(mask[1], mask[0]); + for (int i = 0; i < calcWidth; i++) { + for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) { + dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE; + srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE; + Muls(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, 1, intriParams); + } + } + } else { + SetVectorMask(mask[1], mask[0]); + int32_t totalRepTimes = 2 * calcHigh; + int32_t highBlock = totalRepTimes / MAX_REPEAT_TIMES; + int32_t highTail = totalRepTimes % MAX_REPEAT_TIMES; + for (int i = 0; i < calcWidth; i++) { + dstOffset = i * calcHigh * CUBE_MAX_SIZE; + srcOffset = i * BLOCK_CUBE; + for (int32_t idx = 0; idx < highBlock; ++idx) { + Muls(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, MAX_REPEAT_TIMES, + intriParams); + dstOffset += BLOCK_CUBE * MAX_REPEAT_TIMES * 8; + srcOffset += calcWidth * BLOCK_CUBE * MAX_REPEAT_TIMES * 8; + } + if (highTail) { + Muls(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, highTail, + intriParams); + } + } + } + } else { + const int c0Count = AscendCUtils::GetC0Count(sizeof(SrcT)); + struct UnaryRepeatParams intriParams; + uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) }; + int32_t padBlock = 1; + if constexpr (IsSameTypeV && IsSameTypeV) { + padBlock = 2; + } + int blkStride = isBankConflict ? 
calcWidth + padBlock : calcWidth; + intriParams.dstBlkStride = (BLOCK_CUBE * sizeof(SrcT) / DEFAULT_C0_SIZE); + intriParams.srcBlkStride = blkStride * BLOCK_CUBE * sizeof(SrcT) / DEFAULT_C0_SIZE; + intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM; + intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM; + int dstOffset = 0; + int srcOffset = 0; + // ensure rep stride be less than 256 + constexpr int maxSrcBlkStride = 32; + if (intriParams.srcBlkStride >= maxSrcBlkStride) { + intriParams.dstBlkStride = 1; + intriParams.srcBlkStride = 1; + mask[0] = (1 << BLOCK_CUBE) - 1; + mask[1] = 0; + SetVectorMask(mask[1], mask[0]); + for (int i = 0; i < calcWidth; i++) { + for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) { + dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE; + srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE; + Muls(dst[dstOffset], src[srcOffset], (SrcT)1, mask, 1, intriParams); + if constexpr (sizeof(SrcT) == sizeof(float)) { + Muls(dst[dstOffset + c0Count], src[srcOffset + c0Count], (SrcT)1, mask, 1, + intriParams); + } + } + } + } else { + SetVectorMask(mask[1], mask[0]); + for (int i = 0; i < calcWidth; i++) { + dstOffset = i * calcHigh * CUBE_MAX_SIZE; + srcOffset = i * BLOCK_CUBE; + Muls(dst[dstOffset], src[srcOffset], (SrcT)1, mask, 2 * calcHigh, intriParams); + if constexpr (sizeof(SrcT) == sizeof(float)) { + Muls(dst[dstOffset + c0Count], src[srcOffset + c0Count], (SrcT)1, mask, + 2 * calcHigh, intriParams); + } + } + } + } + } + +#endif + +#if __CCE_AICORE__ >= 220 + template + __aicore__ inline void StaticPadNd2Nz(const LocalTensor& dst, const int32_t staticHeight, + const int32_t staticWidth, const int32_t tileHeight, const int32_t tileWidth) + { + if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) { + int32_t tileWidthC0 = Ceil(tileWidth, c0Size_); + int32_t staticWidthC0 = Ceil(staticWidth, c0Size_); + // pad left bottom area of src. 
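+            // In the NZ layout each c0Size_-wide column fractal stores staticHeight rows
+            // contiguously, so the first InitConstValue below zeroes rows [tileHeight,
+            // staticHeight) in each of the tileWidthC0 fractals that hold real data, and the
+            // second one zero-fills the remaining (staticWidthC0 - tileWidthC0) fractals
+            // wholesale, keeping the L1 block at its full static footprint. For example (half,
+            // c0Size_ == 16): staticHeight 64, tileHeight 60, staticWidth 128, tileWidth 100
+            // gives staticWidthC0 = 8 and tileWidthC0 = 7, so rows 60..63 of fractals 0..6 are
+            // zeroed first, then fractal 7 (64 rows x 16 lanes) entirely.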
+ if (tileHeight < staticHeight) { + InitConstValueParams initConstValueParams; + initConstValueParams.repeatTimes = tileWidthC0; + initConstValueParams.blockNum = staticHeight - tileHeight; + initConstValueParams.dstGap = tileHeight; + initConstValueParams.initValue = 0; + InitConstValue(dst[tileHeight * c0Size_], initConstValueParams); + } + // pad right area of src + if (tileWidthC0 < staticWidthC0) { + InitConstValueParams initConstValueParams; + initConstValueParams.repeatTimes = 1; + initConstValueParams.blockNum = (staticWidthC0 - tileWidthC0) * staticHeight; + initConstValueParams.dstGap = 0; + initConstValueParams.initValue = 0; + InitConstValue(dst[tileWidthC0 * staticHeight * c0Size_], initConstValueParams); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + using params = InitConstValueParams; + InitConstValue(dst, + params{ 1, static_cast(staticHeight * staticWidth * sizeof(DataType) / ONE_BLK_SIZE), 0, 0 }); + } + } + + __aicore__ inline void CopyND2NZ(const LocalTensor& dst, const GlobalTensor& src, const int32_t row, + const int32_t col, const int32_t height, const int32_t width, const int32_t gCol, + const int32_t ndNum = 1, const int32_t srcNdMatrixStride = 0, + const int32_t dstNzMatrixStride = 0, const bool kAlignToC0Size = false) + { + ASCENDC_ASSERT((row >= 0), { KERNEL_LOG(KERNEL_ERROR, "row is %d, which should be no less than 0.", row); }); + ASCENDC_ASSERT((col >= 0), { KERNEL_LOG(KERNEL_ERROR, "col is %d, which should be no less than 0.", col); }); + ASCENDC_ASSERT((height > 0), + { KERNEL_LOG(KERNEL_ERROR, "height is %d, which should be no less than 0.", height); }); + ASCENDC_ASSERT((width > 0), + { KERNEL_LOG(KERNEL_ERROR, "width is %d, which should be no less than 0.", width); }); + ASCENDC_ASSERT((gCol >= width), { + KERNEL_LOG( + KERNEL_ERROR, + "ND2NZ width larger than origin matrix width, gCol is %d, which should be no less than width %d.", gCol, + width); + }); + int32_t dstNzC0Stride = 0; + if constexpr (IsStaticPaddingEnable(MM_CFG)) { + int32_t tileHeight = GetStaticTileHeight(); + int32_t tileWidth = GetStaticTileWidth(); + if (tileHeight != height || tileWidth != width) { + StaticPadNd2Nz(dst, tileHeight, tileWidth, height, width); + dstNzC0Stride = tileHeight; + } + } + int64_t srcOffset; + if constexpr (IsSameTypeV) { + srcOffset = ((int64_t)row * (int64_t)gCol * INT4_TWO + (int64_t)col); + } else { + srcOffset = ((int64_t)row * (int64_t)gCol + (int64_t)col); + } + Nd2NzParams nd2nzParams; + nd2nzParams.ndNum = ndNum; + nd2nzParams.nValue = height; + nd2nzParams.dValue = width; + nd2nzParams.srcNdMatrixStride = srcNdMatrixStride; + nd2nzParams.srcDValue = gCol; + + if (dstNzC0Stride) { + nd2nzParams.dstNzC0Stride = dstNzC0Stride; + } else { + // when k is row(height) axis, int8 type gm->l1 nd2nz should be aligned to 32(c0Size) + // while float/half type should be aligned to 16 + if (kAlignToC0Size) { + nd2nzParams.dstNzC0Stride = Ceil(height, c0Size_) * c0Size_; + } else { + nd2nzParams.dstNzC0Stride = Ceil(height, BLOCK_CUBE) * BLOCK_CUBE; + } + } + nd2nzParams.dstNzNStride = 1; + nd2nzParams.dstNzMatrixStride = dstNzMatrixStride; +#if __CCE_AICORE__ == 220 + if constexpr (!ToMatmulConfig(MM_CFG).intrinsicsCheck) { + DataCopy(dst, src[srcOffset], nd2nzParams); + } else { + if (gCol >= UINT16_MAX) { + nd2nzParams.nValue = 1; + nd2nzParams.srcDValue = width; + for (int32_t i = 0; i < height; ++i) { + DataCopy(dst[i * c0Size_], src[srcOffset + gCol * i], nd2nzParams); + } + } else { + DataCopy(dst, src[srcOffset], 
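+                // the whole ND tile goes out as a single ND2NZ transfer here; the
+                // gCol >= UINT16_MAX branch above instead issues one transfer per row with
+                // srcDValue = width, keeping the per-transfer source stride within range.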
nd2nzParams); + } + } +#else + DataCopy(dst, src[srcOffset], nd2nzParams); // stride scope has increased +#endif + } + +#endif +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_BATCH_DATA_COPY_WRAPPER_H diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h index 6810ad8d00fe50957de9b8b4b04f6457dd24a40d..72a70024fb1bf18dfe58db10b1ad15f2bcde159a 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h @@ -1,72 +1,81 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file copy_cube_in_from_l1.h - * \brief - */ - - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_FROM_L1_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_FROM_L1_H - -#include "data_copy_wrapper.h" -#include "copy_cube_in_intf.h" - -namespace matmul { -template -class CopyCubeIn() == CopyCubeInType::FROM_L1>> -{ - MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; - -public: - __aicore__ inline CopyCubeIn() = default; - __aicore__ inline ~CopyCubeIn() = default; - - __aicore__ inline void Init() {} - - __aicore__ inline void Reset() {} - - __aicore__ inline void SetInput(const TBuffAddr& address, bool isTranspose) - { - MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose); - } - - __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) - { - MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); - } - - __aicore__ inline LocalTensor LoadData( - int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth) - { - LocalTensor l1; - l1.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); - return l1; - } - - __aicore__ inline void AllocTensor(int32_t iterIndex = 0) {} - - __aicore__ inline void BatchLoad(const uint32_t matrixStride, const int32_t outerIdx, - const int32_t splitIdx, const int32_t splitSize) {} - - __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, - int32_t curRow = 0, int32_t curCol = 0) {} - - __aicore__ inline void Destroy() {} - - __aicore__ inline void BatchDestroy() {} -}; -} // namespace matmul -#endif // _COPY_CUBE_IN_FROM_L1_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file copy_cube_in_from_l1.h + * \brief + */ + + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_FROM_L1_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_FROM_L1_H + +#include "data_copy_wrapper.h" +#include "copy_cube_in_intf.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CopyCubeIn is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeIn is only for internal usage, does not support extension or customized specialization! +*/ +template +class CopyCubeIn() == CopyCubeInType::FROM_L1>> +{ + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + __aicore__ inline CopyCubeIn() = default; + __aicore__ inline ~CopyCubeIn() = default; + + __aicore__ inline void Init() {} + + __aicore__ inline void Reset() {} + + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose) + { + MATMUL_MODULE(MatmulTensorInfo)->SetLocalTensor(localMatrix, isTranspose); + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose) + { + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose); + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor l1; + l1.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); + return l1; + } + + __aicore__ inline void AllocTensor(int32_t iterIndex = 0) {} + + __aicore__ inline void BatchLoad(const uint32_t matrixStride, const int32_t outerIdx, + const int32_t splitIdx, const int32_t splitSize) {} + + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) {} + + __aicore__ inline void Destroy() {} + + __aicore__ inline void BatchDestroy() {} +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _COPY_CUBE_IN_FROM_L1_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h index 935eb10e72cd9191b434b10eb696bfe65290f72e..313882a65f5cc648ee9665aaecdfe52997e6968c 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h @@ -1,87 +1,98 @@ -/** -* Copyright (c) 2024 Huawei Technologies Co., Ltd. -* This file is a part of the CANN Open Software. -* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -* Please refer to the License for details. You may not use this file except in compliance with the License. -* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -* See LICENSE in the root of the software repository for the full text of the License. -*/ -/*! 
-* \file copy_cube_in_intf.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ - -namespace matmul { -template -class CopyCubeIn -{ - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; -public: - __aicore__ inline CopyCubeIn() = default; - __aicore__ inline ~CopyCubeIn() = default; - /** - * @description: Init of CopyCubeIn - * @return: void - */ - __aicore__ inline void Init() {} - - /** - * @description: Set input local address - * @param: address: Local address input through SetTensorA or SetTensorB - * @param: isTranspose: true if input tensor is transposed - * @return: void - */ - __aicore__ inline void SetInput(const TBuffAddr& address, bool isTranspose) {} - - /** - * @description: Set input global address - * @param: address: Global address input through SetTensorA or SetTensorB - * @param: srcGlobalAddr: true if input tensor is transposed - * @return: void - */ - __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) {} - - /** - * @description: Load input data to L1 - * @param: curRow: The row index of the matrixA/B to be loaded at current iterate - * @param: curCol: The column index of the matrixA/B to be loaded at current iterate - * @param: tileHeight: The height of the matrixA/B tiles to be loaded at current iterate - * @param: tileWidth: The width of the matrixA/B tiles to be loaded at current iterate - * @return: Tensor on L1 - */ - __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth) { - ASCENDC_ASSERT((false), { - KERNEL_LOG(KERNEL_ERROR, "Matching error. This is an empty implementation."); - }); - return NULL_TENSOR; - } - - /** - * @description: Release tensor on l1 at one compute end - * @param: tensor: The tensor on l1 need to be released - * @param: curRow: The row index of the matrixA/B at current iterate - * @param: curCol: The column index of the matrixA/B at current iterate - * @return: void - */ - __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, - int32_t curRow = 0, int32_t curCol = 0) {} - - /* - * @description: Reset buffer status used in copy in - * @return: void - */ - __aicore__ inline void Reset() {} - - /** - * @description: Destory tensor on l1 at iterate end - * @return: void - */ - __aicore__ inline void Destroy() {} -}; - -} -#endif // _COPY_CUBE_IN_INTF_H_ \ No newline at end of file +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! +* \file copy_cube_in_intf.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ + +namespace Gemm { +namespace Impl { +namespace Detail { +using namespace AscendC; +/* + CopyCubeIn is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. 
+ CopyCubeIn is only for internal usage, does not support extension or customized specialization! +*/ +template +class CopyCubeIn +{ + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; +public: + __aicore__ inline CopyCubeIn() = default; + __aicore__ inline ~CopyCubeIn() = default; + /** + * @description: Init of CopyCubeIn + * @return: void + */ + __aicore__ inline void Init() {} + + /** + * @description: Set input local Tensor + * @param: leftMatrix: Local Tensor input through SetTensorA or SetTensorB + * @param: isTranspose: true if input tensor is transposed + * @return: void + */ + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose) {} + + /** + * @description: Set input global address + * @param: gm: Global Tensor input through SetTensorA or SetTensorB + * @param: srcGlobalAddr: true if input tensor is transposed + * @return: void + */ + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose) {} + + /** + * @description: Load input data to L1 + * @param: curRow: The row index of the matrixA/B to be loaded at current iterate + * @param: curCol: The column index of the matrixA/B to be loaded at current iterate + * @param: tileHeight: The height of the matrixA/B tiles to be loaded at current iterate + * @param: tileWidth: The width of the matrixA/B tiles to be loaded at current iterate + * @return: Tensor on L1 + */ + template + __aicore__ inline LocalTensor LoadData(int curRow, int curCol, int tileHeight, int tileWidth, + const ScheduleContext& context = 0) + { + ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, "Matching error. This is an empty implementation."); }); + return NULL_TENSOR; + } + + /** + * @description: Release tensor on l1 at one compute end + * @param: tensor: The tensor on l1 need to be released + * @param: curRow: The row index of the matrixA/B at current iterate + * @param: curCol: The column index of the matrixA/B at current iterate + * @return: void + */ + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) {} + + /* + * @description: Reset buffer status used in copy in + * @return: void + */ + __aicore__ inline void Reset() {} + + /** + * @description: Destory tensor on l1 at iterate end + * @return: void + */ + __aicore__ inline void Destroy() {} +}; + +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _COPY_CUBE_IN_INTF_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h index 3fbcc4ed2c3a73370a751735f959174b236a020d..6d094d5b5095e832d0dca66818c207846a3aa1fd 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h @@ -1,124 +1,163 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file copy_cube_in_mdl.h - * \brief - */ - - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_MDL_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_MDL_H - -#include "data_copy_wrapper.h" -#include "copy_cube_in_intf.h" -#include "async_tensor.h" - -namespace matmul { -template -class CopyCubeIn::IsNeedUB() && GetCopyCubeInType() == CopyCubeInType::MDL>> -{ - MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulShapeTiling, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; - -public: - __aicore__ inline CopyCubeIn() = default; - __aicore__ inline ~CopyCubeIn() = default; - - __aicore__ inline void Init() - { - MATMUL_MODULE(CubeInBuffer)->Init( - MATMUL_MODULE(CopyCubeInParams)->GetBufferSize(), MATMUL_MODULE(MatmulShapeTiling)->GetDepth()); - } - - __aicore__ inline void SetInput(const TBuffAddr& address, bool isTranspose) - { - MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose); - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - - __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) - { - MATMUL_MODULE(MatmulTensorInfo)->SetGlobalAddr(srcGlobalAddr, isTranspose); - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - - __aicore__ inline void Reset() - { - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - - __aicore__ inline LocalTensor LoadData( - int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth) - { - LocalTensor l1; - int32_t posL1 = MATMUL_MODULE(CubeInBuffer)->GetIterIndex(curRow, curCol); - int32_t bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); - if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) { - l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos); - } else { - l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos); - MATMUL_MODULE(DataCopyUtils)->CopyTileToCube(l1, curRow, curCol, tileHeight, tileWidth); - MATMUL_MODULE(CubeInBuffer)->EnQue(l1); - MATMUL_MODULE(CubeInBuffer)->DeQue(); - } - return l1; - } - - __aicore__ inline AsyncTensor AsyncLoadData( - int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth) - { - if constexpr (PhyPosIsL1(INPUT_TYPE::pos) || INPUT_TYPE::layout != LayoutMode::NONE) { - ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, - "Matching error. 
MDL AsyncLoadData doesn't support BMM && Src L1"); }); - } - - LocalTensor l1; - int32_t posL1 = MATMUL_MODULE(CubeInBuffer)->GetIterIndex(curRow, curCol); - int32_t bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); - if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) { - l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos); - AsyncTensor l1Aync(l1, false); - return l1Aync; - } else { - l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos); - MATMUL_MODULE(DataCopyUtils)->CopyTileToCube(l1, curRow, curCol, tileHeight, tileWidth); - MATMUL_MODULE(CubeInBuffer)->EnQue(l1); - AsyncTensor l1Aync(l1, true); - return l1Aync; - } - } - - __aicore__ inline void AwaitLoadData(const AsyncTensor& l1Aync) - { - MATMUL_MODULE(CubeInBuffer)->DeQue(); - } - - __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, - int32_t curRow = 0, int32_t curCol = 0) - { - auto bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); - MATMUL_MODULE(CubeInBuffer)->FreeTensor(bufferPos); - } - - __aicore__ inline void Destroy() - { - MATMUL_MODULE(CubeInBuffer)->Destroy(); - } -}; -} // namespace matmul -#endif // _COPY_CUBE_IN_MDL_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file copy_cube_in_mdl.h + * \brief + */ + + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_MDL_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_MDL_H + +#include "data_copy_wrapper.h" +#include "copy_cube_in_intf.h" +#include "async_tensor.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CopyCubeIn is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeIn is only for internal usage, does not support extension or customized specialization! 
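+
+    For orientation only, a minimal sketch of how a caller is expected to drive this MDL
+    specialization; the member functions match the class below, while the instance and argument
+    names are illustrative and this is not a usage contract:
+
+        copyCubeIn.Init();                               // size CubeInBuffer from CopyCubeInParams
+        copyCubeIn.SetInput(globalMatrix, isTranspose);  // record the source and reset the buffer
+        auto tile = copyCubeIn.AsyncLoadData(curRow, curCol, tileHeight, tileWidth); // hit the cache or enqueue a copy
+        copyCubeIn.AwaitLoadData(tile);                  // dequeue before the tile is consumed
+        copyCubeIn.ClearLoadData();                      // free the current buffer position after compute
+        copyCubeIn.Destroy();
+
+    The synchronous LoadData() follows the same flow but enqueues and dequeues in a single call.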
+*/ +template +class CopyCubeIn::IsNeedUB() && GetCopyCubeInType() == CopyCubeInType::MDL>> +{ + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeTiling); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + __aicore__ inline CopyCubeIn() = default; + __aicore__ inline ~CopyCubeIn() = default; + + __aicore__ inline void Init() + { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(CopyCubeInParams)->GetBufferSize(), MATMUL_MODULE(CopyCubeInParams)->GetDepth()); + } + + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose) + { + MATMUL_MODULE(MatmulTensorInfo)->SetLocalTensor(localMatrix, isTranspose); + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose) + { + MATMUL_MODULE(MatmulTensorInfo)->SetGlobalTensor(globalMatrix, isTranspose); + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + __aicore__ inline void Reset() + { + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor l1; + int32_t posL1 = GetIterIndex(curRow, curCol); + int32_t bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); + if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) { + l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos); + } else { + l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos); + MATMUL_MODULE(DataCopyUtils)->CopyTileToCube(l1, curRow, curCol, tileHeight, tileWidth); + MATMUL_MODULE(CubeInBuffer)->EnQue(l1); + MATMUL_MODULE(CubeInBuffer)->DeQue(); + } + return l1; + } + + template + __aicore__ inline AsyncTensor AsyncLoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + if constexpr (PhyPosIsL1(INPUT_TYPE::pos) || INPUT_TYPE::layout != LayoutMode::NONE) { + ASCENDC_ASSERT((false), { KERNEL_LOG(KERNEL_ERROR, + "Matching error. 
MDL AsyncLoadData doesn't support BMM && Src L1"); }); + } + + LocalTensor l1; + int32_t posL1 = GetIterIndex(curRow, curCol); + int32_t bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); + if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1, bufferPos)) { + l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1, bufferPos); + AsyncTensor l1Aync(l1, false); + return l1Aync; + } else { + l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(bufferPos); + MATMUL_MODULE(DataCopyUtils)->CopyTileToCube(l1, curRow, curCol, tileHeight, tileWidth); + MATMUL_MODULE(CubeInBuffer)->EnQue(l1); + AsyncTensor l1Aync(l1, true); + return l1Aync; + } + } + + __aicore__ inline void AwaitLoadData(const AsyncTensor& l1Aync) + { + MATMUL_MODULE(CubeInBuffer)->DeQue(); + } + + __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) + { + auto bufferPos = MATMUL_MODULE(CopyCubeInParams)->GetBufferPos(); + MATMUL_MODULE(CubeInBuffer)->FreeTensor(bufferPos); + } + + __aicore__ inline void Destroy() + { + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + +private: + template + __aicore__ inline static enable_if_t + GetCurKPos(int32_t curRow, int32_t curCol) + { + return curCol; + } + + template + __aicore__ inline static enable_if_t + GetCurKPos(int32_t curRow, int32_t curCol) + { + return curRow; + } + + __aicore__ inline int32_t GetIterIndex(int32_t curRow, int32_t curCol) + { + return GetCurKPos(curRow, curCol) % GetMajorCacheNum(); + } + + __aicore__ inline int32_t GetMajorCacheNum() + { + if constexpr (INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb(); + } + } +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _COPY_CUBE_IN_MDL_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h index 48651b94fcdae69d4d03ece8454557e2ffb4a8ca..392f2713b58a9446f25d8a9178aa6cc904848d31 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h @@ -1,147 +1,234 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file copy_cube_in_norm.h - * \brief - */ - - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_NORM_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_NORM_H - -#include "data_copy_wrapper.h" -#include "copy_cube_in_intf.h" -#include "async_tensor.h" - -namespace matmul { -template -class CopyCubeIn::IsNeedUB() && GetCopyCubeInType() == CopyCubeInType::NORMAL>> -{ - MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); - MATMUL_USE_MODULE(MatmulSubBlockInfo); - MATMUL_USE_MODULE_ON(MatmulShapeTiling, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; - -public: - __aicore__ inline CopyCubeIn() = default; - __aicore__ inline ~CopyCubeIn() = default; - - __aicore__ inline void Init() - { - MATMUL_MODULE(CubeInBuffer)->Init( - MATMUL_MODULE(CopyCubeInParams)->GetBufferSize(), MATMUL_MODULE(MatmulShapeTiling)->GetDepth()); - } - - __aicore__ inline void SetInput(const TBuffAddr& address, bool isTranspose) - { - MATMUL_MODULE(MatmulTensorInfo)->SetLocalAddr(address, isTranspose); - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - - __aicore__ inline void SetInput(__gm__ SrcT* srcGlobalAddr, bool isTranspose) - { - if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { - if (MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() == 0) { - MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalAddr(srcGlobalAddr, isTranspose); - } else { - MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalAddr(srcGlobalAddr, isTranspose); - } - } else { - MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalAddr(srcGlobalAddr, isTranspose); - } - MATMUL_MODULE(CubeInBuffer)->Reset(); - if constexpr (IsSameABTemplate()) { - MATMUL_MODULE(CubeInBuffer)->SetOrgAddr(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalAddr()); - } - } - - __aicore__ inline void Reset() - { - MATMUL_MODULE(CubeInBuffer)->Reset(); - } - - __aicore__ inline LocalTensor LoadData( - int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth) - { - LocalTensor l1; - int32_t posL1 = MATMUL_MODULE(CubeInBuffer)->GetIterIndex(curRow, curCol); - if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1)) { - l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1); - } else { - l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(posL1); - if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { - if (MATMUL_MODULE(MatmulSubBlockInfo)->IsFakeIntraBlock()) { - MATMUL_MODULE(DataCopyUtils)->template CopyTileToCube( - l1, curRow, curCol, tileHeight, tileWidth); - } else { - MATMUL_MODULE(DataCopyUtils)->template CopyTileToCube( - l1, curRow, curCol, tileHeight, tileWidth); - } - } else { - MATMUL_MODULE(DataCopyUtils)->template CopyTileToCube( - l1, curRow, curCol, tileHeight, tileWidth); - } - MATMUL_MODULE(CubeInBuffer)->EnQue(l1); - MATMUL_MODULE(CubeInBuffer)->DeQue(); - } - return l1; - } - - __aicore__ inline AsyncTensor AsyncLoadData( - int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth) - { - LocalTensor l1; - int32_t posL1 = MATMUL_MODULE(CubeInBuffer)->GetIterIndex(curRow, curCol); - if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1)) { - l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1); - AsyncTensor l1Aync(l1, false); - return l1Aync; - } else { - l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(posL1); - MATMUL_MODULE(DataCopyUtils)->CopyTileToCube(l1, 
curRow, curCol, tileHeight, tileWidth); - MATMUL_MODULE(CubeInBuffer)->EnQue(l1); - AsyncTensor l1Aync(l1, true); - return l1Aync; - } - } - - __aicore__ inline void AwaitLoadData(const AsyncTensor& l1Aync) - { - if (l1Aync.IsNeedDeQue()) { - MATMUL_MODULE(CubeInBuffer)->DeQue(); - } - } - - __aicore__ inline void ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, - int32_t curRow = 0, int32_t curCol = 0) - { -#if __CCE_AICORE__ == 310 - if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { - return; - } -#endif - int32_t posL1 = MATMUL_MODULE(CubeInBuffer)->GetIterIndex(curRow, curCol); - MATMUL_MODULE(CubeInBuffer)->FreeTensor(posL1, tensor); - } - - __aicore__ inline void Destroy() - { - MATMUL_MODULE(CubeInBuffer)->Destroy(); - } -}; -} // namespace matmul -#endif // _COPY_CUBE_IN_NORM_H_ \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file copy_cube_in_norm.h + * \brief + */ + + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_NORM_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_NORM_H + +#include "data_copy_wrapper.h" +#include "copy_cube_in_intf.h" +#include "async_tensor.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CopyCubeIn is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeIn is only for internal usage, does not support extension or customized specialization! 
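+
+    For orientation only, a minimal sketch of the expected call sequence for this NORMAL
+    specialization; the members match the class below, while the instance and argument names are
+    illustrative and this is not a usage contract:
+
+        copyCubeIn.Init();                               // size CubeInBuffer from CopyCubeInParams
+        copyCubeIn.SetInput(globalMatrix, isTranspose);  // record the source and reset the buffer
+        auto tile = copyCubeIn.LoadData(curRow, curCol, tileHeight, tileWidth); // reuse a cached tile or copy it to L1
+        // ... cube compute consuming the tile ...
+        copyCubeIn.ClearLoadData(tile, curRow, curCol);  // free the slot selected by GetIterIndex
+        copyCubeIn.Destroy();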
+*/ +template +class CopyCubeIn::IsNeedUB() && GetCopyCubeInType() == CopyCubeInType::NORMAL>> +{ + MATMUL_USE_MODULE_ON(CubeInBuffer, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(DataCopyUtils, INPUT_TYPE::TAG); + MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulSubBlockInfo); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + __aicore__ inline CopyCubeIn() = default; + __aicore__ inline ~CopyCubeIn() = default; + + __aicore__ inline void Init() + { + MATMUL_MODULE(CubeInBuffer)->Init( + MATMUL_MODULE(CopyCubeInParams)->GetBufferSize(), MATMUL_MODULE(CopyCubeInParams)->GetDepth()); + } + + __aicore__ inline void SetInput(const LocalTensor& localMatrix, bool isTranspose) + { + MATMUL_MODULE(MatmulTensorInfo)->SetLocalTensor(localMatrix, isTranspose); + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + __aicore__ inline void SetInput(const GlobalTensor& globalMatrix, bool isTranspose) + { + if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { + if (MATMUL_MODULE(MatmulSubBlockInfo)->GetSubBlockIdx() == 0) { + MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalTensor(globalMatrix, isTranspose); + } else { + MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalTensor(globalMatrix, isTranspose); + } + } else { + MATMUL_MODULE(MatmulTensorInfo)->template SetGlobalTensor(globalMatrix, isTranspose); + } + MATMUL_MODULE(CubeInBuffer)->Reset(); + if constexpr (IsSameABTemplate()) { + MATMUL_MODULE(CubeInBuffer)->SetOrgTensor(MATMUL_MODULE(MatmulTensorInfo)->GetGlobalTensor()); + } + } + + __aicore__ inline void Reset() + { + MATMUL_MODULE(CubeInBuffer)->Reset(); + } + + template + __aicore__ inline LocalTensor LoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor l1; + auto posL1 = GetIterIndex(curRow, curCol); + if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1)) { + l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1); + } else { + l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(posL1); + if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { + if (MATMUL_MODULE(MatmulSubBlockInfo)->IsFakeIntraBlock()) { + MATMUL_MODULE(DataCopyUtils)->template CopyTileToCube( + l1, curRow, curCol, tileHeight, tileWidth); + } else { + MATMUL_MODULE(DataCopyUtils)->template CopyTileToCube( + l1, curRow, curCol, tileHeight, tileWidth); + } + } else { + MATMUL_MODULE(DataCopyUtils)->template CopyTileToCube( + l1, curRow, curCol, tileHeight, tileWidth); + } + MATMUL_MODULE(CubeInBuffer)->EnQue(l1); + MATMUL_MODULE(CubeInBuffer)->DeQue(); + } + return l1; + } + + template + __aicore__ inline AsyncTensor AsyncLoadData( + int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, const ScheduleContext& context = 0) + { + LocalTensor l1; + auto posL1 = GetIterIndex(curRow, curCol); + if (MATMUL_MODULE(CubeInBuffer)->Hit(posL1)) { + l1 = MATMUL_MODULE(CubeInBuffer)->GetBuffer(posL1); + AsyncTensor l1Aync(l1, false); + return l1Aync; + } else { + l1 = MATMUL_MODULE(CubeInBuffer)->AllocTensor(posL1); + MATMUL_MODULE(DataCopyUtils)->CopyTileToCube(l1, curRow, curCol, tileHeight, tileWidth); + MATMUL_MODULE(CubeInBuffer)->EnQue(l1); + AsyncTensor l1Aync(l1, true); + return l1Aync; + } + } + + __aicore__ inline void AwaitLoadData(const AsyncTensor& l1Aync) + { + if (l1Aync.IsNeedDeQue()) { + MATMUL_MODULE(CubeInBuffer)->DeQue(); + } + } + + __aicore__ inline void 
ClearLoadData(const LocalTensor& tensor = NULL_TENSOR, + int32_t curRow = 0, int32_t curCol = 0) + { +#if __CCE_AICORE__ == 310 + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + return; + } +#endif + auto posL1 = GetIterIndex(curRow, curCol); + MATMUL_MODULE(CubeInBuffer)->FreeTensor(posL1, tensor); + } + + __aicore__ inline void Destroy() + { + MATMUL_MODULE(CubeInBuffer)->Destroy(); + } + +private: + __aicore__ constexpr int32_t GetIterIndex(int32_t curRow, int32_t curCol) + { + if constexpr (GetCubeInBufferType() == CubeInBufferType::SINGLE_BUFFER) { + return 0; + } else if constexpr (GetCubeInBufferType() == CubeInBufferType::NORMAL || + GetCubeInBufferType() == CubeInBufferType::SINGLE_GLOBAL_BUFFER || + GetCubeInBufferType() == CubeInBufferType::DOUBLE_GLOBAL_BUFFER) { + return GetIterIndexInner(curRow, curCol); + } + } + + template + __aicore__ constexpr enable_if_t + GetIterIndexInner(int32_t curRow, int32_t curCol) + { + if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulIBShareNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG)) { + auto& var = MATMUL_PARAM_VAR; + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() == + static_cast(IterateOrder::ORDER_M)) { + return curCol; + } else { + if constexpr (INPUT_TYPE::layout == LayoutMode::NONE && + ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { + return (curRow + (curRow / DOUBLE_QUE) * DOUBLE_QUE * (var.kIter_ - 1) + curCol * DOUBLE_QUE) % + (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * var.kIter_); + } + return (curRow * var.kIter_ + curCol) % + (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * var.kIter_); + } + } else if constexpr (DoMatmulSpecialBasicBlock(MM_CFG)) { + auto& var = MATMUL_PARAM_VAR; + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() == + static_cast(IterateOrder::ORDER_M)) { + return curCol; + } else { + return (curRow * var.kIter_ + curCol) % + (ToMatmulConfig(MM_CFG).stepM * ToMatmulConfig(MM_CFG).singleCoreK / + ToMatmulConfig(MM_CFG).basicK); + } + } else { + return 0; + } + } + + template + __aicore__ constexpr enable_if_t + GetIterIndexInner(int32_t curRow, int32_t curCol) + { + if constexpr (DoMatmulNorm(MM_CFG) || DoMatmulIBShareNorm(MM_CFG) || DoMatmulBasicBlock(MM_CFG)) { + auto& var = MATMUL_PARAM_VAR; + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() == + static_cast(IterateOrder::ORDER_M)) { + if constexpr (INPUT_TYPE::layout == LayoutMode::NONE && + ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { + return (curCol + (curCol / DOUBLE_QUE) * DOUBLE_QUE * (var.kIter_ - 1) + curRow * DOUBLE_QUE) % + (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * var.kIter_); + } + return (curRow + curCol * var.kIter_) % + (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * var.kIter_); + } else { + return curRow; + } + } else if (DoMatmulSpecialBasicBlock(MM_CFG)) { + auto& var = MATMUL_PARAM_VAR; + if (MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetIterateOrder() == + static_cast(IterateOrder::ORDER_M)) { + return (curRow + curCol * var.kIter_) % ToMatmulConfig(MM_CFG).stepN * + ToMatmulConfig(MM_CFG).singleCoreK / ToMatmulConfig(MM_CFG).basicK; + } else { + return curRow; + } + } else { + return 0; + } + } +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // _COPY_CUBE_IN_NORM_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h index 
0d1a1838bad3421c38aa171819f8c280ae741ffb..8a2bc529fd0e045c5e2a4ab0a528d70b836a3b5d 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h @@ -1,456 +1,469 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! - * \file copy_cube_in_params.h - * \brief copy cube in variable manager module - */ - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_PARAMS_MANAGER_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_PARAMS_MANAGER_H - -#include "../../matmul_var.h" - -namespace matmul { -template -class CopyCubeInParams { - MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulShapeInfo, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; -public: - template - __aicore__ inline int32_t GetStepCol() const - { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKa(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepM(); - } - } - - template - __aicore__ inline int32_t GetStepRow() const - { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepM(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKa(); - } - } - - __aicore__ inline int32_t GetMaxLayoutInfoG() const - { - return MATMUL_CONST_PARAM_VAR.tiling_.GetALayoutInfoG() > MATMUL_CONST_PARAM_VAR.tiling_.GetBLayoutInfoG() ? - MATMUL_CONST_PARAM_VAR.tiling_.GetALayoutInfoG() : MATMUL_CONST_PARAM_VAR.tiling_.GetBLayoutInfoG(); - } - - __aicore__ inline int32_t GetBufferPos() const - { - return MATMUL_CONST_PARAM_VAR.isA1KFullLoad_ ? - MATMUL_CONST_PARAM_VAR.stepMIdx_: MATMUL_CONST_PARAM_VAR.stepKaIdx_; - } - - template - __aicore__ inline int32_t GetSingleHeightAlign() const - { - if constexpr (IS_TRANS) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleHeight(), c0Size_); - } else { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleHeight(), BLOCK_CUBE); - } - } - - template - __aicore__ inline int32_t GetSingleWidthAlign() const - { - if (!IS_TRANS || IsSameTypeV) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleWidth(), c0Size_); - } else { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleWidth(), BLOCK_CUBE); - } - } - - __aicore__ inline int32_t GetSingleSize() const - { - return MATMUL_MODULE(MatmulShapeInfo)->GetSingleWidth() * MATMUL_MODULE(MatmulShapeInfo)->GetSingleHeight(); - } - - template - __aicore__ inline int32_t GetSingleSizeAlign() const - { - return GetSingleWidthAlign() * GetSingleHeightAlign(); - } - - __aicore__ inline int32_t GetCopyHeight(int32_t i) const - { - return (MATMUL_CONST_PARAM_VAR.stepMIdx_ + i >= MATMUL_CONST_PARAM_VAR.mStepIter_ - 1) ? 
- MATMUL_CONST_PARAM_VAR.tailStepM_ : - MATMUL_CONST_PARAM_VAR.tiling_.GetStepM() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseM(); - } - - template - __aicore__ inline int32_t GetCopyWidth(int32_t i, int32_t baseWidth) const - { - if constexpr (IS_TRANS) { - return (MATMUL_CONST_PARAM_VAR.stepMIdx_ + i >= MATMUL_CONST_PARAM_VAR.mStepIter_ - 1) ? - MATMUL_CONST_PARAM_VAR.tailStepM_: - MATMUL_CONST_PARAM_VAR.tiling_.GetStepM() * baseWidth; - } else { - return (MATMUL_CONST_PARAM_VAR.stepKaIdx_ + i >= MATMUL_CONST_PARAM_VAR.kaStepIter_ - 1) ? - MATMUL_CONST_PARAM_VAR.tailStepKa_ : - MATMUL_CONST_PARAM_VAR.tiling_.GetStepKa() * baseWidth; - } - } - - template - __aicore__ inline int32_t GetStaticTileHeight() const - { - if constexpr ((INPUT_TYPE::layout != LayoutMode::NONE) && - (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreK(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreM(); - } - } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKa() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseK(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepM() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseM(); - } - } else { - return MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight(); - } - } - - template - __aicore__ inline int32_t GetStaticTileWidth() const - { - if constexpr ((INPUT_TYPE::layout != LayoutMode::NONE) && - (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreM(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreK(); - } - } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepM() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseM(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKa() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseK(); - } - } else { - return MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth(); - } - } - - __aicore__ inline int32_t GetBufferSize() - { -#if __CCE_AICORE__ == 310 - if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { - return GetOrgSize(); - } else { - return GetBaseSize(); - } -#else - return GetBaseSize(); -#endif - } - - __aicore__ inline bool IsL1KFullLoad() const - { - return MATMUL_CONST_PARAM_VAR.isA1KFullLoad_; - } - - __aicore__ inline bool IsBufferPosEnd(int32_t i) const - { - return MATMUL_CONST_PARAM_VAR.stepMIdx_ + i >= MATMUL_CONST_PARAM_VAR.mStepIter_; - } - - __aicore__ inline bool IsBufferPosEnd() const - { - return MATMUL_CONST_PARAM_VAR.stepMIdx_ == MATMUL_CONST_PARAM_VAR.mStepIter_ - 1; - } - - __aicore__ inline bool IsBufferKPosEnd(int32_t i) const - { - return MATMUL_CONST_PARAM_VAR.stepKaIdx_ + i >= MATMUL_CONST_PARAM_VAR.kaStepIter_; - } - - __aicore__ inline bool IsBufferKPosEnd() const - { - return MATMUL_CONST_PARAM_VAR.stepKaIdx_ == MATMUL_CONST_PARAM_VAR.kaStepIter_ - 1; - } - -private: - constexpr static int32_t c0Size_ = AuxGetC0Size(); - - __aicore__ inline int32_t GetBaseHeightAlign() const - { - if constexpr (IsSameTypeV) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetBaseHeight(), BLOCK_CUBE); - } else if constexpr (IsTypeOneOfV && INPUT_TYPE::isTrans) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetBaseHeight(), c0Size_); - } else { - return MATMUL_MODULE(MatmulShapeInfo)->GetBaseHeight(); - } - } - - 
__aicore__ inline int32_t GetBaseWidthAlign() const - { - if constexpr (INPUT_TYPE::isTrans && IsSameTypeV) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetBaseWidth(), BLOCK_CUBE); - } else if constexpr (IsTypeOneOfV) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetBaseWidth(), c0Size_); - } else { - return MATMUL_MODULE(MatmulShapeInfo)->GetBaseWidth(); - } - } - - __aicore__ inline int32_t GetOrgHeightAlign() - { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetOrgHeight(), BLOCK_CUBE); - } - - __aicore__ inline int32_t GetOrgWidthAlign() - { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetOrgWidth(), c0Size_); - } - - __aicore__ inline int32_t GetBaseSize() - { - if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !IsTypeOneOfV) { - return MATMUL_MODULE(MatmulShapeInfo)->GetBaseWidth(); - } else { - return GetBaseHeightAlign() * GetBaseWidthAlign(); - } - } - - __aicore__ inline int32_t GetOrgSize() - { - return GetOrgHeightAlign() * GetOrgWidthAlign(); - } -}; - -template -class CopyCubeInParams> { - MATMUL_USE_MODULE_ON(MatmulVar, InputTypeTag::B); - MATMUL_USE_MODULE_ON(MatmulShapeInfo, INPUT_TYPE::TAG); - using TransT = typename INPUT_TYPE::TRANS_T; - using SrcT = typename INPUT_TYPE::T; -public: - template - __aicore__ inline int32_t GetStepCol() const - { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKb(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepN(); - } - } - - template - __aicore__ inline int32_t GetStepRow() const - { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepN(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKb(); - } - } - - __aicore__ inline int32_t GetMaxLayoutInfoG() const - { - return MATMUL_CONST_PARAM_VAR.tiling_.GetALayoutInfoG() > MATMUL_CONST_PARAM_VAR.tiling_.GetBLayoutInfoG() ? - MATMUL_CONST_PARAM_VAR.tiling_.GetALayoutInfoG() : MATMUL_CONST_PARAM_VAR.tiling_.GetBLayoutInfoG(); - } - - __aicore__ inline int32_t GetBufferPos() const - { - return MATMUL_CONST_PARAM_VAR.isB1KFullLoad_ ? MATMUL_CONST_PARAM_VAR.stepNIdx_: - MATMUL_CONST_PARAM_VAR.stepKbIdx_; - } - - template - __aicore__ inline int32_t GetSingleHeightAlign() const - { - if constexpr (IS_TRANS || IsSameTypeV) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleHeight(), c0Size_); - } else { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleHeight(), BLOCK_CUBE); - } - } - - template - __aicore__ inline int32_t GetSingleWidthAlign() const - { - if constexpr (IS_TRANS) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleWidth(), BLOCK_CUBE); - } else { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetSingleWidth(), c0Size_); - } - } - - __aicore__ inline int32_t GetSingleSize() const - { - return MATMUL_MODULE(MatmulShapeInfo)->GetSingleWidth() * MATMUL_MODULE(MatmulShapeInfo)->GetSingleHeight(); - } - - template - __aicore__ inline int32_t GetSingleSizeAlign() const - { - return GetSingleWidthAlign() * GetSingleHeightAlign(); - } - - __aicore__ inline int32_t GetCopyHeight(int32_t i) const - { - return (MATMUL_CONST_PARAM_VAR.stepNIdx_ + i >= MATMUL_CONST_PARAM_VAR.nStepIter_ - 1) ? - MATMUL_CONST_PARAM_VAR.tailStepN_ : - MATMUL_CONST_PARAM_VAR.tiling_.GetStepN() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseN(); - } - - template - __aicore__ inline int32_t GetCopyWidth(int32_t i, int32_t baseWidth) const - { - return (MATMUL_CONST_PARAM_VAR.stepNIdx_ + i >= MATMUL_CONST_PARAM_VAR.nStepIter_ - 1) ? 
- MATMUL_CONST_PARAM_VAR.tailStepN_ : - MATMUL_CONST_PARAM_VAR.tiling_.GetStepN() * baseWidth; - } - - template - __aicore__ inline int32_t GetStaticTileHeight() const - { - if constexpr ((INPUT_TYPE::layout != LayoutMode::NONE) && - (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreN(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreK(); - } - } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepN() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseN(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKb() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseK(); - } - } else { - return MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight(); - } - } - - template - __aicore__ inline int32_t GetStaticTileWidth() const - { - if constexpr ((INPUT_TYPE::layout != LayoutMode::NONE) && - (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreK(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetSingleCoreN(); - } - } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { - if constexpr (IS_TRANS) { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepKb() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseK(); - } else { - return MATMUL_CONST_PARAM_VAR.tiling_.GetStepN() * MATMUL_CONST_PARAM_VAR.tiling_.GetBaseN(); - } - } else { - return MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth(); - } - } - - __aicore__ inline int32_t GetBufferSize() - { -#if __CCE_AICORE__ == 310 - if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { - return GetOrgSize(); - } else { - return GetBaseSize(); - } -#else - return GetBaseSize(); -#endif - } - - __aicore__ inline bool IsL1KFullLoad() const - { - return MATMUL_CONST_PARAM_VAR.isB1KFullLoad_; - } - - __aicore__ inline bool IsBufferPosEnd(int32_t i) const - { - return MATMUL_CONST_PARAM_VAR.stepNIdx_ + i >= MATMUL_CONST_PARAM_VAR.nStepIter_; - } - - __aicore__ inline bool IsBufferPosEnd() const - { - return MATMUL_CONST_PARAM_VAR.stepNIdx_ == MATMUL_CONST_PARAM_VAR.nStepIter_ - 1; - } - - __aicore__ inline bool IsBufferKPosEnd(int32_t i) const - { - return MATMUL_CONST_PARAM_VAR.stepKbIdx_ + i >= MATMUL_CONST_PARAM_VAR.kbStepIter_; - } - - __aicore__ inline bool IsBufferKPosEnd() const - { - return MATMUL_CONST_PARAM_VAR.stepKbIdx_ == MATMUL_CONST_PARAM_VAR.kbStepIter_ - 1; - } - -private: - constexpr static int32_t c0Size_ = AuxGetC0Size(); - - __aicore__ inline int32_t GetBaseHeightAlign() const - { - if constexpr (IsSameTypeV) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetBaseHeight(), BLOCK_CUBE); - } else if constexpr (IsTypeOneOfV) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetBaseHeight(), c0Size_); - } else { - return MATMUL_MODULE(MatmulShapeInfo)->GetBaseHeight(); - } - } - - __aicore__ inline int32_t GetBaseWidthAlign() const - { - if constexpr (IsSameTypeV || (IsTypeOneOfV && !INPUT_TYPE::isTrans)) { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetBaseWidth(), c0Size_); - } else { - return MATMUL_MODULE(MatmulShapeInfo)->GetBaseWidth(); - } - } - - __aicore__ inline int32_t GetOrgHeightAlign() - { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetOrgHeight(), BLOCK_CUBE); - } - - __aicore__ inline int32_t GetOrgWidthAlign() - { - return Align(MATMUL_MODULE(MatmulShapeInfo)->GetOrgWidth(), c0Size_); - } - - __aicore__ inline int32_t 
GetBaseSize() - { - if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !IsTypeOneOfV) { - return MATMUL_MODULE(MatmulShapeInfo)->GetBaseWidth(); - } else { - return GetBaseHeightAlign() * GetBaseWidthAlign(); - } - } - - __aicore__ inline int32_t GetOrgSize() - { - return GetOrgHeightAlign() * GetOrgWidthAlign(); - } -}; -} // namespace matmul -#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_PARAMS_MANAGER_H \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file copy_cube_in_params.h + * \brief copy cube in variable manager module + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_PARAMS_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_PARAMS_H + +#include "../../param/matmul_shape_tiling.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +template +class CopyCubeInParams { + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulShapeTiling); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; + +public: + template + __aicore__ inline int32_t GetStepCol() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM(); + } + } + + template + __aicore__ inline int32_t GetStepRow() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa(); + } + } + + __aicore__ inline int32_t GetBufferPos() const + { + return MATMUL_CONST_PARAM_VAR.isA1KFullLoad_ ? 
+ MATMUL_CONST_PARAM_VAR.stepMIdx_: MATMUL_CONST_PARAM_VAR.stepKaIdx_; + } + + __aicore__ inline int32_t GetBufferSize() + { +#if __CCE_AICORE__ == 310 + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + return GetOrgSize(); + } else { + return GetBaseSize(); + } +#else + return GetBaseSize(); +#endif + } + + __aicore__ inline int GetDepth() const + { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDepthA1(); + } + + template + __aicore__ inline int32_t GetOrgHeight() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleHeight(), BLOCK_CUBE); + } else { + return GetSingleHeight(); + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKa(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(); + } + } + } + + template + __aicore__ inline int32_t GetOrgWidth() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleWidth(), c0Size_); + } else { + return GetSingleWidth(); + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKa(); + } + } + } + + template + __aicore__ inline int32_t GetSingleHeight() const + { + // Constantized scenario + // You can set IS_BASIC to false, if you don't need to use constantized parameters + if constexpr (IS_BASIC) { + if constexpr (IS_TRANS) { + return ToMatmulConfig(MM_CFG).singleCoreK; + } else { + return ToMatmulConfig(MM_CFG).singleCoreM; + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreK(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreM(); + } + } + } + + template + __aicore__ inline int32_t GetSingleWidth() const + { + // Constantized scenario + // You can set IS_BASIC to false, if you don't need to use constantized parameters + if constexpr (IS_BASIC) { + if constexpr (IS_TRANS) { + return ToMatmulConfig(MM_CFG).singleCoreM; + } else { + return ToMatmulConfig(MM_CFG).singleCoreK; + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreM(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetSingleCoreK(); + } + } + } + + template + __aicore__ inline int32_t GetBaseHeight() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } + } + + template + __aicore__ inline int32_t GetBaseWidth() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } + } + + template + __aicore__ inline int32_t GetTotalRow() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->GetKIter(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->GetMIter(); + } + } + + template + __aicore__ inline int32_t GetTotalCol() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->GetMIter(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->GetKIter(); + } + } + + template + __aicore__ inline bool IsKRowDirec() const + { + return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeA(); + } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); + + __aicore__ inline int32_t GetBaseHeightAlign() const + { + if constexpr (IsSameTypeV) 
{ + return Align(GetBaseHeight(), BLOCK_CUBE); + } else if constexpr (IsTypeOneOfV && INPUT_TYPE::isTrans) { + return Align(GetBaseHeight(), c0Size_); + } else { + return GetBaseHeight(); + } + } + + __aicore__ inline int32_t GetBaseWidthAlign() const + { + if constexpr (INPUT_TYPE::isTrans && IsSameTypeV) { + return Align(GetBaseWidth(), BLOCK_CUBE); + } else if constexpr (IsTypeOneOfV) { + return Align(GetBaseWidth(), c0Size_); + } else { + return GetBaseWidth(); + } + } + + __aicore__ inline int32_t GetOrgHeightAlign() + { + return Align(GetOrgHeight(), BLOCK_CUBE); + } + + __aicore__ inline int32_t GetOrgWidthAlign() + { + return Align(GetOrgWidth(), c0Size_); + } + + __aicore__ inline int32_t GetBaseSize() + { + if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !IsTypeOneOfV) { + return GetBaseWidth(); + } else { + return GetBaseHeightAlign() * GetBaseWidthAlign(); + } + } + + __aicore__ inline int32_t GetOrgSize() + { + return GetOrgHeightAlign() * GetOrgWidthAlign(); + } +}; + +template +class CopyCubeInParams> { + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + using TransT = typename INPUT_TYPE::TRANS_T; + using SrcT = typename INPUT_TYPE::T; +public: + template + __aicore__ inline int32_t GetStepCol() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); + } + } + + template + __aicore__ inline int32_t GetStepRow() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb(); + } + } + + __aicore__ inline int32_t GetBufferPos() const + { + return MATMUL_CONST_PARAM_VAR.isB1KFullLoad_ ? 
MATMUL_CONST_PARAM_VAR.stepNIdx_: + MATMUL_CONST_PARAM_VAR.stepKbIdx_; + } + + __aicore__ inline int32_t GetBufferSize() + { +#if __CCE_AICORE__ == 310 + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + return GetOrgSize(); + } else { + return GetBaseSize(); + } +#else + return GetBaseSize(); +#endif + } + + __aicore__ inline int GetDepth() const + { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetDepthB1(); + } + + template + __aicore__ inline int32_t GetOrgHeight() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleHeight(), BLOCK_CUBE); + } else { + return GetSingleHeight(); + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKb(); + } + } + } + + template + __aicore__ inline int32_t GetOrgWidth() + { + if constexpr (PhyPosIsUB(INPUT_TYPE::pos)) { + if constexpr (IS_INTRA_BLOCK) { + return CeilAlign(GetSingleWidth(), c0Size_); + } else { + return GetSingleWidth(); + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKb(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(); + } + } + } + + template + __aicore__ inline int32_t GetSingleHeight() const + { + // Constantized scenario + if constexpr (IS_BASIC) { + if constexpr (IS_TRANS) { + return ToMatmulConfig(MM_CFG).singleCoreN; + } else { + return ToMatmulConfig(MM_CFG).singleCoreK; + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(); + } + } + } + + template + __aicore__ inline int32_t GetSingleWidth() const + { + // Constantized scenario + if constexpr (IS_BASIC) { + if constexpr (IS_TRANS) { + return ToMatmulConfig(MM_CFG).singleCoreK; + } else { + return ToMatmulConfig(MM_CFG).singleCoreN; + } + } else { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreK(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + } + } + } + + template + __aicore__ inline int32_t GetBaseHeight() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } + } + + template + __aicore__ inline int32_t GetBaseWidth() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } + } + + template + __aicore__ inline int32_t GetTotalRow() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->GetNIter(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->GetKIter(); + } + } + + template + __aicore__ inline int32_t GetTotalCol() const + { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeInfo)->GetKIter(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->GetNIter(); + } + } + + template + __aicore__ inline bool IsKRowDirec() const + { + return !MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeB(); + } + +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); + + __aicore__ inline int32_t GetBaseHeightAlign() const + { + if constexpr (IsSameTypeV) { + return Align(GetBaseHeight(), BLOCK_CUBE); + } else if constexpr (IsTypeOneOfV) { + return Align(GetBaseHeight(), c0Size_); + } else { + return GetBaseHeight(); + } + } + + __aicore__ inline int32_t 
GetBaseWidthAlign() const
+    {
+        if constexpr (IsSameTypeV || (IsTypeOneOfV && !INPUT_TYPE::isTrans)) {
+            return Align(GetBaseWidth(), c0Size_);
+        } else {
+            return GetBaseWidth();
+        }
+    }
+
+    __aicore__ inline int32_t GetOrgHeightAlign()
+    {
+        return Align(GetOrgHeight(), BLOCK_CUBE);
+    }
+
+    __aicore__ inline int32_t GetOrgWidthAlign()
+    {
+        return Align(GetOrgWidth(), c0Size_);
+    }
+
+    __aicore__ inline int32_t GetBaseSize()
+    {
+        if constexpr (INPUT_TYPE::format == CubeFormat::VECTOR && !IsTypeOneOfV) {
+            return GetBaseWidth();
+        } else {
+            return GetBaseHeightAlign() * GetBaseWidthAlign();
+        }
+    }
+
+    __aicore__ inline int32_t GetOrgSize()
+    {
+        return GetOrgHeightAlign() * GetOrgWidthAlign();
+    }
+};
+} // namespace Detail
+} // namespace Impl
+} // namespace Gemm
+#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_PARAMS_H
diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h
index 19bb76f9060a4e9407c4db3537553f1f8f70ed72..ffc93f600a865ae9e0bdeb13790f6dd2d4dd7941 100644
--- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h
+++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h
@@ -20,7 +20,9 @@
 #include "../../matmul_param.h"
 #include "copy_cube_in_intf.h"
-namespace matmul {
+namespace Gemm {
+namespace Impl {
+namespace Detail {
 using namespace AscendC;
 constexpr int32_t FIRST_16BIT_OFFSET_MM_API = 16;
@@ -32,7 +34,11 @@
 constexpr int32_t CACHE_LINE_SIZE_MM_API = 512;
 constexpr int32_t TRANS_DATA_ARRAY_SIZE_MM_API = 16;
 constexpr int32_t ANTI_QUANT_ALIGN_SIZE_MM_API = 32;
 constexpr int32_t MAX_BLOCK_COUNT_SIZE_MM_API = 4095;
-
+/*
+    CopyCubeIn is considered entirely experimental.
+    We retain the freedom to make incompatible changes, but do not guarantee the stability.
+    CopyCubeIn is only for internal usage, does not support extension or customized specialization!
+*/
 template
 class CopyCubeIn::IsNeedUB() &&
@@ -1583,5 +1589,7 @@ private:
     LocalTensor cacheHead2UB_; // Allocate and release using qidUBCache_
     int32_t cache2UBProc_ = 0;
 };
-} // namespace matmul
-#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_SET_UB_H
\ No newline at end of file
+} // namespace Detail
+} // namespace Impl
+} // namespace Gemm
+#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_SET_UB_H
diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h
index 604f32e9772518f606c2425ef710769bd94ecd48..95290033ef266a78923b4e636b9fa8f0dc4a55c7 100644
--- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h
+++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h
@@ -1,64 +1,68 @@
-/**
-* Copyright (c) 2024 Huawei Technologies Co., Ltd.
-* This file is a part of the CANN Open Software.
-* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
-* Please refer to the License for details. You may not use this file except in compliance with the License.
-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-* See LICENSE in the root of the software repository for the full text of the License.
-*/
-/*!
-* \file cube_in_buffer_utils.h -* \brief -*/ -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_UTILS_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_UTILS_H - -#include "../../feature_trait/matmul_feature_trait.h" - -namespace matmul { -enum class CopyCubeInType : uint8_t { - NONE = 0, - NORMAL = 1, - MDL, - BMM, - FROM_L1, -}; - -template -__aicore__ inline constexpr bool IsSameABTemplate() -{ - return DoMatmulIBShareNorm(MM_CFG) && INPUT_TYPE::ibShare; -} - -template -__aicore__ inline constexpr bool IsCopyFromUB() -{ - return PhyPosIsUB(INPUT_TYPE::pos) && MatmulFeatureTrait().IsSupportUBToL1(); -} - -template -__aicore__ inline constexpr CopyCubeInType GetCopyCubeInType() -{ - if constexpr (PhyPosIsL1(INPUT_TYPE::pos)) { - return CopyCubeInType::FROM_L1; - } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) { - return CopyCubeInType::NORMAL; - } else if constexpr (DoMatmulNorm(MM_CFG)) { - if constexpr (INPUT_TYPE::layout != LayoutMode::NONE && - ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1) { - return CopyCubeInType::BMM; - } else { - return CopyCubeInType::NORMAL; - } - } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { - return CopyCubeInType::MDL; - } else if constexpr (DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) { - return CopyCubeInType::NORMAL; - } else { - return CopyCubeInType::NONE; - } -} - -} +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ +/*! 
+* \file cube_in_buffer_utils.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_UTILS_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_UTILS_H + +#include "../../feature_trait/matmul_feature_trait.h" + +namespace Gemm { +namespace Impl { +namespace Detail { +enum class CopyCubeInType : uint8_t { + NONE = 0, + NORMAL = 1, + MDL, + BMM, + FROM_L1, +}; + +template +__aicore__ inline constexpr bool IsSameABTemplate() +{ + return DoMatmulIBShareNorm(MM_CFG) && INPUT_TYPE::ibShare; +} + +template +__aicore__ inline constexpr bool IsCopyFromUB() +{ + return PhyPosIsUB(INPUT_TYPE::pos) && MatmulFeatureTrait().IsSupportUBToL1(); +} + +template +__aicore__ inline constexpr CopyCubeInType GetCopyCubeInType() +{ + if constexpr (PhyPosIsL1(INPUT_TYPE::pos)) { + return CopyCubeInType::FROM_L1; + } else if constexpr (DoMatmulIBShareNorm(MM_CFG)) { + return CopyCubeInType::NORMAL; + } else if constexpr (DoMatmulNorm(MM_CFG)) { + if constexpr (INPUT_TYPE::layout != LayoutMode::NONE && + ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1) { + return CopyCubeInType::BMM; + } else { + return CopyCubeInType::NORMAL; + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + return CopyCubeInType::MDL; + } else if constexpr (DoMatmulBasicBlock(MM_CFG) || DoMatmulSpecialBasicBlock(MM_CFG)) { + return CopyCubeInType::NORMAL; + } else { + return CopyCubeInType::NONE; + } +} + +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // _COPY_CUBE_IN_UTILS_H_ \ No newline at end of file diff --git a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h index 3ec547f641f38be6a772cfe0dcaf3334f9ae4eef..2f6ebe4db172f1811622c602b14b4e7ff3914cfb 100644 --- a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h +++ b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h @@ -17,25 +17,109 @@ #include "../../matmul_module.h" #include "../../matmul_param.h" -#include "../../matmul_var.h" #include "copy_cube_in_utils.h" #include "copy_cube_in_params.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { using namespace AscendC; -constexpr int32_t INT4_TWO = 2; - template class DataCopyWrapper { using TransT = typename INPUT_TYPE::TRANS_T; using SrcT = typename INPUT_TYPE::T; - MATMUL_USE_MODULE_ON(MatmulVar, INPUT_TYPE::TAG); - MATMUL_USE_MODULE_ON(MatmulShapeInfo, INPUT_TYPE::TAG); MATMUL_USE_MODULE_ON(CopyCubeInParams, INPUT_TYPE::TAG); MATMUL_USE_MODULE_ON(MatmulTensorInfo, INPUT_TYPE::TAG); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulUserDefineInfo); + + template + __aicore__ constexpr enable_if_t GetStaticTileHeight() const + { + if constexpr ((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreM(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } + } else { + return 
MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(); + } + } + + template + __aicore__ constexpr enable_if_t GetStaticTileWidth() const + { + if constexpr ((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKa() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(); + } + } + + template + __aicore__ inline enable_if_t GetStaticTileHeight() const + { + if constexpr ((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(); + } + } + + template + __aicore__ inline enable_if_t GetStaticTileWidth() const + { + if constexpr ((INPUT_TYPE_ALIAS::layout != LayoutMode::NONE) && + (ToMatmulConfig(MM_CFG).batchMode != BatchMode::SINGLE_LARGE_THAN_L1)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetSingleCoreN(); + } + } else if constexpr (DoMatmulMDL(MM_CFG) || DoMatmulSpecialMDL(MM_CFG)) { + if constexpr (IS_TRANS) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepKb() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseK(); + } else { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetStepN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + } + } else { + return MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(); + } + } public: __aicore__ inline DataCopyWrapper() = default; @@ -49,596 +133,100 @@ public: if (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); (IMPL::CallBack::CopyA1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalTensor().address_), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); } else if (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); 
(IMPL::CallBack::CopyB1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalTensor().address_), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); #else if constexpr (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); (IMPL::CallBack::CopyA1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalTensor().address_), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); } else if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { LocalTensor tmpDst = dst.template ReinterpretCast(); (IMPL::CallBack::CopyB1Ptr)(tmpDst, - reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()), - curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulTensorInfo)->GetUserDefineInfo(), - MATMUL_MODULE(MatmulTensorInfo)->GetSelfDefineData()); + reinterpret_cast<__gm__ void *>(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalTensor().address_), + curRow, curCol, tileHeight, tileWidth, MATMUL_MODULE(MatmulUserDefineInfo)->GetUserDefineInfo(), + MATMUL_MODULE(MatmulUserDefineInfo)->GetSelfDefineData()); #endif } else { constexpr int32_t widthFactor = IsSameTypeV && INPUT_TYPE::format == CubeFormat::ND ? 
INT4_TWO : 1; - if (MATMUL_MODULE(MatmulShapeInfo)->template IsTranspose()) { + if (IsTranspose()) { if constexpr (IsCopyFromUB()) { LocalTensor src; - src.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); + src.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); CopyTileToCubeFromUB( dst, src, curCol, curRow, tileWidth, tileHeight / widthFactor, - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth() / widthFactor, - MATMUL_MODULE(MatmulShapeInfo)->template IsKRowDirec()); + MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth() / widthFactor, + MATMUL_MODULE(CopyCubeInParams)->template IsKRowDirec()); } else { GlobalTensor src; - src.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()); + src.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalTensor().address_); CopyTileToCubeFromGM(dst, src, curCol, curRow, tileWidth, tileHeight / widthFactor, - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth() / widthFactor, + MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth() / widthFactor, MATMUL_MODULE(CopyCubeInParams)->template GetStepCol(), - MATMUL_MODULE(MatmulShapeInfo)->template IsKRowDirec()); + MATMUL_MODULE(CopyCubeInParams)->template IsKRowDirec()); } } else { if constexpr (IsCopyFromUB()) { LocalTensor src; - src.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalAddr()); + src.SetAddr(MATMUL_MODULE(MatmulTensorInfo)->GetLocalTensor().address_); CopyTileToCubeFromUB( dst, src, curRow, curCol, tileHeight, tileWidth / widthFactor, - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth() / widthFactor, - MATMUL_MODULE(MatmulShapeInfo)->template IsKRowDirec()); + MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth() / widthFactor, + MATMUL_MODULE(CopyCubeInParams)->template IsKRowDirec()); } else { GlobalTensor src; - src.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalAddr()); + src.SetGlobalBuffer(MATMUL_MODULE(MatmulTensorInfo)->template GetGlobalTensor().address_); CopyTileToCubeFromGM( dst, src, curRow, curCol, tileHeight, tileWidth / widthFactor, - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetBaseWidth(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgHeight(), - MATMUL_MODULE(MatmulShapeInfo)->template GetOrgWidth() / widthFactor, + MATMUL_MODULE(CopyCubeInParams)->template GetBaseHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetBaseWidth(), + 
MATMUL_MODULE(CopyCubeInParams)->template GetOrgHeight(), + MATMUL_MODULE(CopyCubeInParams)->template GetOrgWidth() / widthFactor, MATMUL_MODULE(CopyCubeInParams)->template GetStepCol(), - MATMUL_MODULE(MatmulShapeInfo)->template IsKRowDirec()); - } - } - } - } - - __aicore__ inline void BatchCopyND2NZ(const LocalTensor &dst, const GlobalTensor &src, - const int row, const int col, const int height, const int width, const int gCol, const int ndNum = 1, - const int srcNdMatrixStride = 0, const int dstNzMatrixStride = 0, const bool kAlignToC0Size = false) - { -#ifdef ASCENDC_CPU_DEBUG - if (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { - LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); - } else if (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { - LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); -#else - if constexpr (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { - LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); - } else if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { - LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); -#endif - } else { -#if __CCE_AICORE__ >=220 - CopyND2NZ(dst, src, row, col, height, width, gCol, ndNum, srcNdMatrixStride, dstNzMatrixStride, kAlignToC0Size); -#endif - } - } - - __aicore__ inline void BatchCopyNZ2NZ(const LocalTensor& dst, const LocalTensor& src, int row, - int col, int height, int width, int gRow, bool kAlignToC0Size = false) - { - CopyNZ2NZ(dst, src, row, col, height, width, gRow); - } - - __aicore__ inline void BatchCopyNZ2NZ(const LocalTensor& dst, const GlobalTensor& src, - const int row, const int col, const int height, const int width, const int gRow, - const bool kAlignToC0Size = false) - { -#ifdef ASCENDC_CPU_DEBUG - if (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { - LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); - } else if (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { - LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); -#else - if constexpr (INPUT_TYPE::TAG == InputTypeTag::A && IMPL::CallBack::CopyA1Ptr) { - LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyA1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); - } else if constexpr (INPUT_TYPE::TAG == InputTypeTag::B && IMPL::CallBack::CopyB1Ptr) { - 
LocalTensor a1Tmp = dst.template ReinterpretCast(); - (IMPL::CallBack::CopyB1Ptr)(a1Tmp, reinterpret_cast<__gm__ void *>(src.address_), row, col, height, width, MATMUL_CONST_PARAM_VAR.tilingPtr_, - MATMUL_CONST_PARAM_VAR.dataPtr_); -#endif - } else { - CopyNZ2NZ(dst, src, row, col, height, width, gRow, kAlignToC0Size); - } - } - -#if __CCE_AICORE__ < 220 - __aicore__ inline void CopyND2NZOnTheFly(const LocalTensor& dst, const GlobalTensor& src, int row, - int col, int height, int width, int gCol) - { - ASSERT(gCol >= width && "Copy ND block gm->ub width larger than origin matrix width."); - int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero - int dstOffset = 0; - int64_t srcOffset = (static_cast(row) * static_cast(gCol) + static_cast(col)); - int calcWidthExr = Ceil(width, c0Size_); - int calcHeightExr = Ceil(height, BLOCK_CUBE); - -#if __CCE_AICORE__ == 200 - // set2d, pad tail zero - if (height % BLOCK_CUBE != 0) { - int64_t repeat = calcWidthExr * calcHeightExr; - if constexpr (IsSameType::value) { - LocalTensor tmp = dst.template ReinterpretCast(); - InitConstValueParams initConstValueParams; - initConstValueParams.repeatTimes = static_cast(repeat); - initConstValueParams.initValue = 0; - InitConstValue(tmp, initConstValueParams); - } else { - InitConstValueParams initConstValueParams; - initConstValueParams.repeatTimes = static_cast(repeat); - initConstValueParams.initValue = 0; - InitConstValue(dst, initConstValueParams); - } - PipeBarrier(); - } -#endif - - // gCol unaligned, can not use dma copy repeat stride - int tail = width % c0Size_; - if (tail) { - // tail elements that need to be pad zero - int blockLen = calcWidthExr * (c0Size_ * sizeof(SrcT) / DEFAULT_C0_SIZE); - - // gm->l1 - int src_gap = gCol * sizeof(SrcT) / ONE_BLK_SIZE - 1; - if (gCol % c0Size_ || src_gap >= UINT16_MAX) { - // each block len is only 32B - for (auto i = 0; i < calcWidth; i++) { - for (auto j = 0; j < height; j++) { - DataCopy(dst[dstOffset + i * calcHeightExr * BLOCK_CUBE * c0Size_ + j * c0Size_], - src[srcOffset + j * gCol + i * c0Size_], { 1, 1, 0, 0 }); - } - } - } else { - // data copy stride is aligned - for (auto i = 0; i < calcWidth; i++) { - DataCopy(dst[dstOffset], src[srcOffset], - { static_cast(height), 1, static_cast(src_gap), 0 }); - dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; - srcOffset += c0Size_; - } - } - - // tail gm->ub pad zero, and then ub->l1 - int32_t tileHeight; - if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) { - tileHeight = MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileHeight(); - } else { - tileHeight = MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileHeight(); - } - auto size = tileHeight * ONE_BLK_SIZE / sizeof(SrcT); - - LocalTensor trans; - if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) { - trans = MATMUL_PARAM_VAR.localWorkspace[MATMUL_PARAM_VAR.tiling_.GetTransLength()] - .template ReinterpretCast(); - } else { - trans = MATMUL_PARAM_VAR.localWorkspace[MATMUL_PARAM_VAR.nd2nz0ffset].template ReinterpretCast(); - } - trans.SetSize(size); - - int64_t tailSrcoffset = (int64_t)row * (int64_t)gCol + (int64_t)col + (int64_t)calcWidth * (int64_t)c0Size_; - - // gm->ub - for (auto i = 0; i < height; ++i) { - DataCopy(trans[i * c0Size_], src[tailSrcoffset], { 1, 1, 0, 0 }); - tailSrcoffset += gCol; - } - - event_t eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); - SetFlag(eventIDMte2ToV); - WaitFlag(eventIDMte2ToV); - - // tail pad zero - uint64_t mask[2]; - if constexpr 
(IsSameType::value) { - tail = Ceil(tail, 2); - } - uint16_t mask_tail = ~((1 << tail) - 1); - uint64_t masktail = mask_tail; - mask[0] = masktail + (masktail << 16) + (masktail << 32) + (masktail << 48); - mask[1] = mask[0]; - if (masktail != 0) { - if constexpr (IsSameType::value) { - LocalTensor tmpTrans = trans.template ReinterpretCast(); - Duplicate(tmpTrans, static_cast(0), mask, Ceil(height, 8), 1, 8); - } else { - Duplicate(trans, static_cast(0), mask, Ceil(height, 8), 1, 8); + MATMUL_MODULE(CopyCubeInParams)->template IsKRowDirec()); } } - - event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIDVToMte3); - WaitFlag(eventIDVToMte3); - - // ub->l1 - int heightAlignBlock = Ceil(height, BLOCK_CUBE); - int tailDstOffset = heightAlignBlock * BLOCK_CUBE * c0Size_ * calcWidth; - DataCopy(dst[tailDstOffset], trans, { static_cast(height), 1, 0, 0 }); - } else { - int src_gap = gCol * sizeof(SrcT) / ONE_BLK_SIZE - 1; - if (gCol % c0Size_ != 0 || src_gap >= UINT16_MAX) { - int64_t oriSrcOffset = srcOffset; - int oriDstOffset = dstOffset; - // each block len is only 32B - for (int i = 0; i < calcWidth; i++) { - for (int j = 0; j < height; j++) { - DataCopy(dst[dstOffset], src[srcOffset], { 1, 1, 0, 0 }); - dstOffset += c0Size_; - srcOffset += gCol; - } - srcOffset = oriSrcOffset + (i + 1) * c0Size_; - dstOffset = oriDstOffset + (i + 1) * calcHeightExr * BLOCK_CUBE * c0Size_; - } - } else { - // data copy stride is aligned - if constexpr (INPUT_TYPE::layout == LayoutMode::NORMAL) { - int32_t loop = height / MAX_BLOCK_COUNT_SIZE; - int32_t loopTail = height % MAX_BLOCK_COUNT_SIZE; - for (int i = 0; i < calcWidth; i++) { - int32_t dstOffsetTmp = dstOffset; - int32_t srcOffsetTmp = srcOffset; - for (int i = 0; i < loop; ++i) { - DataCopy( - dst[dstOffsetTmp], src[srcOffsetTmp], - { static_cast(MAX_BLOCK_COUNT_SIZE), 1, static_cast(src_gap), 0 }); - dstOffsetTmp += MAX_BLOCK_COUNT_SIZE * c0Size_; - srcOffsetTmp += MAX_BLOCK_COUNT_SIZE * gCol; - } - if (loopTail) { - DataCopy(dst[dstOffsetTmp], src[srcOffsetTmp], - { static_cast(loopTail), 1, static_cast(src_gap), 0 }); - } - dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; - srcOffset += c0Size_; - } - } else { - for (int i = 0; i < calcWidth; i++) { - DataCopy(dst[dstOffset], src[srcOffset], - { static_cast(height), 1, static_cast(src_gap), 0 }); - dstOffset += calcHeightExr * BLOCK_CUBE * c0Size_; - srcOffset += c0Size_; - } - } - } - event_t eventIDMte2ToMte1 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE1)); - SetFlag(eventIDMte2ToMte1); - WaitFlag(eventIDMte2ToMte1); - event_t eventIDMte1ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE1_MTE2)); - SetFlag(eventIDMte1ToMte2); - WaitFlag(eventIDMte1ToMte2); } } - template - __aicore__ inline int CopyNDBlock(const LocalTensor& transTensor, const GlobalTensor& src, int64_t srcOffset, - int height, int width, int gCol, bool isBankConflict) - { - ASCENDC_ASSERT((gCol >= width), - { KERNEL_LOG(KERNEL_ERROR, "gCol is %d, which should be no less than %d.", gCol, width); }); - int calcWidth = width / c0Size_; // cube block numbers that do not need to be pad zero - int c0Size = B16_C0SIZE; - if constexpr (sizeof(T) == sizeof(float)) { - c0Size = B32_C0SIZE; - } else if (sizeof(T) == sizeof(int8_t)) { - c0Size = B8_C0SIZE; - } - - // gCol unaligned - if (gCol % c0Size) { - calcWidth = Ceil(CeilAlign(width, c0Size), c0Size_); - int blockLen = CeilAlign(width, c0Size) * sizeof(T) / DEFAULT_C0_SIZE; - int dstOffset = 0; - int 
BankConflictPadSize = isBankConflict ? (32 / sizeof(T)) : 0; - - // data copy stride is unaligned, need to copy line by line - for (int i = 0; i < height; i++) { - DataCopy(transTensor[dstOffset], src[srcOffset], { 1, static_cast(blockLen), 0, 0 }); - dstOffset += (CeilAlign(width, c0Size) + BankConflictPadSize); - srcOffset += gCol; - } - - auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V); - SetFlag((event_t)enQueEvtID); - WaitFlag((event_t)enQueEvtID); - } else { - int srcStride = (gCol - width) * sizeof(T) / ONE_BLK_SIZE; - int blocklen = Ceil(width * sizeof(T), ONE_BLK_SIZE); - calcWidth = Ceil(CeilAlign(width, c0Size), c0Size_); - if (srcStride >= UINT16_MAX) { - int dstOffset = isBankConflict ? (width + c0Size) : width; - for (int i = 0; i < height; ++i) { - DataCopy(transTensor[i * dstOffset], src[srcOffset], { 1, static_cast(blocklen), 0, 0 }); - srcOffset += gCol; - } - } else { - uint16_t dstStride = isBankConflict ? 1 : 0; - int loopNum = Ceil(static_cast(height), MAX_BLOCK_COUNT_SIZE); - int tailCount = static_cast(height) % MAX_BLOCK_COUNT_SIZE; - for (int i = 0; i < loopNum; ++i) { - uint16_t blockCount = (i == loopNum - 1) ? tailCount : MAX_BLOCK_COUNT_SIZE; - DataCopy(transTensor[i * MAX_BLOCK_COUNT_SIZE * blocklen * ONE_BLK_SIZE / sizeof(T)], - src[srcOffset + i * MAX_BLOCK_COUNT_SIZE * blocklen * ONE_BLK_SIZE / sizeof(T)], - { blockCount, static_cast(blocklen), static_cast(srcStride), - dstStride }); - } - } - auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE2_V); - SetFlag((event_t)enQueEvtID); - WaitFlag((event_t)enQueEvtID); - } - return calcWidth; - } - - template - __aicore__ inline void NDPadZeros(LocalTensor& dst, int height, int calcWidth, int gCol, int width, - bool isBankConflict) - { - if (gCol % BLOCK_CUBE) { - int tail = width % c0Size_; - // tail pad zero - if (tail) { - auto offset = width / c0Size_ * c0Size_; - uint64_t mask[2]; - if constexpr (IsSameType::value) { - tail = Ceil(tail, 2); - offset /= 2; - } - uint16_t mask_tail = ~((1 << tail) - 1); - uint64_t masktail = mask_tail; - mask[0] = masktail + (masktail << 16) + (masktail << 32) + (masktail << 48); - mask[1] = mask[0]; - int stride = calcWidth * (c0Size_ * sizeof(T) / DEFAULT_C0_SIZE); - int32_t totalRep = Ceil(height, 8); - if (masktail != 0) { - if constexpr (IsSameType::value) { - LocalTensor tmpTransTensor = dst.template ReinterpretCast(); - if (stride < 32) { - if (totalRep <= MAX_REPEAT_TIMES) { - Duplicate(tmpTransTensor[offset], (int16_t)0, mask, Ceil(height, 8), stride, 8 * stride); - } else { - int32_t highBlock = totalRep / MAX_REPEAT_TIMES; - int32_t highTail = totalRep % MAX_REPEAT_TIMES; - int64_t dstOffset = calcWidth * BLOCK_CUBE * 8 * MAX_REPEAT_TIMES; - for (int32_t idx = 0; idx < highBlock; ++idx) { - Duplicate(tmpTransTensor[offset], (int16_t)0, mask, - MAX_REPEAT_TIMES, stride, 8 * stride); - offset += dstOffset; - } - if (highTail) { - Duplicate(tmpTransTensor[offset], (int16_t)0, mask, highTail, stride, 8 * stride); - } - } - } else { - for (int32_t i = 0; i < totalRep; ++i) { - Duplicate(tmpTransTensor[offset], (int16_t)0, mask, 1, stride, 0); - offset += stride * BLOCK_CUBE; - } - } - } else { - Duplicate(dst[offset], (T)0, mask, totalRep, stride, 8 * stride); - } - PipeBarrier(); - } - } - } - // If the value of high is not an integer multiple of 16, add 0. 
- int tailHigh = height % BLOCK_CUBE; - if (tailHigh) { - auto dstOffset = height * calcWidth * BLOCK_CUBE; - if constexpr (IsSameType::value) { - LocalTensor tmpDst = dst.template ReinterpretCast(); - Duplicate(tmpDst[dstOffset], (int16_t)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE); - } else { - Duplicate(dst[dstOffset], (T)0, (BLOCK_CUBE - tailHigh) * calcWidth * BLOCK_CUBE); - } - } - } +private: + constexpr static int32_t c0Size_ = AuxGetC0Size(); - __aicore__ inline void NDTrans2NZ(LocalTensor& dst, LocalTensor& src, int calcHigh, int calcWidth, - bool isBankConflict) + __aicore__ bool IsTailTile(int tileHeight, int tileWidth) { - // Use Muls, convert to NZ format - if constexpr (IsSameType::value) { - struct UnaryRepeatParams intriParams; - uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) }; - int blkStride = isBankConflict ? calcWidth + 1 : calcWidth; - intriParams.dstBlkStride = (c0Size_ * sizeof(SrcT) / DEFAULT_C0_SIZE); - intriParams.srcBlkStride = blkStride * (c0Size_ * sizeof(SrcT) / DEFAULT_C0_SIZE); - intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM; - intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM; - int dstOffset = 0; - int srcOffset = 0; - // ensure rep stride be less than 256 - constexpr int maxSrcBlkStride = 32; - LocalTensor tmpSrc = src.template ReinterpretCast(); - LocalTensor tmpDst = dst.template ReinterpretCast(); - if (intriParams.srcBlkStride >= maxSrcBlkStride) { - intriParams.dstBlkStride = 1; - intriParams.srcBlkStride = 1; - mask[0] = (1 << BLOCK_CUBE) - 1; - mask[1] = 0; - SetVectorMask(mask[1], mask[0]); - for (int i = 0; i < calcWidth; i++) { - for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) { - dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE; - srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE; - Muls(tmpDst[dstOffset], tmpSrc[srcOffset], (int16_t)1, mask, 1, intriParams); - } - } - } else { - SetVectorMask(mask[1], mask[0]); - int32_t totalRepTimes = 2 * calcHigh; - int32_t highBlock = totalRepTimes / MAX_REPEAT_TIMES; - int32_t highTail = totalRepTimes % MAX_REPEAT_TIMES; - for (int i = 0; i < calcWidth; i++) { - dstOffset = i * calcHigh * CUBE_MAX_SIZE; - srcOffset = i * BLOCK_CUBE; - for (int32_t idx = 0; idx < highBlock; ++idx) { - Muls(tmpDst[dstOffset], - tmpSrc[srcOffset], (int16_t)1, mask, MAX_REPEAT_TIMES, intriParams); - dstOffset += BLOCK_CUBE * MAX_REPEAT_TIMES * 8; - srcOffset += calcWidth * BLOCK_CUBE * MAX_REPEAT_TIMES * 8; - } - if (highTail) { - Muls(tmpDst[dstOffset], - tmpSrc[srcOffset], (int16_t)1, mask, highTail, intriParams); - } - } - } + if (IsTranspose()) { + return GetStaticTileHeight() != tileHeight || GetStaticTileWidth() != tileWidth; } else { - const int c0Count = AscendCUtils::GetC0Count(sizeof(SrcT)); - struct UnaryRepeatParams intriParams; - uint64_t mask[2] = { uint64_t(-1), uint64_t(-1) }; - int32_t padBlock = 1; - if constexpr (IsSameTypeV && IsSameTypeV) { - padBlock = 2; - } - int blkStride = isBankConflict ? 
calcWidth + padBlock : calcWidth; - intriParams.dstBlkStride = (BLOCK_CUBE * sizeof(SrcT) / DEFAULT_C0_SIZE); - intriParams.srcBlkStride = blkStride * BLOCK_CUBE * sizeof(SrcT) / DEFAULT_C0_SIZE; - intriParams.dstRepStride = intriParams.dstBlkStride * DEFAULT_BLK_NUM; - intriParams.srcRepStride = intriParams.srcBlkStride * DEFAULT_BLK_NUM; - int dstOffset = 0; - int srcOffset = 0; - // ensure rep stride be less than 256 - constexpr int maxSrcBlkStride = 32; - if (intriParams.srcBlkStride >= maxSrcBlkStride) { - intriParams.dstBlkStride = 1; - intriParams.srcBlkStride = 1; - mask[0] = (1 << BLOCK_CUBE) - 1; - mask[1] = 0; - SetVectorMask(mask[1], mask[0]); - for (int i = 0; i < calcWidth; i++) { - for (int j = 0; j < calcHigh * BLOCK_CUBE; ++j) { - dstOffset = i * calcHigh * CUBE_MAX_SIZE + j * BLOCK_CUBE; - srcOffset = j * blkStride * BLOCK_CUBE + i * BLOCK_CUBE; - Muls(dst[dstOffset], src[srcOffset], (SrcT)1, mask, 1, intriParams); - if constexpr (sizeof(SrcT) == sizeof(float)) { - Muls( - dst[dstOffset + c0Count], src[srcOffset + c0Count], (SrcT)1, mask, 1, intriParams); - } - } - } - } else { - SetVectorMask(mask[1], mask[0]); - for (int i = 0; i < calcWidth; i++) { - dstOffset = i * calcHigh * CUBE_MAX_SIZE; - srcOffset = i * BLOCK_CUBE; - Muls(dst[dstOffset], src[srcOffset], (SrcT)1, mask, 2 * calcHigh, intriParams); - if constexpr (sizeof(SrcT) == sizeof(float)) { - Muls( - dst[dstOffset + c0Count], src[srcOffset + c0Count], (SrcT)1, mask, 2 * calcHigh, intriParams); - } - } - } + return GetStaticTileHeight() != tileHeight || GetStaticTileWidth() != tileWidth; } } - __aicore__ inline void CopyND2NZ(const LocalTensor& dst, const GlobalTensor& src, int row, int col, - int height, int width, int gCol, int ndNum = 1, bool kAlignToC0Size = false) - { - LocalTensor transTensor; - transTensor = MATMUL_CONST_PARAM_VAR.localWorkspace[0].template ReinterpretCast(); - transTensor.SetSize(MATMUL_CONST_PARAM_VAR.tiling_.GetTransLength()); - LocalTensor trans; - trans = MATMUL_CONST_PARAM_VAR.localWorkspace[MATMUL_CONST_PARAM_VAR.tiling_.GetTransLength()] - .template ReinterpretCast(); - trans.SetSize(MATMUL_CONST_PARAM_VAR.tiling_.GetTransLength()); - - auto srcOffset = ((int64_t)row * (int64_t)gCol + (int64_t)col); - - bool isBankConflict = Ceil(width, c0Size_) * 32 % 512 == 0 && Ceil(width, c0Size_) < 32 ? true : false; - - int calcHigh = Ceil(height, BLOCK_CUBE); - auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::V_MTE2); - SetFlag(enQueEvtID); - WaitFlag(enQueEvtID); - int calcWidth = CopyNDBlock(transTensor, src, srcOffset, height, width, gCol, isBankConflict); - int padWidth = isBankConflict ? 
calcWidth + 1 : calcWidth; - int size = calcHigh * padWidth * BLOCK_CUBE * c0Size_ / AuxGetFactor();; - - transTensor.SetSize(size); - trans.SetSize(size); - const_cast&>(dst).SetSize(size); - - NDPadZeros(transTensor, height, padWidth, gCol, width, isBankConflict); - NDTrans2NZ(trans, transTensor, calcHigh, calcWidth, isBankConflict); - - event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIDVToMte3); - WaitFlag(eventIDVToMte3); - DataCopy(dst, trans, size); - enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V); - SetFlag(enQueEvtID); - WaitFlag(enQueEvtID); - }; -#endif - -private: -#if __CCE_AICORE__ < 220 - int32_t orgHeight_; // or M - int32_t orgWidth_; // or K - int32_t baseHeight_; // or baseK - int32_t baseWidth_; // or baseM - int32_t stepCol_; -#endif - constexpr static int32_t c0Size_ = AuxGetC0Size(); - - __aicore__ bool IsTailTile(int tileHeight, int tileWidth) + template + __aicore__ inline bool IsTranspose() { - if (MATMUL_MODULE(MatmulShapeInfo)->IsTranspose()) { - return MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileHeight() != tileHeight || - MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileWidth() != tileWidth; + if constexpr(INPUT_TYPE::TAG == InputTypeTag::A) { + return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeA(); } else { - return MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileHeight() != tileHeight || - MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileWidth() != tileWidth; + return MATMUL_MODULE(MatmulShapeInfo)->template IsTransposeB(); } } @@ -845,8 +433,8 @@ private: }); int32_t dstNzC0Stride = 0; if constexpr (IsStaticPaddingEnable(MM_CFG)) { - int32_t tileHeight = MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileHeight(); - int32_t tileWidth = MATMUL_MODULE(CopyCubeInParams)->template GetStaticTileWidth(); + int32_t tileHeight = GetStaticTileHeight(); + int32_t tileWidth = GetStaticTileWidth(); if (tileHeight != height || tileWidth != width) { StaticPadNd2Nz(dst, tileHeight, tileWidth, height, width); dstNzC0Stride = tileHeight; @@ -934,5 +522,7 @@ private: } #endif }; -} // namespace matmul +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_DATA_COPY_WRAPPER_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h index 994f8d43402d58ed9e2c6cc0d10c0be7f1b845b5..6724203ec5e7877d924f98fd9b1b73bc09f263d5 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h @@ -1,1005 +1,1060 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file copy_cube_out_datacopy.h - * \brief - */ - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H - -#include "../../matmul_module.h" -#include "../../matmul_param.h" -#include "../../resource/cube_out_buffer/cube_out_buffer.h" -#include "../../matmul_var.h" -#include "copy_cube_intf.h" - -namespace matmul { - -constexpr int DOUBLE_SPACE = 2; -constexpr int TRIPLE_SPACE = 3; -constexpr int TWO_TIMES = 2; -constexpr int EIGHT_TIMES = 8; -constexpr int SHIFT_16_BIT = 16; -constexpr int SHIFT_32_BIT = 32; -constexpr int SHIFT_48_BIT = 48; -constexpr uint32_t MAX_REPEAT_STRIDE = 255; -constexpr int PATTERN_SIZE = 8; -constexpr int PATTERN_OFFSET = 2; - -template -class CopyCubeOut -{ - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; - - MATMUL_USE_MODULE(CubeOutBuffer); - MATMUL_USE_MODULE(QuantProcessor); - MATMUL_USE_MODULE(MatmulVarC); - MATMUL_USE_MODULE(MatmulShapeInfoC); - -public: - __aicore__ inline void CopyOut(const GlobalTensor& gm, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) - { - CopyOutImpl(gm, co1Local, curM, curN, enSequentialWrite); - } - - __aicore__ inline void CopyOut(const LocalTensor& co2Local, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) - { - CopyOutImpl(co2Local, co1Local, curM, curN, enSequentialWrite); - } - - __aicore__ inline void CopyOut(const GlobalTensor& gm, const LocalTensor& co2Local, - const LocalTensor& co1Local, int curM, int curN, bool enSequentialWrite) {} - -private: - template - __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) - { - if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - CopyOutNZ2ND(dst, co1Local, curM, curN, enSequentialWrite); - } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - CopyOutNZ2NZ(dst, co1Local, curM, curN, enSequentialWrite); - } else { - ASCENDC_ASSERT(false, {KERNEL_LOG(KERNEL_ERROR, "CopyOut: unsupport Matmul format type.");}); - } - } - - __aicore__ inline void CopyOutNZ2NZ(const LocalTensor& co2Local, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) - { - ASCENDC_ASSERT((MATMUL_CAST_TO_IMPL()->M_ >= MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight()), { - KERNEL_LOG(KERNEL_ERROR, "M_ is %d , which should be not less than baseM %d", - MATMUL_CAST_TO_IMPL()->M_, MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight()); - }); - - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(); - dataCopyInfo.blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight(); - dataCopyInfo.srcStride = 0; - DataCopyEnhancedParams enhancedParams; - enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; - if (enSequentialWrite) { - dataCopyInfo.dstStride = 0; - CopyCo12Co2WithQuant(co2Local, co1Local, curN, dataCopyInfo, enhancedParams); - } else { - dataCopyInfo.dstStride = (Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetSingleHeight(), BLOCK_CUBE) * BLOCK_CUBE - - MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * BLOCK_CUBE) * BLOCK_CUBE * sizeof(DstT) / - ONE_BLK_SIZE; - int dstOffset = curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * BLOCK_CUBE + - curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth() * MATMUL_CAST_TO_IMPL()->M_; - CopyCo12Co2WithQuant(co2Local[dstOffset], co1Local, curN, 
dataCopyInfo, enhancedParams); - } - } - - __aicore__ inline void CopyOutNZ2NZ(const GlobalTensor& gm, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) - { - event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventIDMte3ToV); - WaitFlag(eventIDMte3ToV); - - LocalTensor localBuf = GetLocalBuf(); - CopyCo12Local(localBuf, co1Local, curN); - - event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIDVToMte3); - WaitFlag(eventIDVToMte3); - - CopyLocal2GMNZ2NZ(gm, localBuf, curM, curN, enSequentialWrite); - } - - __aicore__ inline void CopyOutNZ2ND(const LocalTensor& co2Local, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) - { - if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - ASCENDC_ASSERT((MATMUL_CAST_TO_IMPL()->M_ == 1), - { KERNEL_LOG(KERNEL_ERROR, "M_ is %d, which should be equal with 1.", MATMUL_CAST_TO_IMPL()->M_); }); - - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = 1; - dataCopyInfo.blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * - MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(); - DataCopyEnhancedParams enhancedParams; - enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; - - if (enSequentialWrite) { - DataCopy(co2Local, co1Local, dataCopyInfo, enhancedParams); - } else { - int dstOffset = curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth(); - DataCopy(co2Local[dstOffset], co1Local, dataCopyInfo, enhancedParams); - } - } else { - ASCENDC_ASSERT((!IsSameType::value && !IsSameType::value), - { KERNEL_LOG(KERNEL_ERROR, "Data format should be NZ if GetTensorC to UB when output is int8_t."); }); - ASCENDC_ASSERT((MATMUL_MODULE(MatmulVarC)->GetCacheUBWorkspaceAddr() != nullptr), { KERNEL_LOG(KERNEL_ERROR, - "do not give ub workspace, Data format should be NZ if GetTensorC to UB."); }); - - LocalTensor trans; - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - MATMUL_MODULE(MatmulVarC)->GetCo2Offset() += MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE; - } - trans = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetCo2Offset()].template ReinterpretCast(); - trans.SetSize(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth()); - - CopyCo12Co2WithoutQuant(trans, co1Local, curN); - - if (enSequentialWrite) { - TransNZ2NDForDstUB(co2Local, trans, MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth()); - } else { - uint32_t dimN = (MATMUL_CAST_TO_IMPL()->Kc_ != 0) ? MATMUL_CAST_TO_IMPL()->Kc_ : MATMUL_CAST_TO_IMPL()->N_; - int dstOffset = curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * dimN + - curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth(); - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? 
BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - TransNZ2NDForDstUB(co2Local[dstOffset], trans, Ceil(dimN, blockCount) * blockCount); - } - } - } - - __aicore__ inline void CopyOutNZ2ND(const GlobalTensor& gm, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) - { - event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventIDMte3ToV); - WaitFlag(eventIDMte3ToV); - - LocalTensor localBuf = GetLocalBuf(); - CopyCo12Local(localBuf, co1Local, curN); - - event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIDVToMte3); - WaitFlag(eventIDVToMte3); - - if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - CopyLocal2GMNZ2NZ(gm, localBuf, curM, curN, enSequentialWrite); - } else if (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - if constexpr (!ToMatmulConfig(MM_CFG).enVecND2NZ || - IsSameType::value && IsSameType::value) { - CopyLocal2GMNZ2NDOnTheFly(gm, localBuf, curM, curN, enSequentialWrite); - } else { - CopyLocal2GMNZ2NDByVec(gm, localBuf, curM, curN, enSequentialWrite); - } - } - } - - __aicore__ inline void CopyCo12Co2WithQuant(const LocalTensor& dst, const LocalTensor& src, - int curN, DataCopyParams& dataCopyInfo, DataCopyEnhancedParams& enhancedParams) - { - if constexpr (IsSameType::value) { - UpdateDataCopyParamForQuant(enhancedParams, curN); - uint64_t alignedHeight = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * BLOCK_CUBE; - if (MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8) { - dataCopyInfo.blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight(); - uint64_t addr = enhancedParams.deqTensorAddr; - for (int i = 0; i < Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(), TWO_TIMES); ++i) { - for (int storeMode = 0; storeMode < TWO_TIMES; ++storeMode) { - if (MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth() % TWO_TIMES != 0 && - i == Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(), TWO_TIMES) - 1 && - storeMode == 1) { - continue; - } - enhancedParams.deqTensorAddr = addr + i * ONE_BLK_SIZE * ONE_BYTE_BIT_SIZE + storeMode * BLOCK_CUBE * ONE_BYTE_BIT_SIZE; - enhancedParams.sidStoreMode = (uint8_t)storeMode; - DataCopy(dst[i * ONE_BLK_SIZE * alignedHeight], - src[i * ONE_BLK_SIZE * alignedHeight + storeMode * BLOCK_CUBE * alignedHeight], - dataCopyInfo, enhancedParams); - } - } - } else if (MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::REQ8) { - dataCopyInfo.blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight(); - uint64_t addr = enhancedParams.deqTensorAddr; - for (int i = 0; i < Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(), TWO_TIMES); ++i) { - for (int storeMode = 0; storeMode < TWO_TIMES; ++storeMode) { - if (MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth() % TWO_TIMES != 0 && - i == Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(), TWO_TIMES) - 1 && - storeMode == 1) { - continue; - } - enhancedParams.sidStoreMode = (uint8_t)storeMode; - DataCopy(dst[i * ONE_BLK_SIZE * alignedHeight], - src[i * ONE_BLK_SIZE * alignedHeight + storeMode * BLOCK_CUBE * alignedHeight], - dataCopyInfo, enhancedParams); - } - } - } else if (MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VDEQF16) { - dataCopyInfo.blockCount = 1; - dataCopyInfo.blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight(); - dataCopyInfo.dstStride = 0; - uint64_t addr = 
enhancedParams.deqTensorAddr; - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(); ++i) { - constexpr int DEQ_OFFSET = 128; - enhancedParams.deqTensorAddr = addr + i * DEQ_OFFSET; - DataCopy(dst[i * BLOCK_CUBE * alignedHeight], src[i * BLOCK_CUBE * alignedHeight], dataCopyInfo, enhancedParams); - } - } else { - DataCopy(dst, src, dataCopyInfo, enhancedParams); - } - } else { - DataCopy(dst, src, dataCopyInfo, enhancedParams); - } - } - - __aicore__ inline void CopyCo12Co2WithoutQuant(const LocalTensor& dst, const LocalTensor& src, int curN) - { - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(); - dataCopyInfo.blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight(); - dataCopyInfo.srcStride = 0; - dataCopyInfo.dstStride = 0; - DataCopyEnhancedParams enhancedParams; - enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; - if constexpr (IsSameType::value) { - UpdateDataCopyParamForQuant(enhancedParams, curN); - } - DataCopy(dst, src, dataCopyInfo, enhancedParams); - } - - __aicore__ inline void CopyCo12Local(const LocalTensor& localBuf, const LocalTensor& co1Local, int curN) - { - DataCopyParams dataCopyInfo; - dataCopyInfo.blockCount = 1; - dataCopyInfo.blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * - MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(); - DataCopyEnhancedParams enhancedParams; - if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; - } else { - enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; - ASCENDC_ASSERT((localBuf.GetSize() >= dataCopyInfo.blockLen * CUBE_MAX_SIZE), { - KERNEL_LOG(KERNEL_ERROR, "copy len is %d, which should be less than dst size %d", - dataCopyInfo.blockLen * CUBE_MAX_SIZE, localBuf.GetSize()); - }); - } - CopyCo12Co2WithQuant(localBuf, co1Local, curN, dataCopyInfo, enhancedParams); - } - - __aicore__ inline void CopyLocal2GMNZ2NZNotSeq(const GlobalTensor& gm, const LocalTensor& localBuf, - int curM, int curN) - { - ASCENDC_ASSERT((MATMUL_CAST_TO_IMPL()->M_ >= MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight()), { - KERNEL_LOG(KERNEL_ERROR, "M_ is %d, baseUseM_ is %d, M_ should be no less than baseUseM_", - MATMUL_CAST_TO_IMPL()->M_, MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight()); - }); - int64_t alignM; - int alignBaseUseM; - if constexpr (C_TYPE::format == CubeFormat::NZ) { - // nz2nz - alignM = Ceil(MATMUL_CAST_TO_IMPL()->M_, BLOCK_CUBE) * BLOCK_CUBE; - alignBaseUseM = Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(), BLOCK_CUBE) * BLOCK_CUBE; - } else { - // nz2nd A is vector - alignM = MATMUL_CAST_TO_IMPL()->M_; - alignBaseUseM = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); - } - - int64_t dstOffset; - int64_t dstStride; - int blockLen; - int blockCount; - if constexpr (IsSameType::value || IsSameType::value) { - dstOffset = curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth() * alignM + - curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * ONE_BLK_SIZE; - dstStride = (alignM - alignBaseUseM) * sizeof(DstT); - blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * BLOCK_CUBE * sizeof(DstT); - blockCount = Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(), TWO_TIMES); - } else { - dstOffset = curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth() * alignM + - curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * BLOCK_CUBE; - dstStride = (alignM - alignBaseUseM) * sizeof(DstT) * 
BLOCK_CUBE / ONE_BLK_SIZE; - blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * BLOCK_CUBE * sizeof(DstT) * - BLOCK_CUBE / ONE_BLK_SIZE; - blockCount = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(); - } - - if (dstStride >= UINT16_MAX) { - int srcStride; - if constexpr (IsSameType::value || IsSameType::value) { - dstStride = alignM * ONE_BLK_SIZE; - srcStride= MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight() * ONE_BLK_SIZE; - } else { - dstStride = alignM * BLOCK_CUBE; - srcStride = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight() * BLOCK_CUBE; - } - for (int i = 0; i < blockCount; ++i) { - DataCopy(gm[dstOffset + i * dstStride], localBuf[i * srcStride], { 1, static_cast(blockLen), 0, 0 }); - } - } else { - DataCopy(gm[dstOffset], localBuf, { static_cast(blockCount), static_cast(blockLen), 0, - static_cast(dstStride) }); - } - } - - __aicore__ inline void CopyLocal2GMNZ2NZSeq(const GlobalTensor& gm, const LocalTensor& localBuf) - { - int blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight() * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; - DataCopy(gm, localBuf, { static_cast(MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth()), - static_cast(blockLen), 0, 0 }); - } - - __aicore__ inline void CopyLocal2GMNZ2NZ(const GlobalTensor& gm, const LocalTensor& localBuf, - int curM, int curN, bool enSequentialWrite) - { - if (enSequentialWrite) { - CopyLocal2GMNZ2NZSeq(gm, localBuf); - } else { - CopyLocal2GMNZ2NZNotSeq(gm, localBuf, curM, curN); - } - } - - __aicore__ inline void TransNZ2NDForDstUB(const LocalTensor& co2Local, const LocalTensor& trans, - int dstStride) - { - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - DataCopyParams dataCopyInfo { - static_cast(MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth()), - static_cast(blockCount * sizeof(DstT) / ONE_BLK_SIZE), - static_cast((MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * BLOCK_CUBE * blockCount - - blockCount) * sizeof(DstT) / ONE_BLK_SIZE), - 0 - }; - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); i++) { - DataCopy(co2Local[i * dstStride], trans[i * blockCount], dataCopyInfo); - } - } - - __aicore__ inline void CopyLocal2GMNZ2NDByVec(const GlobalTensor& gm, const LocalTensor& localBuf, - int curM, int curN, bool enSequentialWrite) - { - uint32_t dimN = (MATMUL_CAST_TO_IMPL()->Kc_ != 0) ? MATMUL_CAST_TO_IMPL()->Kc_ : MATMUL_CAST_TO_IMPL()->N_; - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? 
BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - - LocalTensor trans; - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - trans = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * TRIPLE_SPACE].template ReinterpretCast(); - } else { - trans = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength()].template ReinterpretCast(); - } - int transSize = localBuf.GetSize(); - if constexpr (IsSameType::value || IsSameType::value) { - if (MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth() % TWO_TIMES != 0) { - transSize += MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * CUBE_MAX_SIZE; - } - } - trans.SetSize(transSize); - - int dstOffset; - int dstStride; - int offset; - bool isGmAligned; - if (enSequentialWrite) { - dstOffset = 0; - dstStride = 0; - offset = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth(); - isGmAligned = ((MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount) == 0); - } else { - int width = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth() * blockCount; - if constexpr (IsSameType::value || IsSameType::value) { - width = width / TWO_TIMES; - } - ASCENDC_ASSERT((dimN >= width), - { KERNEL_LOG(KERNEL_ERROR, "dimN is %d, width is %d, dimN should be no less than width", dimN, width); }); - if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { - isGmAligned = 1; - } else { - isGmAligned = ((dimN % blockCount) == 0 && (MATMUL_MODULE(MatmulShapeInfoC)->GetSingleWidth() % blockCount) == 0); - } - - dstOffset = curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * dimN + curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth(); - dstStride = (dimN - width) * sizeof(DstT) / ONE_BLK_SIZE; - offset = dimN; - } - bool isTargetAligned = (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount) == 0; - const bool isComputeLineByLine = (!isGmAligned || dstStride >= UINT16_MAX); - - // 1 if target is not aligned, must copy the unalign data to trans UB - if constexpr (IsSameType::value) { - bool isOdd = false; - if constexpr (IsSameType::value || IsSameType::value) { - if (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % TWO_TIMES > 0) { - isOdd = true; - } - } - bool isSingleCore = MATMUL_CAST_TO_IMPL()->M_ <= MATMUL_MODULE(MatmulShapeInfoC)->GetSingleHeight() && dimN <= MATMUL_MODULE(MatmulShapeInfoC)->GetSingleWidth(); - bool isMutiCoreNeedPad = !isSingleCore && !isComputeLineByLine; - if (!isTargetAligned && (isSingleCore || isMutiCoreNeedPad) && !isOdd) { - PadUnalignedToTrans(trans, gm, dstOffset, isComputeLineByLine, enSequentialWrite); - } - } else { - if (!isTargetAligned) { - PadUnalignedToTrans(trans, gm, dstOffset, isComputeLineByLine, enSequentialWrite); - } - } - - // 2. trans nz buffer to nd buffer - TransNZ2NDByVec(trans, localBuf, MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight(), MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(), (DstT)1.0); - - // 3. 
copy trans buffer to gm - int blockLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth() * (blockCount * sizeof(DstT) / ONE_BLK_SIZE); - if constexpr (IsSameType::value || IsSameType::value) { - blockLen = Ceil(blockLen, TWO_TIMES); - } - if (isComputeLineByLine) { - if constexpr (IsSameType::value) { - if (!enSequentialWrite) { - dstOffset = curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * MATMUL_CAST_TO_IMPL()->N_ + curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth(); - offset = MATMUL_CAST_TO_IMPL()->N_; - } - int newBlockCount; - if constexpr (IsSameType::value || IsSameType::value) { - newBlockCount = BLOCK_CUBE; - } else { - newBlockCount = ONE_BLK_SIZE / sizeof(DstT); - } - if (isTargetAligned) { - CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, blockLen, newBlockCount, offset); - } else { - if (blockLen == 1) { - CopyTrans2GMByVecByLineUnalignOneBlock(gm[dstOffset], trans, blockLen, newBlockCount, offset); - } else { - if constexpr (IsSameType::value || IsSameType::value) { - CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, blockLen, newBlockCount, offset); - } else { - CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, blockLen, newBlockCount, offset); - } - } - } - } else { - CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, blockLen, ONE_BLK_SIZE / sizeof(DstT), offset); - } - } else { - CopyTrans2GMByVecNormal(gm[dstOffset], trans, blockLen, dstStride); - } - } - - __aicore__ inline void PadUnalignedToTrans(const LocalTensor& trans, const GlobalTensor& gm, - int dstOffset, bool isComputeLineByLine, bool enSequentialWrite) - { - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - int32_t alignedSize; - if constexpr (IsSameType::value || IsSameType::value) { - alignedSize = GetC0Size(); - } else { - alignedSize = BLOCK_CUBE; - } - int baseUseN = Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth(), alignedSize) * alignedSize; - int gmTailOffset = dstOffset + baseUseN - blockCount; - int transTailOffset = baseUseN - blockCount; - - auto enQueEvtID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); - SetFlag(enQueEvtID); - WaitFlag(enQueEvtID); - - if (isComputeLineByLine) { - if (enSequentialWrite) { - PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, baseUseN); - } else { - PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, MATMUL_CAST_TO_IMPL()->N_); - } - } else { - PadUnalignedToTransWithStride(trans[transTailOffset], gm[gmTailOffset]); - } - - // if copy gm to ub, must add the set/wait flag to wait the UB has be writed; - event_t eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); - SetFlag(eventIDMte2ToV); - WaitFlag(eventIDMte2ToV); - } - - __aicore__ inline void PadUnalignedToTransByLine(const LocalTensor& trans, const GlobalTensor& gm, - int transStride, int gmStride) - { - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - // copy gm to trans one line by one line - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); ++i) { - DataCopy(trans[i * transStride], gm[i * gmStride], { static_cast(1), - static_cast(blockCount * sizeof(DstT) / ONE_BLK_SIZE), 0, 0 }); - } - } - - __aicore__ inline void PadUnalignedToTransWithStride(const LocalTensor& trans, const GlobalTensor& gm) - { - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? 
BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - // copy gm to trans with stride - DataCopy(trans, gm, { static_cast(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight()), static_cast(1), - static_cast(MATMUL_CAST_TO_IMPL()->N_ / blockCount - 1), static_cast(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() / blockCount) }); - } - - __aicore__ inline auto TransNZ2NDByVec(const LocalTensor& trans, const LocalTensor& localBuf, - int blockHigh, int blockWidth, DstT scalar) - { - event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventIDMte3ToV); - WaitFlag(eventIDMte3ToV); - // B32's block count is 16 - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); - ASCENDC_ASSERT(((blockWidth * blockCount * sizeof(DstT) / ONE_BLK_SIZE) <= MAX_REPEAT_TIMES), { - KERNEL_LOG(KERNEL_ERROR, "blockWidth is %d, blockCount is %d, repeat time exceed max time %d", blockWidth, - blockCount, MAX_REPEAT_TIMES); - }); - if constexpr (IsSameType::value || IsSameType::value) { - struct UnaryRepeatParams intriParams; - int widthAlign = TWO_TIMES; - int offsetWidth = Ceil(blockWidth, widthAlign) * widthAlign; - intriParams.dstBlkStride = Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth(), ONE_BLK_SIZE); - intriParams.srcBlkStride = 1; - uint32_t dstRepStride = Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() * sizeof(DstT), ONE_BLK_SIZE) * EIGHT_TIMES; - intriParams.dstRepStride = dstRepStride; - bool isBeyondMaxStride = false; - if (dstRepStride > MAX_REPEAT_STRIDE) { - isBeyondMaxStride = true; - } - intriParams.srcRepStride = (blockCount * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; - int dstOffset = 0; - int srcOffset = 0; - int highBlock = MAX_REPEAT_TIMES; - int highBlocks = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES / highBlock; - int highTail = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES % highBlock; - uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; - // mov src to dst width aligned - LocalTensor tmpSrc = localBuf.template ReinterpretCast(); - LocalTensor tmpDst = trans.template ReinterpretCast(); - SetVectorMask(mask[1], mask[0]); - constexpr int64_t srcOffsetStride = BLOCK_CUBE * EIGHT_TIMES; - const int64_t dstOffsetStride = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth() * BLOCK_CUBE * EIGHT_TIMES / - TWO_TIMES; - for (int i = 0; i < Ceil(blockWidth, TWO_TIMES); ++i) { - if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { - // if the MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() is not aligned, set the mask value; - if (i == (Ceil(blockWidth, TWO_TIMES) - 1) && (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount != 0)) { - uint64_t masktail = (1 << (Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount, TWO_TIMES))) - 1; - mask[0] = masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << SHIFT_48_BIT); - mask[1] = mask[0]; - SetVectorMask(mask[1], mask[0]); - } - } - int dstMulsOffset = dstOffset; - for (int j = 0; j < highBlocks; ++j) { - Muls(tmpDst[dstMulsOffset], tmpSrc[srcOffset], (int16_t)scalar, mask, - highBlock, intriParams); - srcOffset += highBlock * BLOCK_CUBE; - dstMulsOffset += blockWidth * blockCount * highBlock; - } - if (highTail) { - if (isBeyondMaxStride) { - int tmpSrcOffset = srcOffset; - for (int j = 0; j < highTail; j++) { - Muls(tmpDst[dstMulsOffset], - tmpSrc[tmpSrcOffset], (int16_t)scalar, mask, 1, intriParams); - dstMulsOffset += dstOffsetStride; - tmpSrcOffset += srcOffsetStride; - } - } else { - 
Muls(tmpDst[dstMulsOffset], tmpSrc[srcOffset], (int16_t)scalar, mask, - highTail, intriParams); - } - srcOffset += highTail * BLOCK_CUBE * EIGHT_TIMES; - } - dstOffset += BLOCK_CUBE; - } - } else { - struct UnaryRepeatParams intriParams; - - int dstOffset = 0; - int srcOffset = 0; - int highBlock = MAX_REPEAT_TIMES; - int highBlocks = 0; - int highTail = 0; - int32_t srcStride = highBlock * blockCount; - int32_t dstStride = blockWidth * blockCount * highBlock; - bool isBeyondMaxStride = false; - uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; - - if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { - intriParams.dstBlkStride = 1; - intriParams.srcBlkStride = 1; - intriParams.dstRepStride = blockWidth * blockCount * sizeof(DstT) / ONE_BLK_SIZE; - intriParams.srcRepStride = blockCount * sizeof(DstT) / ONE_BLK_SIZE; - highBlocks = (blockHigh * blockCount) / highBlock; - highTail = (blockHigh * blockCount) % highBlock; - mask[0] = static_cast((1<< blockCount) - 1); - mask[1] = 0; - } else { - intriParams.dstBlkStride = blockWidth; - intriParams.srcBlkStride = 1; - uint32_t dstRepStride = (blockWidth * blockCount * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; - intriParams.dstRepStride = dstRepStride; - if (dstRepStride > MAX_REPEAT_STRIDE) { - isBeyondMaxStride = true; - } - intriParams.srcRepStride = (blockCount * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; - highBlocks = (blockHigh * blockCount) / EIGHT_TIMES / highBlock; - highTail = (blockHigh * blockCount) / EIGHT_TIMES % highBlock; - srcStride *= EIGHT_TIMES; - dstStride *= EIGHT_TIMES; - } - SetVectorMask(mask[1], mask[0]); - for (int i = 0; i < blockWidth; ++i) { - if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { - // if the MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() is not aligned, set the mask value; - if (i == (blockWidth - 1) && (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount != 0)) { - uint64_t masktail = (1 << (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount)) - 1; - mask[0] = masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << SHIFT_48_BIT); - mask[1] = mask[0]; - SetVectorMask(mask[1], mask[0]); - } - } - int dstMulsOffset = dstOffset; - for (int j = 0; j < highBlocks; ++j) { - Muls(trans[dstMulsOffset], localBuf[srcOffset], scalar, mask, highBlock, intriParams); - srcOffset += srcStride; - dstMulsOffset += dstStride; - } - if (highTail) { - if (isBeyondMaxStride) { - const int64_t srcOffsetStride = blockCount * EIGHT_TIMES; - const int64_t dstOffsetStride = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth() * BLOCK_CUBE * EIGHT_TIMES; - for (int j = 0; j < highTail; j++) { - Muls(trans[dstMulsOffset + j * dstOffsetStride], - localBuf[srcOffset + j * srcOffsetStride], scalar, mask, 1, intriParams); - } - } else { - Muls(trans[dstMulsOffset], localBuf[srcOffset], scalar, mask, highTail, intriParams); - } - if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { - srcOffset += highTail * blockCount; - } else { - srcOffset += highTail * blockCount * EIGHT_TIMES; - } - } - dstOffset += blockCount; - } - } - event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIDVToMte3); - WaitFlag(eventIDVToMte3); - } - - __aicore__ inline void CopyTrans2GMByVecByLineAlign(const GlobalTensor& gm, const LocalTensor& trans, - int blockLen, int blockCount, int offset) - { - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); ++i) { - DataCopy(gm[i * offset], trans[i * blockLen * blockCount], - { 1, 
static_cast(blockLen), 0, 0 }); - PipeBarrier(); - } - } - - __aicore__ inline void CopyTrans2GMByVecByLineUnalignOneBlock(const GlobalTensor& gm, const LocalTensor& trans, - int blockLen, int blockCount, int offset) - { - auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); - SetFlag(eventIDVToS); - WaitFlag(eventIDVToS); - int padLen = (ONE_BLK_SIZE - MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() * sizeof(DstT)) / sizeof(DstT); - SetAtomicAdd(); - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); ++i) { - LocalTensor transAligin; - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - transAligin = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE].template ReinterpretCast(); - } else { - transAligin = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[0].template ReinterpretCast(); - } - int transIndex = i * blockLen * blockCount; - for (int j = 0; j < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth(); ++j) { - transAligin.SetValue(j, trans.GetValue(transIndex + j)); - } - for (int j = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth(); j < blockCount; ++j) { - transAligin.SetValue(j, 0); - } - DataCopy(gm[i * offset], transAligin, { 1, 1, 0, 0 }); - auto eventIDMTE3ToS = GetTPipePtr()->FetchEventID(HardEvent::MTE3_S); - SetFlag(eventIDMTE3ToS); - WaitFlag(eventIDMTE3ToS); - } - SetAtomicNone(); - } - - template - __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const LocalTensor& trans, - int blockLen, int blockCount, int offset) -> enable_if_t - { - LocalTensor transAligin; - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - transAligin = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE].template ReinterpretCast(); - } else { - transAligin = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[0].template ReinterpretCast(); - } - int remainLen = (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount) / TWO_TIMES; - auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); - SetFlag(eventIDVToS); - WaitFlag(eventIDVToS); - LocalTensor src1Pattern; - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - src1Pattern = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE + MATMUL_MODULE(MatmulVarC)->GetTransLength() / TWO_TIMES] - .template ReinterpretCast(); - } else { - src1Pattern = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() / TWO_TIMES].template ReinterpretCast(); - } - LocalTensor tmpSrc = trans.template ReinterpretCast(); - src1Pattern.SetSize(PATTERN_SIZE); - src1Pattern.SetValue(0, 0xFFFF << remainLen); - src1Pattern.SetValue(1, (1 << remainLen) - 1); - for (int i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { - src1Pattern.SetValue(i, 0); - } - int orinRemain = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount; - int gmOffset = blockCount * (blockLen - PATTERN_OFFSET); - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); ++i) { - DataCopy(gm[i * offset], trans[i * blockLen * blockCount], - { 1, static_cast(blockLen - 1), 0, 0 }); - if (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % TWO_TIMES == 0) { - auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V); - SetFlag(enQueEvtID); - WaitFlag(enQueEvtID); - GatherMaskParams gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); - uint64_t rsvdCnt = 0; - GatherMask(transAligin, tmpSrc[((i + 1) 
* blockLen - PATTERN_OFFSET) * BLOCK_CUBE], src1Pattern, - false, 0, gatherMaskParams, rsvdCnt); - LocalTensor tmpTrans = transAligin.template ReinterpretCast(); - DataCopy(gm[i * offset + gmOffset + remainLen * DOUBLE_SPACE], tmpTrans, { 1, 1, 0, 0 }); - PipeBarrier(); - } else { - auto eventIDMTE3ToS = GetTPipePtr()->FetchEventID(HardEvent::MTE3_S); - SetFlag(eventIDMTE3ToS); - WaitFlag(eventIDMTE3ToS); - LocalTensor tmpTrans = transAligin.template ReinterpretCast(); - for (int j = 0; j < ONE_BLK_SIZE; ++j) { - tmpTrans.SetValue(j, trans[((i + 1) * blockLen - PATTERN_OFFSET) * blockCount + orinRemain].GetValue(j)); - } - auto eventIDSToMTE3 = GetTPipePtr()->FetchEventID(HardEvent::S_MTE3); - SetFlag(eventIDSToMTE3); - WaitFlag(eventIDSToMTE3); - DataCopy(gm[i * offset + gmOffset + orinRemain], tmpTrans, { 1, 1, 0, 0 }); - PipeBarrier(); - } - } - } - - template - __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const LocalTensor& trans, - int blockLen, int blockCount, int offset) -> enable_if_t - { - LocalTensor transAligin; - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - transAligin = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE].template ReinterpretCast(); - } else { - transAligin = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[0].template ReinterpretCast(); - } - int remainLen = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount; - auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); - SetFlag(eventIDVToS); - WaitFlag(eventIDVToS); - LocalTensor src1Pattern; - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - src1Pattern = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE + MATMUL_MODULE(MatmulVarC)->GetTransLength() / TWO_TIMES] - .template ReinterpretCast(); - } else { - src1Pattern = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() / TWO_TIMES].template ReinterpretCast(); - } - src1Pattern.SetSize(PATTERN_SIZE); - src1Pattern.SetValue(0, 0xFFFF << remainLen); - src1Pattern.SetValue(1, (1 << remainLen) - 1); - for (int i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { - src1Pattern.SetValue(i, 0); - } - int gmOffset = blockCount * (blockLen - PATTERN_OFFSET); - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); ++i) { - DataCopy(gm[i * offset], trans[i * blockLen * blockCount], - { 1, static_cast(blockLen - 1), 0, 0 }); - GatherMaskParams gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); - uint64_t rsvdCnt = 0; - auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V); - SetFlag(enQueEvtID); - WaitFlag(enQueEvtID); - GatherMask(transAligin, trans[((i + 1) * blockLen - PATTERN_OFFSET) * blockCount], - src1Pattern, false, 0, gatherMaskParams, rsvdCnt); - DataCopy(gm[i * offset + gmOffset + remainLen], transAligin, { 1, 1, 0, 0 }); - PipeBarrier(); - } - } - - __aicore__ inline void CopyTrans2GMByVecNormal(const GlobalTensor& gm, const LocalTensor& trans, - int blockLen, int dstStride) - { - DataCopy(gm, trans, { static_cast(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight()), static_cast(blockLen), 0, - static_cast(dstStride) }); - } - - __aicore__ inline void CopyLocal2GMNZ2NDOnTheFly(const GlobalTensor& gm, const LocalTensor& localBuf, - int curM, int curN, bool enSequentialWrite) - { - uint32_t dimN = (MATMUL_CAST_TO_IMPL()->Kc_ != 0) ? 
MATMUL_CAST_TO_IMPL()->Kc_ : MATMUL_CAST_TO_IMPL()->N_; - constexpr int oneBlockCount = ONE_BLK_SIZE / sizeof(DstT); - constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : oneBlockCount; - int calcWidth = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() / blockCount; - int dstOffset = curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * dimN + curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth(); - int blockLen = blockCount * sizeof(DstT) / ONE_BLK_SIZE; - int srcRepeatGap = (MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongHeight() * BLOCK_CUBE * blockCount - blockCount) * sizeof(DstT) / ONE_BLK_SIZE; - int tail = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount; - LocalTensor trans; - if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) { - trans = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength()].template ReinterpretCast(); - } else { - trans = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransOffset()].template ReinterpretCast(); - } - trans.SetSize(blockCount); - - int offset = dimN; - if (enSequentialWrite) { - dstOffset = 0; - offset = MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth(); - } - - if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { - offset = Ceil(offset, blockCount) * blockCount; - calcWidth = MATMUL_MODULE(MatmulShapeInfoC)->GetBlockNumAlongWidth(); - tail = 0; - } - - // Allocate MTE2_MTE3 eventId: eventIDMte3ToMte2 - event_t eventIDMte3ToMte2 = static_cast(GetTPipePtr()->AllocEventID()); - - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(); i++) { - if (calcWidth > 0) { - DataCopy(gm[dstOffset + i * offset], localBuf[i * blockCount], - { static_cast(calcWidth), static_cast(blockLen), - static_cast(srcRepeatGap), 0 }); - if constexpr (IsSameType::value && - IsSameType::value) { - PipeBarrier(); - } - } - - if (tail != 0) { - int srcTailOffset = i * blockCount + calcWidth * blockCount * Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(), blockCount) * blockCount; - if (MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() * sizeof(DstT) > ONE_BLK_SIZE) { - int dstTailOffset = dstOffset + i * offset + calcWidth * blockCount; - int basicOffset = 0; - if (sizeof(DstT) == B32_BYTE_SIZE) { - DataCopy(gm[dstTailOffset], localBuf[srcTailOffset], { 1, 1, 0, 0 }); - basicOffset = oneBlockCount; - } - - // reg_mov - srcTailOffset = srcTailOffset + basicOffset - - blockCount * Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(), blockCount) * blockCount + MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount; - dstTailOffset = dstTailOffset + basicOffset + MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount - blockCount; - if constexpr (IsSameType::value && - IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_S)); - SetFlag(eventID); - WaitFlag(eventID); - } - int j = 0; - for (int i = 0; i < blockCount - MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount; j++, i++) { - DstT scalar = localBuf.GetValue(srcTailOffset + i); - trans.SetValue(j, scalar); - } - srcTailOffset = i * blockCount + calcWidth * blockCount * Ceil(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight(), blockCount) * blockCount; - for (int i = 0; i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth() % blockCount; j++, i++) { - DstT scalar = localBuf.GetValue(srcTailOffset + i); - trans.SetValue(j, scalar); - } - - event_t eventIDSToMte3 = 
static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3)); - SetFlag(eventIDSToMte3); - WaitFlag(eventIDSToMte3); - // copy the tail from ub to gm - DataCopy(gm[dstTailOffset], trans, { 1, 1, 0, 0 }); - if constexpr (IsSameType::value && - IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_S)); - SetFlag(eventID); - WaitFlag(eventID); - } - } else { - if (i > 0) { - WaitFlag(eventIDMte3ToMte2); - } - if constexpr (IsSameType::value && - IsSameType::value) { - event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2)); - SetFlag(eventID); - WaitFlag(eventID); - } - DataCopy(trans, gm[dstOffset + i * offset + MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth()], { 1, 1, 0, 0 }); - event_t eventIDMte2ToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3)); - SetFlag(eventIDMte2ToMte3); - WaitFlag(eventIDMte2ToMte3); - DataCopy(gm[dstOffset + i * offset], localBuf[srcTailOffset], { 1, 1, 0, 0 }); - PipeBarrier(); - DataCopy(gm[dstOffset + i * offset + MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth()], trans, { 1, 1, 0, 0 }); - if (i < MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseHeight() - 1) { - SetFlag(eventIDMte3ToMte2); - } - } - } - } - event_t eventID = static_cast(GetTPipePtr()->FetchEventID()); - SetFlag(eventID); - WaitFlag(eventID); - // Release MTE2_MTE3 eventId: eventIDMte3ToMte2 - GetTPipePtr()->ReleaseEventID(eventIDMte3ToMte2); - } - - __aicore__ inline LocalTensor GetLocalBuf() - { - LocalTensor localBuf; - if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) { - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - localBuf = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE].template ReinterpretCast(); - } else { - localBuf = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[0].template ReinterpretCast(); - } - } else { - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - MATMUL_MODULE(MatmulVarC)->GetCo2Offset() += MATMUL_MODULE(MatmulVarC)->GetTransLength() * DOUBLE_SPACE; - } - localBuf = MATMUL_MODULE(MatmulVarC)->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetCo2Offset()].template ReinterpretCast(); - } - localBuf.SetSize(MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth()); - return localBuf; - } - - __aicore__ inline void UpdateDataCopyParamForQuant(DataCopyEnhancedParams& enhancedParams, int curN) - { - if constexpr (IsSameType::value) { - if (MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::DEQF16) { - enhancedParams.deqScale = DeqScale::DEQ16; - enhancedParams.deqValue = MATMUL_MODULE(QuantProcessor)->GetQuantScalarValue(); - } else if (MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VDEQF16) { - enhancedParams.deqScale = DeqScale::VDEQ16; - LocalTensor quantLocalTensor; - MATMUL_MODULE(QuantProcessor)->CopyQuantTensor(quantLocalTensor, curN, MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth()); - enhancedParams.deqTensorAddr = reinterpret_cast(quantLocalTensor.GetPhyAddr()); - } - } else if constexpr (IsSameType::value || IsSameType::value) { - enhancedParams.sidStoreMode = (uint8_t)TWO_TIMES; - if (MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::QF322B8_PRE || - MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::REQ8) { - enhancedParams.deqScale = DeqScale::DEQ8; - enhancedParams.deqValue = MATMUL_MODULE(QuantProcessor)->GetQuantScalarValue(); - } else if (MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() 
== QuantMode_t::VQF322B8_PRE || - MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8) { - enhancedParams.deqScale = DeqScale::VDEQ8; - LocalTensor quantLocalTensor; - MATMUL_MODULE(QuantProcessor)->CopyQuantTensor(quantLocalTensor, curN, MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth()); - enhancedParams.deqTensorAddr = reinterpret_cast(quantLocalTensor.GetPhyAddr()); - } - } - } -}; -} -#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H \ No newline at end of file +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file copy_cube_out_datacopy.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H + +#include "../../matmul_module.h" +#include "../../matmul_param.h" +#include "copy_cube_out_intf.h" + +namespace Gemm { +namespace Impl { +namespace Detail { + +constexpr int DOUBLE_SPACE = 2; +constexpr int TRIPLE_SPACE = 3; +constexpr int TWO_TIMES = 2; +constexpr int EIGHT_TIMES = 8; +constexpr int SHIFT_16_BIT = 16; +constexpr int SHIFT_32_BIT = 32; +constexpr int SHIFT_48_BIT = 48; +constexpr uint32_t MAX_REPEAT_STRIDE = 255; +constexpr int PATTERN_SIZE = 8; +constexpr int PATTERN_OFFSET = 2; +/* + CopyCubeOut is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeOut is only for internal usage, does not support extension or customized specialization! 
+*/ +template +class CopyCubeOut::IsNeedUB())>> +{ + using SrcType = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using SrcT = typename GetDstType::Type; + + MATMUL_USE_MODULE(CubeOutBuffer); + MATMUL_USE_MODULE(MatmulQuantProcessor); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(LocalWorkspace); + +public: + template + __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) + { + CopyOutImpl(gm, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } + + template + __aicore__ inline void Copy(const LocalTensor& co2Local, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) + { + CopyOutImpl(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } + + template + __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co2Local, + const LocalTensor& co1Local, int curRow, int curCol, int32_t baseHeight, + int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth, + const ScheduleContext& context = 0) + { + CopyOutImpl(gm, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } + +private: + template + __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co1Local, int curRow, int curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) + { + if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + CopyOutNZ2ND(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } else if constexpr (C_TYPE::format == CubeFormat::NZ) { + CopyOutNZ2NZ(dst, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } else { + ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type."); }); + } + } + + template + __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co2Local, + const LocalTensor& co1Local, int curRow, int curCol, int32_t baseHeight, + int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + { + if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + CopyOutNZ2ND(dst, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); + } else if constexpr (C_TYPE::format == CubeFormat::NZ) { + CopyOutNZ2NZ(dst, co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); + } else { + ASCENDC_ASSERT(false, { KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type."); }); + } + } + + template + __aicore__ inline void CopyOutNZ2NZ(const LocalTensor& co2Local, const LocalTensor& co1Local, + int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) + { + ASCENDC_ASSERT((MATMUL_CAST_TO_IMPL()->M_ >= MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()), { + KERNEL_LOG(KERNEL_ERROR, "M_ is %d , which should be not less than baseM %d", + MATMUL_CAST_TO_IMPL()->M_, MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()); + }); + + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = baseBlockWidth; + dataCopyInfo.blockLen = 
baseBlockHeight; + dataCopyInfo.srcStride = 0; + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; + if constexpr (enSequentialWrite) { + dataCopyInfo.dstStride = 0; + CopyCo12Co2WithQuant(co2Local, co1Local, curCol, baseBlockHeight, baseBlockWidth, dataCopyInfo, enhancedParams); + } else { + dataCopyInfo.dstStride = (Ceil(MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM(), BLOCK_CUBE) * BLOCK_CUBE - + baseBlockHeight * BLOCK_CUBE) * BLOCK_CUBE * sizeof(DstT) / + ONE_BLK_SIZE; + int dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * BLOCK_CUBE + + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * MATMUL_CAST_TO_IMPL()->M_; + CopyCo12Co2WithQuant(co2Local[dstOffset], co1Local, curCol, baseBlockHeight, baseBlockWidth, dataCopyInfo, enhancedParams); + } + } + + template + __aicore__ inline void CopyOutNZ2NZ(const GlobalTensor& gm, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) + { + event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); + SetFlag(eventIDMte3ToV); + WaitFlag(eventIDMte3ToV); + + LocalTensor localBuf = GetLocalBuf(); + CopyCo12Local(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth); + + event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMte3); + WaitFlag(eventIDVToMte3); + + CopyLocal2GMNZ2NZ(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + } + + template + __aicore__ inline void CopyOutNZ2NZ(const GlobalTensor& gm, const LocalTensor& co2Local, + const LocalTensor& co1Local, int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) + { + CopyOutNZ2NZ(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + CopyLocal2GMNZ2NZ(gm, co2Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + } + + template + __aicore__ inline void CopyOutNZ2ND(const LocalTensor& co2Local, const LocalTensor& co1Local, + int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) + { + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + ASCENDC_ASSERT((MATMUL_CAST_TO_IMPL()->M_ == 1), + { KERNEL_LOG(KERNEL_ERROR, "M_ is %d, which should be equal with 1.", MATMUL_CAST_TO_IMPL()->M_); }); + + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = 1; + dataCopyInfo.blockLen = baseBlockHeight * baseBlockWidth; + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; + + if constexpr (enSequentialWrite) { + DataCopy(co2Local, co1Local, dataCopyInfo, enhancedParams); + } else { + int dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + DataCopy(co2Local[dstOffset], co1Local, dataCopyInfo, enhancedParams); + } + } else { + ASCENDC_ASSERT((!IsSameType::value && !IsSameType::value), + { KERNEL_LOG(KERNEL_ERROR, "Data format should be NZ if GetTensorC to UB when output is int8_t."); }); + + LocalTensor trans = GetLocalBuf(); + + CopyCo12Co2WithoutQuant(trans, co1Local, curCol, baseBlockHeight, baseBlockWidth); + + if constexpr(enSequentialWrite) { + TransNZ2NDForDstUB(co2Local, trans, MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), baseHeight, baseBlockWidth, baseBlockHeight); + } else { + uint32_t dimN = (MATMUL_CAST_TO_IMPL()->Kc_ 
!= 0) ? MATMUL_CAST_TO_IMPL()->Kc_ : MATMUL_CAST_TO_IMPL()->N_; + int dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + TransNZ2NDForDstUB(co2Local[dstOffset], trans, Ceil(dimN, blockCount) * blockCount, baseHeight, baseBlockWidth, baseBlockHeight); + } + } + } + + template + __aicore__ inline void CopyOutNZ2ND(const GlobalTensor& gm, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) + { + event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); + SetFlag(eventIDMte3ToV); + WaitFlag(eventIDMte3ToV); + + LocalTensor localBuf = GetLocalBuf(); + CopyCo12Local(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth); + + event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMte3); + WaitFlag(eventIDVToMte3); + + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + CopyLocal2GMNZ2NZ(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); + } else if (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + if constexpr (!ToMatmulConfig(MM_CFG).enVecND2NZ || IsSameType::value && + IsSameType::value) { + CopyLocal2GMNZ2NDOnTheFly(gm, localBuf, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); + } else { + CopyLocal2GMNZ2NDByVec(gm, localBuf, curRow, curCol, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); + } + } + } + + __aicore__ inline void CopyOutNZ2ND(const GlobalTensor& gm, const LocalTensor& co2Local, + const LocalTensor& co1Local, int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth, bool enSequentialWrite) + { + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + CopyOutNZ2ND(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, enSequentialWrite); + CopyLocal2GMNZ2NZ(gm, co2Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, enSequentialWrite); + } else if (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + LocalTensor localBuf = GetLocalBuf(); + CopyCo12Co2WithoutQuant(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth); + + if (enSequentialWrite) { + TransNZ2NDForDstUB(co2Local, localBuf, MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(), baseHeight, baseBlockWidth, baseBlockHeight); + } else { + uint32_t dimN = (MATMUL_CAST_TO_IMPL()->Kc_ != 0) ? MATMUL_CAST_TO_IMPL()->Kc_ : MATMUL_CAST_TO_IMPL()->N_; + int dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? 
BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + TransNZ2NDForDstUB(co2Local[dstOffset], localBuf, Ceil(dimN, blockCount) * blockCount, baseHeight, baseBlockWidth, baseBlockHeight); + } + + if constexpr (!ToMatmulConfig(MM_CFG).enVecND2NZ || + IsSameType::value && IsSameType::value) { + CopyLocal2GMNZ2NDOnTheFly(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, enSequentialWrite); + } else { + CopyLocal2GMNZ2NDByVec(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, enSequentialWrite); + } + } + } + + __aicore__ inline void CopyCo12Co2WithQuant(const LocalTensor& dst, const LocalTensor& src, + int curCol, int baseBlockHeight, int baseBlockWidth, DataCopyParams& dataCopyInfo, DataCopyEnhancedParams& enhancedParams) + { + if constexpr (IsSameType::value) { + UpdateDataCopyParamForQuant(enhancedParams, curCol); + uint64_t alignedHeight = baseBlockHeight * BLOCK_CUBE; + if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8) { + dataCopyInfo.blockLen = baseBlockHeight; + uint64_t addr = enhancedParams.deqTensorAddr; + for (int i = 0; i < Ceil(baseBlockWidth, TWO_TIMES); ++i) { + for (int storeMode = 0; storeMode < TWO_TIMES; ++storeMode) { + if (baseBlockWidth % TWO_TIMES != 0 && + i == Ceil(baseBlockWidth, TWO_TIMES) - 1 && + storeMode == 1) { + continue; + } + enhancedParams.deqTensorAddr = addr + i * ONE_BLK_SIZE * ONE_BYTE_BIT_SIZE + storeMode * BLOCK_CUBE * ONE_BYTE_BIT_SIZE; + enhancedParams.sidStoreMode = (uint8_t)storeMode; + DataCopy(dst[i * ONE_BLK_SIZE * alignedHeight], + src[i * ONE_BLK_SIZE * alignedHeight + storeMode * BLOCK_CUBE * alignedHeight], + dataCopyInfo, enhancedParams); + } + } + } else if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::REQ8) { + dataCopyInfo.blockLen = baseBlockHeight; + uint64_t addr = enhancedParams.deqTensorAddr; + for (int i = 0; i < Ceil(baseBlockWidth, TWO_TIMES); ++i) { + for (int storeMode = 0; storeMode < TWO_TIMES; ++storeMode) { + if (baseBlockWidth % TWO_TIMES != 0 && + i == Ceil(baseBlockWidth, TWO_TIMES) - 1 && + storeMode == 1) { + continue; + } + enhancedParams.sidStoreMode = (uint8_t)storeMode; + DataCopy(dst[i * ONE_BLK_SIZE * alignedHeight], + src[i * ONE_BLK_SIZE * alignedHeight + storeMode * BLOCK_CUBE * alignedHeight], + dataCopyInfo, enhancedParams); + } + } + } else if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VDEQF16) { + dataCopyInfo.blockCount = 1; + dataCopyInfo.blockLen = baseBlockHeight; + dataCopyInfo.dstStride = 0; + uint64_t addr = enhancedParams.deqTensorAddr; + for (int i = 0; i < baseBlockWidth; ++i) { + constexpr int DEQ_OFFSET = 128; + enhancedParams.deqTensorAddr = addr + i * DEQ_OFFSET; + DataCopy(dst[i * BLOCK_CUBE * alignedHeight], src[i * BLOCK_CUBE * alignedHeight], dataCopyInfo, enhancedParams); + } + } else { + DataCopy(dst, src, dataCopyInfo, enhancedParams); + } + } else { + DataCopy(dst, src, dataCopyInfo, enhancedParams); + } + } + + __aicore__ inline void CopyCo12Co2WithoutQuant(const LocalTensor& dst, const LocalTensor& src, int curCol, + int baseBlockHeight, int baseBlockWidth) + { + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = baseBlockWidth; + dataCopyInfo.blockLen = baseBlockHeight; + dataCopyInfo.srcStride = 0; + dataCopyInfo.dstStride = 0; + DataCopyEnhancedParams enhancedParams; + enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; + if constexpr (IsSameType::value) { + UpdateDataCopyParamForQuant(enhancedParams, curCol); + } 
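// Editorial sketch, not part of the patch: dataCopyInfo above copies baseBlockWidth fractal
// blocks of baseBlockHeight cube lines each. Assuming a 16x16 (BLOCK_CUBE) fractal, the two
// block counts would typically be the ceiling of the element shape over BLOCK_CUBE:
auto ceilDiv = [](int value, int align) { return (value + align - 1) / align; };
const int exampleBlockHeight = ceilDiv(/*baseHeight=*/30, /*BLOCK_CUBE=*/16); // -> 2 cube lines
const int exampleBlockWidth  = ceilDiv(/*baseWidth=*/48, /*BLOCK_CUBE=*/16);  // -> 3 cube columns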
+ DataCopy(dst, src, dataCopyInfo, enhancedParams); + } + + __aicore__ inline void CopyCo12Local(const LocalTensor& localBuf, const LocalTensor& co1Local, int curCol, int baseBlockHeight, int baseBlockWidth) + { + DataCopyParams dataCopyInfo; + dataCopyInfo.blockCount = 1; + dataCopyInfo.blockLen = baseBlockHeight * baseBlockWidth; + DataCopyEnhancedParams enhancedParams; + if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + enhancedParams.blockMode = BlockMode::BLOCK_MODE_VECTOR; + } else { + enhancedParams.blockMode = BlockMode::BLOCK_MODE_MATRIX; + ASCENDC_ASSERT((localBuf.GetSize() >= dataCopyInfo.blockLen * CUBE_MAX_SIZE), { + KERNEL_LOG(KERNEL_ERROR, "copy len is %d, which should be less than dst size %d", + dataCopyInfo.blockLen * CUBE_MAX_SIZE, localBuf.GetSize()); + }); + } + CopyCo12Co2WithQuant(localBuf, co1Local, curCol, baseBlockHeight, baseBlockWidth, dataCopyInfo, enhancedParams); + } + + __aicore__ inline void CopyLocal2GMNZ2NZNotSeq(const GlobalTensor& gm, const LocalTensor& localBuf, + int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + { + ASCENDC_ASSERT((MATMUL_CAST_TO_IMPL()->M_ >= baseHeight), { + KERNEL_LOG(KERNEL_ERROR, "M_ is %d, baseUseM_ is %d, M_ should be no less than baseUseM_", + MATMUL_CAST_TO_IMPL()->M_, baseHeight); + }); + int64_t alignM; + int alignBaseUseM; + if constexpr (C_TYPE::format == CubeFormat::NZ) { + // nz2nz + alignM = Ceil(MATMUL_CAST_TO_IMPL()->M_, BLOCK_CUBE) * BLOCK_CUBE; + alignBaseUseM = Ceil(baseHeight, BLOCK_CUBE) * BLOCK_CUBE; + } else { + // nz2nd A is vector + alignM = MATMUL_CAST_TO_IMPL()->M_; + alignBaseUseM = baseHeight; + } + + int64_t dstOffset; + int64_t dstStride; + int blockLen; + int blockCount; + if constexpr (IsSameType::value || IsSameType::value) { + dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * alignM + + curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * ONE_BLK_SIZE; + dstStride = (alignM - alignBaseUseM) * sizeof(DstT); + blockLen = baseBlockHeight * BLOCK_CUBE * sizeof(DstT); + blockCount = Ceil(baseBlockWidth, TWO_TIMES); + } else { + dstOffset = curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN() * alignM + + curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * BLOCK_CUBE; + dstStride = (alignM - alignBaseUseM) * sizeof(DstT) * BLOCK_CUBE / ONE_BLK_SIZE; + blockLen = baseBlockHeight * BLOCK_CUBE * sizeof(DstT) * + BLOCK_CUBE / ONE_BLK_SIZE; + blockCount = baseBlockWidth; + } + + if (dstStride >= UINT16_MAX) { + int srcStride; + if constexpr (IsSameType::value || IsSameType::value) { + dstStride = alignM * ONE_BLK_SIZE; + srcStride= baseHeight * ONE_BLK_SIZE; + } else { + dstStride = alignM * BLOCK_CUBE; + srcStride = baseHeight * BLOCK_CUBE; + } + for (int i = 0; i < blockCount; ++i) { + DataCopy(gm[dstOffset + i * dstStride], localBuf[i * srcStride], { 1, static_cast(blockLen), 0, 0 }); + } + } else { + DataCopy(gm[dstOffset], localBuf, { static_cast(blockCount), static_cast(blockLen), 0, + static_cast(dstStride) }); + } + } + + __aicore__ inline void CopyLocal2GMNZ2NZSeq(const GlobalTensor& gm, const LocalTensor& localBuf, int baseHeight, int baseBlockWidth) + { + int blockLen = baseHeight * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE; + DataCopy(gm, localBuf, { static_cast(baseBlockWidth), + static_cast(blockLen), 0, 0 }); + } + + template + __aicore__ inline void CopyLocal2GMNZ2NZ(const GlobalTensor& gm, const LocalTensor& localBuf, + int curRow, int curCol, int32_t 
baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + { + if constexpr (enSequentialWrite) { + CopyLocal2GMNZ2NZSeq(gm, localBuf, baseHeight, baseBlockWidth); + } else { + CopyLocal2GMNZ2NZNotSeq(gm, localBuf, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); + } + } + + __aicore__ inline void TransNZ2NDForDstUB(const LocalTensor& co2Local, const LocalTensor& trans, + int dstStride, int baseHeight, int baseBlockWidth, int baseBlockHeight) + { + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + DataCopyParams dataCopyInfo { + static_cast(baseBlockWidth), + static_cast(blockCount * sizeof(DstT) / ONE_BLK_SIZE), + static_cast((baseBlockHeight * BLOCK_CUBE * blockCount - + blockCount) * sizeof(DstT) / ONE_BLK_SIZE), + 0 + }; + for (int i = 0; i < baseHeight; i++) { + DataCopy(co2Local[i * dstStride], trans[i * blockCount], dataCopyInfo); + } + } + + template + __aicore__ inline void CopyLocal2GMNZ2NDByVec(const GlobalTensor& gm, const LocalTensor& localBuf, + int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + { + uint32_t dimN = (MATMUL_CAST_TO_IMPL()->Kc_ != 0) ? MATMUL_CAST_TO_IMPL()->Kc_ : MATMUL_CAST_TO_IMPL()->N_; + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + + LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset( + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()) + .template ReinterpretCast(); + int transSize = localBuf.GetSize(); + if constexpr (IsSameType::value || IsSameType::value) { + if (baseBlockWidth % TWO_TIMES != 0) { + transSize += baseBlockHeight * CUBE_MAX_SIZE; + } + } + trans.SetSize(transSize); + + int dstOffset; + int dstStride; + int offset; + bool isGmAligned; + if constexpr (enSequentialWrite) { + dstOffset = 0; + dstStride = 0; + offset = baseWidth; + isGmAligned = ((baseWidth % blockCount) == 0); + } else { + int width = baseBlockWidth * blockCount; + if constexpr (IsSameType::value || IsSameType::value) { + width = width / TWO_TIMES; + } + ASCENDC_ASSERT((dimN >= width), + { KERNEL_LOG(KERNEL_ERROR, "dimN is %d, width is %d, dimN should be no less than width", dimN, width); }); + if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { + isGmAligned = 1; + } else { + isGmAligned = ((dimN % blockCount) == 0 && (MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN() % blockCount) == 0); + } + + dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + dstStride = (dimN - width) * sizeof(DstT) / ONE_BLK_SIZE; + offset = dimN; + } + bool isTargetAligned = (baseWidth % blockCount) == 0; + const bool isComputeLineByLine = (!isGmAligned || dstStride >= UINT16_MAX); + + // 1 if target is not aligned, must copy the unalign data to trans UB + if constexpr (IsSameType::value) { + bool isOdd; + if constexpr (IsSameType::value || IsSameType::value) { + if (baseWidth % TWO_TIMES > 0) { + isOdd = true; + } + } else { + isOdd = false; + } + bool isSingleCore = MATMUL_CAST_TO_IMPL()->M_ <= MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreM() && dimN <= MATMUL_MODULE(MatmulShapeInfo)->GetSingleCoreN(); + bool isMutiCoreNeedPad = !isSingleCore && !isComputeLineByLine; + if (!isTargetAligned && (isSingleCore || isMutiCoreNeedPad) && !isOdd) { + PadUnalignedToTrans(trans, gm, dstOffset, isComputeLineByLine, baseHeight, baseWidth, + 
baseBlockHeight, baseBlockWidth); + } + } else { + if (!isTargetAligned) { + PadUnalignedToTrans(trans, gm, dstOffset, isComputeLineByLine, baseHeight, baseWidth, + baseBlockHeight, baseBlockWidth); + } + } + + // 2. trans nz buffer to nd buffer + TransNZ2NDByVec(trans, localBuf, baseBlockHeight, baseBlockWidth, (DstT)1.0, baseHeight, baseWidth, baseBlockWidth); + + // 3. copy trans buffer to gm + int blockLen = baseBlockWidth * (blockCount * sizeof(DstT) / ONE_BLK_SIZE); + if constexpr (IsSameType::value || IsSameType::value) { + blockLen = Ceil(blockLen, TWO_TIMES); + } + if (isComputeLineByLine) { + if constexpr (IsSameType::value) { + if constexpr (!enSequentialWrite) { + dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * MATMUL_CAST_TO_IMPL()->N_ + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + offset = MATMUL_CAST_TO_IMPL()->N_; + } + int newBlockCount; + if constexpr (IsSameType::value || IsSameType::value) { + newBlockCount = BLOCK_CUBE; + } else { + newBlockCount = ONE_BLK_SIZE / sizeof(DstT); + } + if (isTargetAligned) { + CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, baseHeight, blockLen, newBlockCount, offset); + } else { + if (blockLen == 1) { + CopyTrans2GMByVecByLineUnalignOneBlock(gm[dstOffset], trans, baseHeight, baseWidth, blockLen, newBlockCount, offset); + } else { + if constexpr (IsSameType::value || IsSameType::value) { + CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, blockLen, newBlockCount, offset); + } else { + CopyTrans2GMByVecByLineUnalign(gm[dstOffset], trans, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth, blockLen, newBlockCount, offset); + } + } + } + } else { + CopyTrans2GMByVecByLineAlign(gm[dstOffset], trans, baseHeight, blockLen, ONE_BLK_SIZE / sizeof(DstT), offset); + } + } else { + CopyTrans2GMByVecNormal(gm[dstOffset], trans, baseHeight, blockLen, dstStride); + } + } + + template + __aicore__ inline void PadUnalignedToTrans(const LocalTensor& trans, const GlobalTensor& gm, + int dstOffset, bool isComputeLineByLine, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) + { + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? 
BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + int32_t alignedSize; + if constexpr (IsSameType::value || IsSameType::value) { + alignedSize = GetC0Size(); + } else { + alignedSize = BLOCK_CUBE; + } + int baseUseN = Ceil(baseWidth, alignedSize) * alignedSize; + int gmTailOffset = dstOffset + baseUseN - blockCount; + int transTailOffset = baseUseN - blockCount; + + auto enQueEvtID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); + SetFlag(enQueEvtID); + WaitFlag(enQueEvtID); + + if (isComputeLineByLine) { + if constexpr (enSequentialWrite) { + PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, baseUseN, baseHeight); + } else { + PadUnalignedToTransByLine(trans[transTailOffset], gm[gmTailOffset], baseUseN, MATMUL_CAST_TO_IMPL()->N_, baseHeight); + } + } else { + PadUnalignedToTransWithStride(trans[transTailOffset], gm[gmTailOffset], baseHeight, baseWidth, baseBlockWidth); + } + + // if copy gm to ub, must add the set/wait flag to wait the UB has be writed; + event_t eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); + SetFlag(eventIDMte2ToV); + WaitFlag(eventIDMte2ToV); + } + + __aicore__ inline void PadUnalignedToTransByLine(const LocalTensor& trans, const GlobalTensor& gm, + int transStride, int gmStride, int baseHeight) + { + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + // copy gm to trans one line by one line + for (int i = 0; i < baseHeight; ++i) { + DataCopy(trans[i * transStride], gm[i * gmStride], { static_cast(1), + static_cast(blockCount * sizeof(DstT) / ONE_BLK_SIZE), 0, 0 }); + } + } + + __aicore__ inline void PadUnalignedToTransWithStride(const LocalTensor& trans, const GlobalTensor& gm, int baseHeight, + int baseWidth, int baseBlockWidth) + { + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + // copy gm to trans with stride + DataCopy(trans, gm, { static_cast(baseHeight), static_cast(1), + static_cast(MATMUL_CAST_TO_IMPL()->N_ / blockCount - 1), static_cast(baseWidth / blockCount) }); + } + + __aicore__ inline auto TransNZ2NDByVec(const LocalTensor& trans, const LocalTensor& localBuf, + int blockHigh, int blockWidth, DstT scalar, int baseHeight, int baseWidth, int32_t baseBlockWidth) + { + event_t eventIDMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); + SetFlag(eventIDMte3ToV); + WaitFlag(eventIDMte3ToV); + // B32's block count is 16 + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? 
BLOCK_CUBE : ONE_BLK_SIZE / sizeof(DstT); + ASCENDC_ASSERT(((blockWidth * blockCount * sizeof(DstT) / ONE_BLK_SIZE) <= MAX_REPEAT_TIMES), { + KERNEL_LOG(KERNEL_ERROR, "blockWidth is %d, blockCount is %d, repeat time exceed max time %d", blockWidth, + blockCount, MAX_REPEAT_TIMES); + }); + if constexpr (IsSameType::value || IsSameType::value) { + struct UnaryRepeatParams intriParams; + int widthAlign = TWO_TIMES; + int offsetWidth = Ceil(blockWidth, widthAlign) * widthAlign; + intriParams.dstBlkStride = Ceil(baseWidth, ONE_BLK_SIZE); + intriParams.srcBlkStride = 1; + uint32_t dstRepStride = Ceil(baseWidth * sizeof(DstT), ONE_BLK_SIZE) * EIGHT_TIMES; + intriParams.dstRepStride = dstRepStride; + bool isBeyondMaxStride = false; + if (dstRepStride > MAX_REPEAT_STRIDE) { + isBeyondMaxStride = true; + } + intriParams.srcRepStride = (blockCount * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; + int dstOffset = 0; + int srcOffset = 0; + int highBlock = MAX_REPEAT_TIMES; + int highBlocks = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES / highBlock; + int highTail = (blockHigh * BLOCK_CUBE) / EIGHT_TIMES % highBlock; + uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; + // mov src to dst width aligned + LocalTensor tmpSrc = localBuf.template ReinterpretCast(); + LocalTensor tmpDst = trans.template ReinterpretCast(); + SetVectorMask(mask[1], mask[0]); + constexpr int64_t srcOffsetStride = BLOCK_CUBE * EIGHT_TIMES; + const int64_t dstOffsetStride = baseBlockWidth * BLOCK_CUBE * EIGHT_TIMES / + TWO_TIMES; + for (int i = 0; i < Ceil(blockWidth, TWO_TIMES); ++i) { + if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { + // if the baseWidth is not aligned, set the mask value; + if (i == (Ceil(blockWidth, TWO_TIMES) - 1) && (baseWidth % blockCount != 0)) { + uint64_t masktail = (1 << (Ceil(baseWidth % blockCount, TWO_TIMES))) - 1; + mask[0] = masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << SHIFT_48_BIT); + mask[1] = mask[0]; + SetVectorMask(mask[1], mask[0]); + } + } + int dstMulsOffset = dstOffset; + for (int j = 0; j < highBlocks; ++j) { + Muls(tmpDst[dstMulsOffset], tmpSrc[srcOffset], (int16_t)scalar, mask, + highBlock, intriParams); + srcOffset += highBlock * BLOCK_CUBE; + dstMulsOffset += blockWidth * blockCount * highBlock; + } + if (highTail) { + if (isBeyondMaxStride) { + int tmpSrcOffset = srcOffset; + for (int j = 0; j < highTail; j++) { + Muls(tmpDst[dstMulsOffset], + tmpSrc[tmpSrcOffset], (int16_t)scalar, mask, 1, intriParams); + dstMulsOffset += dstOffsetStride; + tmpSrcOffset += srcOffsetStride; + } + } else { + Muls(tmpDst[dstMulsOffset], tmpSrc[srcOffset], (int16_t)scalar, mask, + highTail, intriParams); + } + srcOffset += highTail * BLOCK_CUBE * EIGHT_TIMES; + } + dstOffset += BLOCK_CUBE; + } + } else { + struct UnaryRepeatParams intriParams; + + int dstOffset = 0; + int srcOffset = 0; + int highBlock = MAX_REPEAT_TIMES; + int highBlocks = 0; + int highTail = 0; + int32_t srcStride = highBlock * blockCount; + int32_t dstStride = blockWidth * blockCount * highBlock; + bool isBeyondMaxStride = false; + uint64_t mask[2] = {uint64_t(-1), uint64_t(-1)}; + + if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { + intriParams.dstBlkStride = 1; + intriParams.srcBlkStride = 1; + intriParams.dstRepStride = blockWidth * blockCount * sizeof(DstT) / ONE_BLK_SIZE; + intriParams.srcRepStride = blockCount * sizeof(DstT) / ONE_BLK_SIZE; + highBlocks = (blockHigh * blockCount) / highBlock; + highTail = (blockHigh * blockCount) % highBlock; + mask[0] = static_cast((1<< blockCount) - 1); + 
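// Editorial sketch, not part of the patch: the B32 branch above enables only the first
// blockCount lanes of each vector repeat (blockCount is 16 for 4-byte DstT), so mask[0]
// evaluates to 0xFFFF and mask[1] stays 0. An equivalent lane-mask helper, assuming at
// most 64 lanes per repeat:
auto firstLanesMask = [](int lanes) -> uint64_t {
    return (lanes >= 64) ? ~0ULL : ((1ULL << lanes) - 1ULL); // firstLanesMask(16) == 0xFFFF
};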
mask[1] = 0; + } else { + intriParams.dstBlkStride = blockWidth; + intriParams.srcBlkStride = 1; + uint32_t dstRepStride = (blockWidth * blockCount * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; + intriParams.dstRepStride = dstRepStride; + if (dstRepStride > MAX_REPEAT_STRIDE) { + isBeyondMaxStride = true; + } + intriParams.srcRepStride = (blockCount * sizeof(DstT) / ONE_BLK_SIZE) * EIGHT_TIMES; + highBlocks = (blockHigh * blockCount) / EIGHT_TIMES / highBlock; + highTail = (blockHigh * blockCount) / EIGHT_TIMES % highBlock; + srcStride *= EIGHT_TIMES; + dstStride *= EIGHT_TIMES; + } + SetVectorMask(mask[1], mask[0]); + for (int i = 0; i < blockWidth; ++i) { + if constexpr (C_TYPE::format != CubeFormat::ND_ALIGN) { + // if the baseWidth is not aligned, set the mask value; + if (i == (blockWidth - 1) && (baseWidth % blockCount != 0)) { + uint64_t masktail = (1 << (baseWidth % blockCount)) - 1; + mask[0] = masktail + (masktail << SHIFT_16_BIT) + (masktail << SHIFT_32_BIT) + (masktail << SHIFT_48_BIT); + mask[1] = mask[0]; + SetVectorMask(mask[1], mask[0]); + } + } + int dstMulsOffset = dstOffset; + for (int j = 0; j < highBlocks; ++j) { + Muls(trans[dstMulsOffset], localBuf[srcOffset], scalar, mask, highBlock, intriParams); + srcOffset += srcStride; + dstMulsOffset += dstStride; + } + if (highTail) { + if (isBeyondMaxStride) { + const int64_t srcOffsetStride = blockCount * EIGHT_TIMES; + const int64_t dstOffsetStride = baseBlockWidth * BLOCK_CUBE * EIGHT_TIMES; + for (int j = 0; j < highTail; j++) { + Muls(trans[dstMulsOffset + j * dstOffsetStride], + localBuf[srcOffset + j * srcOffsetStride], scalar, mask, 1, intriParams); + } + } else { + Muls(trans[dstMulsOffset], localBuf[srcOffset], scalar, mask, highTail, intriParams); + } + if constexpr (sizeof(DstT) == B32_BYTE_SIZE) { + srcOffset += highTail * blockCount; + } else { + srcOffset += highTail * blockCount * EIGHT_TIMES; + } + } + dstOffset += blockCount; + } + } + event_t eventIDVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMte3); + WaitFlag(eventIDVToMte3); + } + + __aicore__ inline void CopyTrans2GMByVecByLineAlign(const GlobalTensor& gm, const LocalTensor& trans, int baseHeight, + int blockLen, int blockCount, int offset) + { + for (int i = 0; i < baseHeight; ++i) { + DataCopy(gm[i * offset], trans[i * blockLen * blockCount], + { 1, static_cast(blockLen), 0, 0 }); + PipeBarrier(); + } + } + + __aicore__ inline void CopyTrans2GMByVecByLineUnalignOneBlock(const GlobalTensor& gm, const LocalTensor& trans, + int baseHeight, int baseWidth, int blockLen, int blockCount, int offset) + { + auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); + SetFlag(eventIDVToS); + WaitFlag(eventIDVToS); + int padLen = (ONE_BLK_SIZE - baseWidth * sizeof(DstT)) / sizeof(DstT); + SetAtomicAdd(); + for (int i = 0; i < baseHeight; ++i) { + LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset(0) + .template ReinterpretCast(); + int transIndex = i * blockLen * blockCount; + for (int j = 0; j < baseWidth; ++j) { + transAligin.SetValue(j, trans.GetValue(transIndex + j)); + } + for (int j = baseWidth; j < blockCount; ++j) { + transAligin.SetValue(j, 0); + } + DataCopy(gm[i * offset], transAligin, { 1, 1, 0, 0 }); + auto eventIDMTE3ToS = GetTPipePtr()->FetchEventID(HardEvent::MTE3_S); + SetFlag(eventIDMTE3ToS); + WaitFlag(eventIDMTE3ToS); + } + SetAtomicNone(); + } + + template + __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const 
LocalTensor& trans, + int baseHeight, int baseWidth, int baseBlockHeight, int baseBlockWidth, int blockLen, int blockCount, int offset) -> enable_if_t + { + LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset(0).template ReinterpretCast(); + int remainLen = (baseWidth % blockCount) / TWO_TIMES; + auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); + SetFlag(eventIDVToS); + WaitFlag(eventIDVToS); + LocalTensor src1Pattern; + src1Pattern = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset( + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() / TWO_TIMES).template ReinterpretCast(); + LocalTensor tmpSrc = trans.template ReinterpretCast(); + src1Pattern.SetSize(PATTERN_SIZE); + src1Pattern.SetValue(0, 0xFFFF << remainLen); + src1Pattern.SetValue(1, (1 << remainLen) - 1); + for (int i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { + src1Pattern.SetValue(i, 0); + } + int orinRemain = baseWidth % blockCount; + int gmOffset = blockCount * (blockLen - PATTERN_OFFSET); + for (int i = 0; i < baseHeight; ++i) { + DataCopy(gm[i * offset], trans[i * blockLen * blockCount], + { 1, static_cast(blockLen - 1), 0, 0 }); + if (baseWidth % TWO_TIMES == 0) { + auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V); + SetFlag(enQueEvtID); + WaitFlag(enQueEvtID); + GatherMaskParams gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); + uint64_t rsvdCnt = 0; + GatherMask(transAligin, tmpSrc[((i + 1) * blockLen - PATTERN_OFFSET) * BLOCK_CUBE], src1Pattern, + false, 0, gatherMaskParams, rsvdCnt); + LocalTensor tmpTrans = transAligin.template ReinterpretCast(); + DataCopy(gm[i * offset + gmOffset + remainLen * DOUBLE_SPACE], tmpTrans, { 1, 1, 0, 0 }); + PipeBarrier(); + } else { + auto eventIDMTE3ToS = GetTPipePtr()->FetchEventID(HardEvent::MTE3_S); + SetFlag(eventIDMTE3ToS); + WaitFlag(eventIDMTE3ToS); + LocalTensor tmpTrans = transAligin.template ReinterpretCast(); + for (int j = 0; j < ONE_BLK_SIZE; ++j) { + tmpTrans.SetValue(j, trans[((i + 1) * blockLen - PATTERN_OFFSET) * blockCount + orinRemain].GetValue(j)); + } + auto eventIDSToMTE3 = GetTPipePtr()->FetchEventID(HardEvent::S_MTE3); + SetFlag(eventIDSToMTE3); + WaitFlag(eventIDSToMTE3); + DataCopy(gm[i * offset + gmOffset + orinRemain], tmpTrans, { 1, 1, 0, 0 }); + PipeBarrier(); + } + } + } + + template + __aicore__ inline auto CopyTrans2GMByVecByLineUnalign(const GlobalTensor& gm, const LocalTensor& trans, + int baseHeight, int baseWidth, int baseBlockHeight, int baseBlockWidth, int blockLen, int blockCount, int offset) -> enable_if_t + { + LocalTensor transAligin = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset(0).template ReinterpretCast(); + int remainLen = baseWidth % blockCount; + auto eventIDVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S); + SetFlag(eventIDVToS); + WaitFlag(eventIDVToS); + LocalTensor src1Pattern; + src1Pattern = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset( + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength() / TWO_TIMES).template ReinterpretCast(); + src1Pattern.SetSize(PATTERN_SIZE); + src1Pattern.SetValue(0, 0xFFFF << remainLen); + src1Pattern.SetValue(1, (1 << remainLen) - 1); + for (int i = PATTERN_OFFSET; i < PATTERN_SIZE; ++i) { + src1Pattern.SetValue(i, 0); + } + int gmOffset = blockCount * (blockLen - PATTERN_OFFSET); + for (int i = 0; i < baseHeight; ++i) { + DataCopy(gm[i * offset], trans[i * blockLen * blockCount], + { 1, static_cast(blockLen - 1), 0, 0 }); + GatherMaskParams 
gatherMaskParams(1, 1, PATTERN_SIZE, PATTERN_SIZE); + uint64_t rsvdCnt = 0; + auto enQueEvtID = GetTPipePtr()->FetchEventID(HardEvent::MTE3_V); + SetFlag(enQueEvtID); + WaitFlag(enQueEvtID); + GatherMask(transAligin, trans[((i + 1) * blockLen - PATTERN_OFFSET) * blockCount], + src1Pattern, false, 0, gatherMaskParams, rsvdCnt); + DataCopy(gm[i * offset + gmOffset + remainLen], transAligin, { 1, 1, 0, 0 }); + PipeBarrier(); + } + } + + __aicore__ inline void CopyTrans2GMByVecNormal(const GlobalTensor& gm, const LocalTensor& trans, + int baseHeight, int blockLen, int dstStride) + { + DataCopy(gm, trans, { static_cast(baseHeight), static_cast(blockLen), 0, + static_cast(dstStride) }); + } + + template + __aicore__ inline void CopyLocal2GMNZ2NDOnTheFly(const GlobalTensor& gm, const LocalTensor& localBuf, + int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, + int32_t baseBlockHeight, int32_t baseBlockWidth) + { + uint32_t dimN = (MATMUL_CAST_TO_IMPL()->Kc_ != 0) ? MATMUL_CAST_TO_IMPL()->Kc_ : MATMUL_CAST_TO_IMPL()->N_; + constexpr int oneBlockCount = ONE_BLK_SIZE / sizeof(DstT); + constexpr int blockCount = sizeof(DstT) == B32_BYTE_SIZE ? BLOCK_CUBE : oneBlockCount; + int calcWidth = baseWidth / blockCount; + int dstOffset = curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * dimN + curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN(); + int blockLen = blockCount * sizeof(DstT) / ONE_BLK_SIZE; + int srcRepeatGap = (baseBlockHeight * BLOCK_CUBE * blockCount - blockCount) * sizeof(DstT) / ONE_BLK_SIZE; + int tail = baseWidth % blockCount; + LocalTensor trans = MATMUL_MODULE(LocalWorkspace)->GetNZ2NDWorkspace().template ReinterpretCast(); + trans.SetSize(blockCount); + + int offset = dimN; + if constexpr (enSequentialWrite) { + dstOffset = 0; + offset = baseWidth; + } + + if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { + offset = Ceil(offset, blockCount) * blockCount; + calcWidth = baseBlockWidth; + tail = 0; + } + + // Allocate MTE2_MTE3 eventId: eventIDMte3ToMte2 + event_t eventIDMte3ToMte2 = static_cast(GetTPipePtr()->AllocEventID()); + + for (int i = 0; i < baseHeight; i++) { + if (calcWidth > 0) { + DataCopy(gm[dstOffset + i * offset], localBuf[i * blockCount], + { static_cast(calcWidth), static_cast(blockLen), + static_cast(srcRepeatGap), 0 }); + if constexpr (IsSameType::value && + IsSameType::value) { + PipeBarrier(); + } + } + + if (tail != 0) { + int SrcTypeailOffset = i * blockCount + calcWidth * blockCount * Ceil(baseHeight, blockCount) * blockCount; + if (baseWidth * sizeof(DstT) > ONE_BLK_SIZE) { + int dstTailOffset = dstOffset + i * offset + calcWidth * blockCount; + int basicOffset = 0; + if (sizeof(DstT) == B32_BYTE_SIZE) { + DataCopy(gm[dstTailOffset], localBuf[SrcTypeailOffset], { 1, 1, 0, 0 }); + basicOffset = oneBlockCount; + } + + // reg_mov + SrcTypeailOffset = SrcTypeailOffset + basicOffset - + blockCount * Ceil(baseHeight, blockCount) * blockCount + baseWidth % blockCount; + dstTailOffset = dstTailOffset + basicOffset + baseWidth % blockCount - blockCount; + if constexpr (IsSameType::value && + IsSameType::value) { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_S)); + SetFlag(eventID); + WaitFlag(eventID); + } + int j = 0; + for (int i = 0; i < blockCount - baseWidth % blockCount; j++, i++) { + DstT scalar = localBuf.GetValue(SrcTypeailOffset + i); + trans.SetValue(j, scalar); + } + SrcTypeailOffset = i * blockCount + calcWidth * blockCount * Ceil(baseHeight, blockCount) * blockCount; + for (int i = 
0; i < baseWidth % blockCount; j++, i++) { + DstT scalar = localBuf.GetValue(SrcTypeailOffset + i); + trans.SetValue(j, scalar); + } + + event_t eventIDSToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3)); + SetFlag(eventIDSToMte3); + WaitFlag(eventIDSToMte3); + // copy the tail from ub to gm + DataCopy(gm[dstTailOffset], trans, { 1, 1, 0, 0 }); + if constexpr (IsSameType::value && + IsSameType::value) { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_S)); + SetFlag(eventID); + WaitFlag(eventID); + } + } else { + if (i > 0) { + WaitFlag(eventIDMte3ToMte2); + } + if constexpr (IsSameType::value && + IsSameType::value) { + event_t eventID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE2)); + SetFlag(eventID); + WaitFlag(eventID); + } + DataCopy(trans, gm[dstOffset + i * offset + baseWidth], { 1, 1, 0, 0 }); + event_t eventIDMte2ToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3)); + SetFlag(eventIDMte2ToMte3); + WaitFlag(eventIDMte2ToMte3); + DataCopy(gm[dstOffset + i * offset], localBuf[SrcTypeailOffset], { 1, 1, 0, 0 }); + PipeBarrier(); + DataCopy(gm[dstOffset + i * offset + baseWidth], trans, { 1, 1, 0, 0 }); + if (i < baseHeight - 1) { + SetFlag(eventIDMte3ToMte2); + } + } + } + } + event_t eventID = static_cast(GetTPipePtr()->AllocEventID()); + SetFlag(eventID); + WaitFlag(eventID); + // Release MTE2_MTE3 eventId: eventIDMte3ToMte2 + GetTPipePtr()->ReleaseEventID(eventIDMte3ToMte2); + } + + __aicore__ inline LocalTensor GetLocalBuf() + { + LocalTensor localBuf = MATMUL_MODULE(LocalWorkspace)->GetCopy2Co2Workspace().template ReinterpretCast(); + localBuf.SetSize(MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + return localBuf; + } + + __aicore__ inline void UpdateDataCopyParamForQuant(DataCopyEnhancedParams& enhancedParams, int curCol) + { + if constexpr (IsSameType::value) { + if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::DEQF16) { + enhancedParams.deqScale = DeqScale::DEQ16; + enhancedParams.deqValue = MATMUL_MODULE(MatmulQuantProcessor)->GetQuantScalarValue(); + } else if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VDEQF16) { + enhancedParams.deqScale = DeqScale::VDEQ16; + LocalTensor quantLocalTensor; + MATMUL_MODULE(MatmulQuantProcessor)->CopyQuantTensor(quantLocalTensor, curCol, MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + enhancedParams.deqTensorAddr = reinterpret_cast(quantLocalTensor.GetPhyAddr()); + } + } else if constexpr (IsSameType::value || IsSameType::value) { + enhancedParams.sidStoreMode = (uint8_t)TWO_TIMES; + if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::QF322B8_PRE || + MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::REQ8) { + enhancedParams.deqScale = DeqScale::DEQ8; + enhancedParams.deqValue = MATMUL_MODULE(MatmulQuantProcessor)->GetQuantScalarValue(); + } else if (MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VQF322B8_PRE || + MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode() == QuantMode_t::VREQ8) { + enhancedParams.deqScale = DeqScale::VDEQ8; + LocalTensor quantLocalTensor; + MATMUL_MODULE(MatmulQuantProcessor)->CopyQuantTensor(quantLocalTensor, curCol, MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); + enhancedParams.deqTensorAddr = reinterpret_cast(quantLocalTensor.GetPhyAddr()); + } + } + } +}; +} // namespace Detail +} // namespace 
Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h index 4ed5a0f6e3e756784af644fd15695c9b8f1db7bb..7f1aadba8aac6ebea2d63a158ce2a034e1b3cdc0 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h @@ -19,227 +19,194 @@ #include "../../matmul_module.h" #include "../../matmul_param.h" #include "../../resource/cube_out_buffer/cube_out_buffer.h" -#include "../../matmul_var.h" -#include "copy_cube_intf.h" +#include "copy_cube_out_intf.h" #include "../../feature_trait/matmul_feature_trait.h" +#include "../quant/quant_processor_utils.h" #include "copy_cube_out_utils.h" -#include "copy_cube_data_warp.h" - -namespace matmul { - -constexpr static uint8_t FIX_PIPE_UNIT_FLAG = 3; +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CopyCubeOut is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeOut is only for internal usage, does not support extension or customized specialization! +*/ template -class CopyCubeOut +class CopyCubeOut::IsNeedUB())>> { - using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; - using FixpipeType = FixpipeParamsUtil::GetFixpipeParamsType()>; + using SrcT = typename GetDstType::Type; + using FixpipeAdaptor = FixpipeParamsUtil::GetFixpipeParamsType()>; - MATMUL_USE_MODULE(QuantProcessor); - MATMUL_USE_MODULE(CubeOutBuffer); - MATMUL_USE_MODULE(MatmulVarC); - MATMUL_USE_MODULE(MatmulShapeInfoC); MATMUL_USE_MODULE(Context); - MATMUL_USE_MODULE(DataWarp); + MATMUL_USE_MODULE(MatmulQuantProcessor); + MATMUL_USE_MODULE(MatmulShapeInfo); + MATMUL_USE_MODULE(MatmulShapeTiling); public: __aicore__ inline CopyCubeOut() = default; - __aicore__ inline void CopyOut(const GlobalTensor& gm, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) + template + __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) { if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { - if (!MATMUL_CAST_TO_IMPL()->intraBlockMatmul.fakeMsg) { - CopyOutImpl, true>(gm, co1Local, curM, curN, enSequentialWrite); + if (!MATMUL_CONST_INTRA_BLOCK.fakeMsg) { + CopyOutImpl, true>(gm, co1Local, curRow, curCol, baseHeight, + baseWidth, baseBlockHeight, baseBlockWidth); return; } } - CopyOutImpl, false>(gm, co1Local, curM, curN, enSequentialWrite); + CopyOutImpl, false>(gm, co1Local, curRow, curCol, baseHeight, + baseWidth, baseBlockHeight, baseBlockWidth); } - __aicore__ inline void CopyOut(const LocalTensor& co2Local, const LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) + template + __aicore__ inline void Copy(const LocalTensor& co2Local, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) { - CopyOutImpl(co2Local, co1Local, curM, curN, enSequentialWrite); + CopyOutImpl(co2Local, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, + baseBlockWidth); } private: - template - __aicore__ inline void CopyOutImpl(const T& dst, const 
LocalTensor& co1Local, - int curM, int curN, bool enSequentialWrite) + template + __aicore__ inline void CopyOutImpl(const T& dst, const LocalTensor& co1Local, + int curRow, int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, int32_t baseBlockWidth) { if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - CopyOutNZ2ND(dst, co1Local, curM, curN, enSequentialWrite); + CopyOutNZ2ND(dst, co1Local, curRow, curCol, baseHeight, + baseWidth, baseBlockHeight, baseBlockWidth); } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - CopyOutNZ2NZ(dst, co1Local, curM, curN, enSequentialWrite); + CopyOutNZ2NZ(dst, co1Local, curRow, curCol, baseHeight, + baseWidth, baseBlockHeight, baseBlockWidth); } else { - ASCENDC_ASSERT(false, {KERNEL_LOG(KERNEL_ERROR, "CopyOut: unsupport Matmul format type.");}); + ASCENDC_ASSERT(false, {KERNEL_LOG(KERNEL_ERROR, "Copy: unsupport Matmul format type.");}); } } - template - __aicore__ inline void CopyOutNZ2ND(const T& dst, const LocalTensor& co1Local, int curM, int curN, bool enSequentialWrite) + template + __aicore__ inline void CopyOutNZ2ND(const T& dst, const LocalTensor& co1Local, int curRow, int curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) { - auto stride = MATMUL_MODULE(MatmulShapeInfoC)->template GetBaseUseWidth(); + auto stride = baseWidth; int64_t dstOffset = 0; - if (!enSequentialWrite) { + if constexpr (!enSequentialWrite) { stride = GetOrgWidth(); - dstOffset = static_cast(static_cast(curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * stride)) + - static_cast(curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth()); - } - FixpipeType fixpipeParams(MATMUL_MODULE(MatmulShapeInfoC)->template GetBaseUseWidth(), - MATMUL_MODULE(MatmulShapeInfoC)->template GetBaseUseHeight(), - MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight(), - MATMUL_MODULE(MatmulShapeInfoC)->template GetBlockNumAlongWidth(), - stride); - if constexpr (EnUnitFlag(MM_CFG)) { - fixpipeParams.params.unitFlag = FIX_PIPE_UNIT_FLAG; + dstOffset = static_cast(static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) * stride) + + static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()); } - CopyNDTensor(dst[dstOffset], co1Local, fixpipeParams.params, curN, MATMUL_MODULE(MatmulShapeInfoC)->GetBaseUseWidth()); + FixpipeAdaptor fixpipe(baseWidth, + baseHeight, + baseBlockWidth, + baseBlockHeight, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), + stride); + + CopyTensor(dst[dstOffset], co1Local, fixpipe, curCol, baseWidth); } - template - __aicore__ inline void CopyOutNZ2NZ(const T& dst, const LocalTensor& co1Local, int curM, int curN, bool enSequentialWrite) + template + __aicore__ inline void CopyOutNZ2NZ(const T& dst, const LocalTensor& co1Local, int curRow, int curCol, + int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth) { int64_t dstOffset = 0; uint32_t stride = 0; - uint32_t burstLen = 0; -#if __CCE_AICORE__ == 220 - burstLen = static_cast(MATMUL_MODULE(MatmulShapeInfoC)->template GetBaseUseHeight() * - BLOCK_CUBE * sizeof(L0cT) / ONE_BLK_SIZE) * sizeof(DstT) / sizeof(L0cT); -#endif - if (!enSequentialWrite) { - dstOffset = curN * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseWidth() - * MATMUL_MODULE(DataWarp)-> template GetM() - + curM * MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight() * BLOCK_CUBE; - stride = static_cast((MATMUL_MODULE(DataWarp)-> template GetM() - 
MATMUL_MODULE(MatmulShapeInfoC)-> template GetBaseUseHeight()) * - BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE) + burstLen; + if constexpr (!enSequentialWrite) { + dstOffset = static_cast(curCol * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseN()) + * GetOrgM() + + static_cast(curRow * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM()) * BLOCK_CUBE; + stride = static_cast((GetOrgM() - baseHeight) * + BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); } else { - stride = static_cast((MATMUL_MODULE(MatmulShapeInfoC)->template GetBlockNumAlongHeight() * BLOCK_CUBE - - MATMUL_MODULE(MatmulShapeInfoC)->template GetBaseUseHeight()) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE) + burstLen; + stride = static_cast((baseBlockHeight * BLOCK_CUBE - baseHeight) * BLOCK_CUBE * sizeof(DstT) / ONE_BLK_SIZE); } - FixpipeType fixpipeParams(MATMUL_MODULE(MatmulShapeInfoC)->template GetBaseUseWidth(), - MATMUL_MODULE(MatmulShapeInfoC)->template GetBaseUseHeight(), - MATMUL_MODULE(MatmulShapeInfoC)->GetBaseHeight(), - MATMUL_MODULE(MatmulShapeInfoC)->template GetBlockNumAlongWidth(), - stride); - if constexpr (EnUnitFlag(MM_CFG)) { - fixpipeParams.params.unitFlag = FIX_PIPE_UNIT_FLAG; - } - CopyNZTensor(dst[dstOffset], co1Local, fixpipeParams.params, curN, MATMUL_MODULE(MatmulShapeInfoC)-> template GetBaseUseWidth()); + FixpipeAdaptor fixpipe(baseWidth, + baseHeight, + baseBlockWidth, + baseBlockHeight, + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetBaseM(), + stride); + CopyTensor(dst[dstOffset], co1Local, fixpipe, curCol, baseWidth); } template - __aicore__ inline void CopyNDTensor(const T& dst, const LocalTensor& co1Local, - FixpipeParamsV220& fixpipeParams, const int32_t curN = 0, const int32_t baseUseN = 0) + __aicore__ inline void CopyTensor(const T& dst, const LocalTensor& co1Local, + FixpipeAdaptor& fixpipe, const int32_t curN = 0, const int32_t baseUseN = 0) { - if constexpr (IsSetQuant()) { - fixpipeParams.quantPre = MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode(); - LocalTensor l1TmpForQuant; - if (MATMUL_MODULE(QuantProcessor)->IsPerChannelSenario()) { - MATMUL_MODULE(QuantProcessor)->CopyQuantTensor(l1TmpForQuant, curN, baseUseN); - Fixpipe(dst, co1Local, l1TmpForQuant, fixpipeParams); - MATMUL_MODULE(QuantProcessor)->FreeTmpQuantTensor(l1TmpForQuant); + fixpipe.SetCastMode(); + if constexpr (IsQuantSenario()) { + fixpipe.SetQuantMode(MATMUL_MODULE(MatmulQuantProcessor)->GetMatmulQuantMode()); + LocalTensor quantTensor; + if (MATMUL_MODULE(MatmulQuantProcessor)->IsPerChannelSenario()) { + MATMUL_MODULE(MatmulQuantProcessor)->CopyQuantTensor(quantTensor, curN, baseUseN); + fixpipe.template FixpipeOut(dst, co1Local, quantTensor); + MATMUL_MODULE(MatmulQuantProcessor)->FreeQuantTensor(quantTensor); } else { - fixpipeParams.deqScalar = MATMUL_MODULE(QuantProcessor)->GetQuantScalarValue(); - Fixpipe(dst, co1Local, fixpipeParams); + fixpipe.SetQuantScalar(MATMUL_MODULE(MatmulQuantProcessor)->GetQuantScalarValue()); + fixpipe.template FixpipeOut(dst, co1Local); } } else { - Fixpipe(dst, co1Local, fixpipeParams); + fixpipe.template FixpipeOut(dst, co1Local); } } - template - __aicore__ inline void CopyNDTensor(const T& dst, const LocalTensor& co1Local, - FixpipeParams& fixpipeParams, const int32_t curN = 0, const int32_t baseUseN = 0) + template + __aicore__ inline uint32_t GetOrgWidth() { - if constexpr (IsSetQuant()) { - LocalTensor l1TmpForQuant; - if (MATMUL_MODULE(QuantProcessor)->IsPerChannelSenario()) { - fixpipeParams.quantParams = { MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode(), - 
MATMUL_MODULE(QuantProcessor)->GetQuantScalarValue() }; - MATMUL_MODULE(QuantProcessor)->CopyQuantTensor(l1TmpForQuant, curN, baseUseN); - Fixpipe(dst, co1Local, l1TmpForQuant, fixpipeParams); - MATMUL_MODULE(QuantProcessor)->FreeTmpQuantTensor(l1TmpForQuant); - } else { - fixpipeParams.quantParams = { MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() }; - Fixpipe(dst, co1Local, fixpipeParams); - } - } else { - Fixpipe(dst, co1Local, fixpipeParams); + uint32_t dimN = GetOrgN(); + if (GetOrgKc() != 0) { + dimN = GetOrgKc(); } - } - - template - __aicore__ inline void CopyNZTensor(const T& dst, const LocalTensor& co1Local, - FixpipeParamsV220& fixpipeParams, const int32_t curN = 0, const int32_t baseUseN = 0) - { - if constexpr (IsSetQuant()) { - fixpipeParams.quantPre = MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode(); - LocalTensor l1TmpForQuant; - if (MATMUL_MODULE(QuantProcessor)->IsPerChannelSenario()) { - MATMUL_MODULE(QuantProcessor)->CopyQuantTensor(l1TmpForQuant, curN, baseUseN); - Fixpipe(dst, co1Local, l1TmpForQuant, fixpipeParams); - MATMUL_MODULE(QuantProcessor)->FreeTmpQuantTensor(l1TmpForQuant); - } else { - fixpipeParams.deqScalar = MATMUL_MODULE(QuantProcessor)->GetQuantScalarValue(); - Fixpipe(dst, co1Local, fixpipeParams); - } - } else { - Fixpipe(dst, co1Local, fixpipeParams); + constexpr uint32_t blockCount = ONE_BLK_SIZE / sizeof(DstT); + if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { + dimN = Ceil(dimN, blockCount) * blockCount; } + return dimN; } - template - __aicore__ inline void CopyNZTensor(const T& dst, const LocalTensor& co1Local, - FixpipeParams& fixpipeParams, const int32_t curN = 0, const int32_t baseUseN = 0) + template + __aicore__ inline uint32_t GetOrgKc() { - if constexpr (IsSetQuant()) { - LocalTensor l1TmpForQuant; - if (MATMUL_MODULE(QuantProcessor)->IsPerChannelSenario()) { - fixpipeParams.quantParams = { MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode(), - MATMUL_MODULE(QuantProcessor)->GetQuantScalarValue() }; - MATMUL_MODULE(QuantProcessor)->CopyQuantTensor(l1TmpForQuant, curN, baseUseN); - Fixpipe(dst, co1Local, l1TmpForQuant, fixpipeParams); - MATMUL_MODULE(QuantProcessor)->FreeTmpQuantTensor(l1TmpForQuant); - } else { - fixpipeParams.quantParams = { MATMUL_MODULE(QuantProcessor)->GetMatmulQuantMode() }; - Fixpipe(dst, co1Local, fixpipeParams); - } + if constexpr ((C_TYPE::layout == LayoutMode::SBNGD) || (C_TYPE::layout == LayoutMode::BSNGD)) { + return 0; } else { - Fixpipe(dst, co1Local, fixpipeParams); + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgKc(); } } - __aicore__ inline constexpr static bool IsSetQuant() + template + __aicore__ inline uint32_t GetOrgM() { - if constexpr (IsSameType::value && IsSameType::value) { - return true; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { - return true; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { - return true; + if constexpr (C_TYPE::layout == LayoutMode::SBNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoB() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoS1(); + } else if constexpr (C_TYPE::layout == LayoutMode::BSNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoS1(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgM(); } - return false; } template - __aicore__ inline uint32_t GetOrgWidth() + __aicore__ inline uint32_t GetOrgN() { - uint32_t dimN = MATMUL_MODULE(DataWarp)->template GetN(); - if 
(MATMUL_MODULE(DataWarp)->template GetK() != 0) { - dimN = MATMUL_MODULE(DataWarp)-> template GetK(); - } - constexpr uint32_t blockCount = ONE_BLK_SIZE / sizeof(DstT); - if constexpr (C_TYPE::format == CubeFormat::ND_ALIGN) { - dimN = Ceil(dimN, blockCount) * blockCount; + if constexpr (C_TYPE::layout == LayoutMode::SBNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoG() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoS2() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoN() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoB(); + } else if constexpr (C_TYPE::layout == LayoutMode::BSNGD) { + return MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoG() * MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoS2() * + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetCLayoutInfoN(); + } else { + return MATMUL_MODULE(MatmulShapeInfo)->template GetOrgN(); } - return dimN; } }; -} -#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_FIXPIPE_H \ No newline at end of file +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_FIXPIPE_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h new file mode 100644 index 0000000000000000000000000000000000000000..9ebbddc12906d95b5bd4588170b9b85c6ffd3ca1 --- /dev/null +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h @@ -0,0 +1,76 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file copy_cube_out_intf.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_INTF_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_INTF_H + +#include "../../feature_trait/matmul_chip_cap.h" +namespace Gemm { +namespace Impl { +namespace Detail { +/* + CopyCubeOut is considered entirely experimental. + We retain the freedom to make incompatible changes, but do not guarantee the stability. + CopyCubeOut is only for internal usage, does not support extension or customized specialization! 
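+ Illustrative usage (editorial sketch, not part of the original notice; the module name and call site are assumptions based on the MATMUL_MODULE pattern used elsewhere in this patch): a scheduling stage would invoke the module roughly as MATMUL_MODULE(CopyCubeOut)->template Copy<enSequentialWrite>(gm, co1Local, curRow, curCol, baseHeight, baseWidth, baseBlockHeight, baseBlockWidth); the bool template parameter selects a compact sequential write versus a write at the block's offset in the original output layout.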
+*/ +template +class CopyCubeOut +{ + using DstT = typename C_TYPE::T; + using SrcT = typename GetDstType::Type; +public: + + /** + * @description: Copy data from L0C to GlobalTensor + * @param: gm: The copy dst address + * @param: co1Local: The L0C address where the matmul result is stored + * @param: curRow: The row index of the currently handled block + * @param: curCol: The column index of the currently handled block + * @param: baseHeight: The height of the currently handled block, loaded this time + * @param: baseWidth: The width of the currently handled block, loaded this time + * @param: baseBlockHeight: The number of cube blocks along the height of the currently handled block + * @param: baseBlockWidth: The number of cube blocks along the width of the currently handled block + * @param: enSequentialWrite: The write mode on the dst address, sequential (compact) or at the block's offset in the original layout + * @return: void + */ + template + __aicore__ inline void Copy(const GlobalTensor& gm, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) + {} + + /** + * @description: Copy data from L0C to LocalTensor + * @param: co2Local: The copy dst address + * @param: co1Local: The L0C address where the matmul result is stored + * @param: curRow: The row index of the currently handled block + * @param: curCol: The column index of the currently handled block + * @param: baseHeight: The height of the currently handled block, loaded this time + * @param: baseWidth: The width of the currently handled block, loaded this time + * @param: baseBlockHeight: The number of cube blocks along the height of the currently handled block + * @param: baseBlockWidth: The number of cube blocks along the width of the currently handled block + * @param: enSequentialWrite: The write mode on the dst address, sequential (compact) or at the block's offset in the original layout + * @return: void + */ + template + __aicore__ inline void Copy(const LocalTensor& co2Local, const LocalTensor& co1Local, int curRow, + int curCol, int32_t baseHeight, int32_t baseWidth, int32_t baseBlockHeight, + int32_t baseBlockWidth, const ScheduleContext& context = 0) + {} +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_INTF_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h index 9d67ed45e68357dfbc62ce331417f7f7ba6c3f49..ea84dfe8123701635241c91533dbb0c6a9abd198 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h @@ -1,115 +1,127 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file copy_cube_out_utils.h - * \brief - */ - -#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H -#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H - -namespace matmul { - -template -struct FixpipeParamsUtil { - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using TYPE = FixpipeParamsV220; - -public: - __aicore__ inline ~FixpipeParamsUtil() = default; - - __aicore__ inline FixpipeParamsUtil(int32_t iterateBaseWidth, int32_t iterateBaseHeight, - int32_t baseHeight, int32_t baseBlockN, int32_t stride) {} - -public: - TYPE params; -}; - -template -class FixpipeParamsUtil -{ - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using TYPE = FixpipeParamsV220; - -public: - __aicore__ inline ~FixpipeParamsUtil() = default; - - __aicore__ inline FixpipeParamsUtil(int32_t iterateBaseWidth, int32_t iterateBaseHeight, - int32_t baseHeight, int32_t baseBlockN, int32_t stride) - { - if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - params.nSize = static_cast(iterateBaseWidth); - } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - params.nSize = static_cast(baseBlockN * BLOCK_CUBE); - } - params.mSize = static_cast(iterateBaseHeight); - params.srcStride = Align((IsStaticPaddingEnable(MM_CFG) ? baseHeight : iterateBaseHeight), BLOCK_CUBE); - params.dstStride = stride; - SetFixpipeQuantPre(); - } - -public: - TYPE params; - -private: - __aicore__ inline constexpr void SetFixpipeQuantPre() - { -#if __CCE_AICORE__ >= 220 - if constexpr (IsSameType::value && !IsSameType::value) { - params.quantPre = QuantMode_t::F322F16; - } else if constexpr (IsSameType::value && !IsSameType::value) { - params.quantPre = QuantMode_t::F322BF16; - } -#endif - } -}; - -template -class FixpipeParamsUtil -{ - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using L0cT = typename GetDstType::Type; - using TYPE = FixpipeParams; - -public: - __aicore__ inline ~FixpipeParamsUtil() = default; - __aicore__ inline FixpipeParamsUtil(int32_t iterateBaseWidth, int32_t iterateBaseHeight, - int32_t baseHeight, int32_t baseBlockN, int32_t stride) - { - params.cburstNum = baseBlockN; - params.burstLen = static_cast(iterateBaseHeight * BLOCK_CUBE * sizeof(L0cT) / ONE_BLK_SIZE); - params.srcStride = stride; - if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - params.nz2ndParams = { true, 1, 0, 0, static_cast(iterateBaseWidth) }; - } - SetFixpipeQuantPre(); - } - -public: - TYPE params; - -private: - __aicore__ inline constexpr void SetFixpipeQuantPre() - { -#if __CCE_AICORE__ >= 220 - if constexpr (IsSameType::value && !IsSameType::value) { - params.quantParams = { QuantMode_t::F322F16 }; - } else if constexpr (IsSameType::value && !IsSameType::value) { - params.quantParams = { QuantMode_t::F322BF16 }; - } -#endif - } -}; -} +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file copy_cube_out_utils.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H +#define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H + +namespace Gemm { +namespace Impl { +namespace Detail { + +const static uint8_t FIX_PIPE_UNIT_FLAG = 3; + +template +struct FixpipeParamsUtil { + using DstT = typename C_TYPE::T; + using SrcT = typename GetDstType::Type; + using TYPE = FixpipeParamsV220; + +public: + __aicore__ inline ~FixpipeParamsUtil() = default; + + __aicore__ inline FixpipeParamsUtil(int32_t nSize, int32_t mSize, + int32_t nSizeBlock, int32_t mSizeBlock, int32_t baseHeight, int32_t dstStride) + {} + + __aicore__ inline void SetQuantMode(QuantMode_t quantMode) {} + + __aicore__ inline void SetQuantScalar(uint64_t scalar) {} + + template + __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal, + const LocalTensor& quantTensor) {} + + template + __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal) {} + +public: + TYPE params_; +}; + + +template +struct FixpipeParamsUtil +{ + using DstT = typename C_TYPE::T; + using SrcT = typename GetDstType::Type; + using TYPE = FixpipeParamsV220; + +public: + __aicore__ inline ~FixpipeParamsUtil() = default; + + __aicore__ inline FixpipeParamsUtil(int32_t nSize, int32_t mSize, + int32_t nSizeBlock, int32_t mSizeBlock, int32_t baseHeight, int32_t dstStride) + { + if constexpr(C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + params_.nSize = static_cast(nSize); + } else if constexpr (C_TYPE::format == CubeFormat::NZ) { + params_.nSize = static_cast(nSizeBlock * BLOCK_CUBE); + dstStride = dstStride + static_cast(mSize * BLOCK_CUBE * sizeof(SrcT) / ONE_BLK_SIZE) * + sizeof(DstT) / sizeof(SrcT); + } + params_.mSize = static_cast(mSize); + params_.srcStride = CeilAlign((IsStaticPaddingEnable(MM_CFG) ? 
baseHeight : mSize), BLOCK_CUBE); + params_.dstStride = dstStride; + if constexpr(EnUnitFlag(MM_CFG)) { + params_.unitFlag = FIX_PIPE_UNIT_FLAG; + } + } + + __aicore__ inline void SetQuantMode(QuantMode_t quantMode) + { + params_.quantPre = quantMode; + } + + __aicore__ inline void SetQuantScalar(uint64_t scalar) + { + params_.deqScalar = scalar; + } + + template + __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal, const LocalTensor& quantTensor) + { + if constexpr (C_TYPE::format == CubeFormat::NZ) { + Fixpipe(dst, colLocal, quantTensor, params_); + } else { + Fixpipe(dst, colLocal, quantTensor, params_); + } + } + + template + __aicore__ inline void FixpipeOut(const T& dst, const LocalTensor& colLocal) + { + if constexpr (C_TYPE::format == CubeFormat::NZ) { + Fixpipe(dst, colLocal, params_); + } else { + Fixpipe(dst, colLocal, params_); + } + } + + __aicore__ inline constexpr void SetCastMode() + { + if constexpr (IsSameType::value && IsSameType::value) { + params_.quantPre = QuantMode_t::F322F16; + } else if constexpr (IsSameType::value && IsSameType::value) { + params_.quantPre = QuantMode_t::F322BF16; + } + } + +public: + TYPE params_; +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_datacopy.h b/impl/matmul/modules/stage/quant/quant_processor_datacopy.h index 9319aa3ea1b70829eaaef9882b3763e149e37376..99ee489f8d80ada987958ac53ad6fe54212864d8 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_datacopy.h +++ b/impl/matmul/modules/stage/quant/quant_processor_datacopy.h @@ -20,19 +20,22 @@ #include "../../matmul_param.h" #include "quant_processor_intf.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { template -class QuantProcessor +class MatmulQuantProcessor::IsNeedUB())>> { using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; - MATMUL_USE_MODULE(MatmulVarC); + MATMUL_USE_MODULE(MatmulShapeTiling); + MATMUL_USE_MODULE(LocalWorkspace); public: - __aicore__ inline QuantProcessor() {} - __aicore__ inline ~QuantProcessor() {} + __aicore__ inline MatmulQuantProcessor() {} + __aicore__ inline ~MatmulQuantProcessor() {} __aicore__ inline void Init(const int32_t baseN) {} @@ -75,10 +78,10 @@ public: } } - __aicore__ inline void CopyQuantTensor(LocalTensor& tempQuantTensor, + __aicore__ inline void CopyQuantTensor(LocalTensor& quantTensor, const int32_t curN, const int32_t baseUseN) { - CopyQuantTensorImpl(tempQuantTensor, curN, baseUseN); + CopyQuantTensorImpl(quantTensor, curN, baseUseN); } __aicore__ inline uint64_t GetQuantScalarValue() @@ -95,39 +98,23 @@ public: __aicore__ inline bool IsPerTensorSenario() {} - __aicore__ inline void FreeTmpQuantTensor(LocalTensor& tmpQuantTensor) {} + __aicore__ inline void FreeQuantTensor(LocalTensor& tmpQuantTensor) {} __aicore__ inline void Destory() {} private: - __aicore__ inline void CopyQuantTensorImpl(LocalTensor& tempQuantTensor, + __aicore__ inline void CopyQuantTensorImpl(LocalTensor& quantTensor, const int32_t curN, const int32_t baseUseN) { - if constexpr (C_TYPE::format == CubeFormat::NZ) { - tempQuantTensor = MATMUL_MODULE(MatmulVarC) - ->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength()] - .template ReinterpretCast(); - } else if constexpr (ToMatmulConfig(MM_CFG).enVecND2NZ) { - if constexpr (!ToMatmulConfig(MM_CFG).enableUBReuse) { - constexpr int tripleSpace = 3; - 
tempQuantTensor = MATMUL_MODULE(MatmulVarC) - ->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength() * tripleSpace] - .template ReinterpretCast(); - } else { - tempQuantTensor = MATMUL_MODULE(MatmulVarC) - ->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetTransLength()] - .template ReinterpretCast(); - } - } else { - tempQuantTensor = MATMUL_MODULE(MatmulVarC) - ->GetLocalWorkspace()[MATMUL_MODULE(MatmulVarC)->GetND2NZOffset()] - .template ReinterpretCast(); - } - tempQuantTensor.SetSize(baseUseN); + quantTensor = MATMUL_MODULE(LocalWorkspace)->template + GetWorkspaceWithOffset( + MATMUL_MODULE(MatmulShapeTiling)->GetTiling().GetTransLength()) + .template ReinterpretCast(); + quantTensor.SetSize(baseUseN); auto enQueEvtID = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); SetFlag(enQueEvtID); WaitFlag(enQueEvtID); - DataCopy(tempQuantTensor, quantTensor_[curN * baseUseN], baseUseN); + DataCopy(quantTensor, quantTensor_[curN * baseUseN], baseUseN); auto eventIDMte2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); SetFlag(eventIDMte2ToV); WaitFlag(eventIDMte2ToV); @@ -138,6 +125,7 @@ private: GlobalTensor quantTensor_; uint64_t quantScalar_ = 0; }; -} - +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_DATACOPY_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h b/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h index 0bed95c2407be9f0dcb8f299cac9d73d840ed085..562ad50dde83217eb1ede72aa1e9aed0fe67b661 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h +++ b/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h @@ -19,28 +19,30 @@ #include "../../matmul_module.h" #include "../../matmul_param.h" #include "quant_processor_intf.h" +#include "quant_processor_utils.h" -namespace matmul { +namespace Gemm { +namespace Impl { +namespace Detail { template -class QuantProcessor +class MatmulQuantProcessor::Type, typename C_TYPE::T>() + && !MatmulFeatureTrait::IsNeedUB())>> { using SrcT = typename A_TYPE::T; using DstT = typename C_TYPE::T; using L0cT = typename GetDstType::Type; public: - __aicore__ inline QuantProcessor() {} - __aicore__ inline ~QuantProcessor() {} + __aicore__ inline MatmulQuantProcessor() {} + __aicore__ inline ~MatmulQuantProcessor() {} __aicore__ inline void Init(const int32_t baseN) { - if constexpr (isQuantSenario_) { - baseN_ = baseN; - isPerChannel_ = false; - isPerTensor_ = false; - MATMUL_PARAM_VAR.tpipe_->InitBuffer(qidFixPipe_, 1, baseN_ * sizeof(int64_t)); - } + baseN_ = baseN; + isPerChannel_ = false; + isPerTensor_ = false; + GetTPipePtr()->InitBuffer(qidFixPipe_, 1, baseN_ * sizeof(int64_t)); } __aicore__ inline QuantMode_t GetMatmulQuantMode() @@ -86,20 +88,18 @@ public: } } - __aicore__ inline void CopyQuantTensor(LocalTensor& tempQuantTensor, + __aicore__ inline void CopyQuantTensor(LocalTensor& quantTensor, const int32_t curN, const int32_t baseUseN) { - if constexpr (isQuantSenario_) { - if (isPerChannel_) { - tempQuantTensor = qidFixPipe_.template AllocTensor(); - if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - CopyDeqTensorToL1(tempQuantTensor, quantTensor_[curN * baseN_], baseUseN); - } else { - CopyDeqTensorToL1(tempQuantTensor, quantTensor_[curN * baseN_], baseUseN * BLOCK_CUBE); - } - qidFixPipe_.EnQue(tempQuantTensor); - qidFixPipe_.DeQue(); + if (isPerChannel_) { + quantTensor = qidFixPipe_.template AllocTensor(); + 
if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + CopyDeqTensorToL1(quantTensor, quantTensor_[curN * baseN_], baseUseN); + } else { + CopyDeqTensorToL1(quantTensor, quantTensor_[curN * baseN_], baseUseN * BLOCK_CUBE); } + qidFixPipe_.EnQue(quantTensor); + qidFixPipe_.DeQue(); } } @@ -115,44 +115,19 @@ public: __aicore__ inline bool IsPerChannelSenario() { - return isQuantSenario_ && isPerChannel_; - } - - __aicore__ inline bool IsPerTensorSenario() - { - return isQuantSenario_ && isPerTensor_; + return isPerChannel_; } - __aicore__ inline void FreeTmpQuantTensor(LocalTensor& tmpQuantTensor) + __aicore__ inline void FreeQuantTensor(LocalTensor& quantTensor) { - if constexpr (!isQuantSenario_) { - return; - } else if (isPerChannel_) { - qidFixPipe_.FreeTensor(tmpQuantTensor); + if (isPerChannel_) { + qidFixPipe_.FreeTensor(quantTensor); } } __aicore__ inline void Destory() { - if constexpr (!isQuantSenario_) { - return; - } else { - qidFixPipe_.FreeAllEvent(); - } - } - - __aicore__ inline static constexpr bool GetQuantSenario() - { - if constexpr (IsSameType::value && IsSameType::value) { - return true; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { - return true; - } else if constexpr (IsSameType::value && - (IsSameType::value || IsSameType::value)) { - return true; - } - return false; + qidFixPipe_.FreeAllEvent(); } private: @@ -185,11 +160,12 @@ private: bool isPerTensor_ = false; bool isPerChannel_ = false; QuantMode_t quantMode_ = QuantMode_t::NoQuant; - static constexpr bool isQuantSenario_ = GetQuantSenario(); TQue qidFixPipe_; GlobalTensor quantTensor_; uint64_t quantScalar_ = 0; int32_t baseN_ = 0; }; -} +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_FIXPIPE_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_intf.h b/impl/matmul/modules/stage/quant/quant_processor_intf.h index 10fb06d027dd982e9781dcb3ef6b95c4546c0ae6..852f32769ce5bcdc78f25f33cf9095196469595c 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_intf.h +++ b/impl/matmul/modules/stage/quant/quant_processor_intf.h @@ -1,100 +1,103 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -/*! 
- * \file quant_processor_intf.h - * \brief - */ - -#ifndef IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H -#define IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H - -#include "../../feature_trait/matmul_chip_cap.h" -namespace matmul { - -template -class QuantProcessor -{ -public: - __aicore__ inline QuantProcessor() {} - __aicore__ inline ~QuantProcessor() {} - - /** - * @description: Init QuantProcessor and quant params buf - * @param: baseN: the quant param len for base block cal - * @return: void - */ - __aicore__ inline void Init(const int32_t baseN) {} - - /** - * @description: Set quant Scalar mode and Scalar params - * @param: scalar params - * @return: void - */ - __aicore__ inline void SetQuantScalar(const uint64_t scalar) {} - - /** - * @description: Set quant VectorMode and vector tensor params - * @param: vector tensor params - * @return: void - */ - __aicore__ inline void SetQuantVector(const GlobalTensor& tensor) {} - - /** - * @description: Get the quant mode - * @return: quant mode - */ - __aicore__ inline QuantMode_t GetMatmulQuantMode() {} - - /** - * @description: Get the Scalar value - * @return: Scalar Value - */ - __aicore__ inline uint64_t GetQuantScalarValue() {} - - /** - * @description: Free quant param buf - * @param: tempQuantTensor: The quant params store buf for datacopy/fixpipe interface - * @param: curNIdx: The quant param block index - * @param: baseUseN: The quant param block size - * @return: void - */ - __aicore__ inline void CopyQuantTensor(LocalTensor& tempQuantTensor, - const int32_t curN, const int32_t baseUseN) {} - - /** - * @description: Update quantTensor by idx - * @param: idx: The offset in quantTensor - * @return: void - */ - __aicore__ inline void UpdateQuantTensor(int32_t idx) {} - - /** - * @description: Get the flag to district scalar or vector quant mode - * @return: void - */ - __aicore__ inline bool IsPerChannelSenario() {} - - /** - * @description: Free quant param buf for datacopy/fixpipe interface - * @param: tempQuantTensor: The quant params store buf for datacopy/fixpipe - * @return: void - */ - __aicore__ inline void FreeTmpQuantTensor(LocalTensor& tempQuantTensor) {} - - /** - * @description: Free quant param buf && free event - * @return: void - */ - __aicore__ inline void Destory() {} -}; -} - +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file quant_processor_intf.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H +#define IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H + +#include "../../feature_trait/matmul_chip_cap.h" +namespace Gemm { +namespace Impl { +namespace Detail { + +template +class MatmulQuantProcessor +{ +public: + __aicore__ inline MatmulQuantProcessor() {} + __aicore__ inline ~MatmulQuantProcessor() {} + + /** + * @description: Init MatmulQuantProcessor and quant params buf + * @param: baseN: the quant param length for the base block calculation + * @return: void + */ + __aicore__ inline void Init(const int32_t baseN) {} + + /** + * @description: Set quant Scalar mode and Scalar params + * @param: scalar: the quant scalar value + * @return: void + */ + __aicore__ inline void SetQuantScalar(const uint64_t scalar) {} + + /** + * @description: Set quant VectorMode and vector tensor params + * @param: tensor: the quant vector tensor + * @return: void + */ + __aicore__ inline void SetQuantVector(const GlobalTensor& tensor) {} + + /** + * @description: Get the quant mode + * @return: quant mode + */ + __aicore__ inline QuantMode_t GetMatmulQuantMode() {} + + /** + * @description: Get the Scalar value + * @return: Scalar Value + */ + __aicore__ inline uint64_t GetQuantScalarValue() {} + + /** + * @description: Copy the quant params into the buf used by the datacopy/fixpipe interface + * @param: quantTensor: The quant params store buf for datacopy/fixpipe interface + * @param: curN: The quant param block index + * @param: baseUseN: The quant param block size + * @return: void + */ + __aicore__ inline void CopyQuantTensor(LocalTensor& quantTensor, + const int32_t curN, const int32_t baseUseN) {} + + /** + * @description: Update quantTensor by idx + * @param: idx: The offset in quantTensor + * @return: void + */ + __aicore__ inline void UpdateQuantTensor(int32_t idx) {} + + /** + * @description: Get the flag that distinguishes scalar from vector (per-channel) quant mode + * @return: true if the per-channel (vector) quant mode is used + */ + __aicore__ inline bool IsPerChannelSenario() {} + + /** + * @description: Free quant param buf for datacopy/fixpipe interface + * @param: quantTensor: The quant params store buf for datacopy/fixpipe + * @return: void + */ + __aicore__ inline void FreeQuantTensor(LocalTensor& quantTensor) {} + + /** + * @description: Free quant param buf and free events + * @return: void + */ + __aicore__ inline void Destory() {} +}; +} // namespace Detail +} // namespace Impl +} // namespace Gemm #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_utils.h b/impl/matmul/modules/stage/quant/quant_processor_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..5324a1aebe56c3fdbaabfa4f27fd4b865c1a8a2c --- /dev/null +++ b/impl/matmul/modules/stage/quant/quant_processor_utils.h @@ -0,0 +1,40 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file quant_processor_utils.h + * \brief + */ + +#ifndef IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H +#define IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H + +namespace Gemm { +namespace Impl { +namespace Detail { + +template +__aicore__ inline constexpr static bool IsQuantSenario() +{ + if constexpr (IsSameType::value && IsSameType::value) { + return true; + } else if constexpr (IsSameType::value && + (IsSameType::value || IsSameType::value)) { + return true; + } else if constexpr (IsSameType::value && + (IsSameType::value || IsSameType::value)) { + return true; + } + return false; +} +} // namespace Detail +} // namespace Impl +} // namespace Gemm +#endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H \ No newline at end of file diff --git a/lib/matmul/constant_tiling.h b/lib/matmul/constant_tiling.h index fbfe06f46acd5f95e629db7c0be237ed2ce47ff5..4ddf535bc886f75a65dbaacc69c775a1d3927ba9 100644 --- a/lib/matmul/constant_tiling.h +++ b/lib/matmul/constant_tiling.h @@ -17,7 +17,7 @@ #include "../../impl/matmul/matmul_constant_tiling_impl.h" -namespace matmul { +namespace Gemm { template __aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig &mmCFG, int32_t l1Size = L1_SIZE) { diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 2ccc7050b53add069ab0975885341f55a5f60420..57c44d746a2ab598f3e173b6c5fe17b6826e19cb 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -20,7 +20,7 @@ #include "lib/matmul/constant_tiling.h" #include "../../impl/matmul/matmul_call_back.h" -namespace matmul { +namespace Gemm { using namespace AscendC; template @@ -113,6 +113,8 @@ public: __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {} }; -} // namespace matmul +} // namespace Gemm +// Compatible with the previously used matmul namespace +namespace matmul = Gemm; #include "../../impl/matmul/matmul_impl.h" #endif \ No newline at end of file diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h index 8665bf049ae7bc32cf5891358194bc5eabcce209..6f5f2e8b13cabcd00512b7378b6de057a7b94d44 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -26,7 +26,7 @@ #include "../../impl/matmul/matmul_server.h" #endif -namespace matmul { +namespace Gemm { using namespace AscendC; constexpr int32_t VECTOR_QUANT_MODE = 2; @@ -775,7 +775,7 @@ public: #if ASCENDC_CPU_DEBUG public: // this is useless code just for cpu debug - typename matmul::MatmulInstAux, MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> -using Matmul = matmul::MatmulImpl; +using Matmul = MatmulImpl; } template __aicore__ static T* GetCurTiling(T* t, Args&&... b) { @@ -61,10 +61,10 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... b) InitCurObj(tpipe, __VA_ARGS__) #else -namespace matmul { +namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> -using Matmul = matmul::MatmulClient; +using Matmul = MatmulClient; } #define REGIST_MATMUL_OBJ(tpipe, workspace, ...) \ if ASCEND_IS_AIC { \ @@ -113,10 +113,10 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... b) #define REGIST_MATMUL_OBJ(tpipe, workspace, ...) \ InitCurObj(tpipe, __VA_ARGS__) -namespace matmul { +namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> -using Matmul = matmul::MatmulImpl; +using Matmul = MatmulImpl; } #endif @@ -163,10 +163,10 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... 
b) #endif #define REGIST_MATMUL_OBJ_REMOTE(tpipe, workspace, ...) -namespace matmul { +namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> -using Matmul = matmul::MatmulImpl; +using Matmul = MatmulImpl; } #else #ifdef ASCENDC_TIME_STAMP_ON @@ -197,10 +197,10 @@ using Matmul = matmul::MatmulImpl, MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> -using Matmul = matmul::MatmulServiceAux; +using Matmul = MatmulServiceAux; } #endif @@ -226,10 +226,10 @@ using Matmul = matmul::MatmulServiceAux, MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> -using Matmul = matmul::MatmulClient; +using Matmul = MatmulClient; } #else @@ -268,12 +268,11 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... b) #define REGIST_MATMUL_OBJ(tpipe, workspace, ...) \ InitCurObj(tpipe, __VA_ARGS__) #endif -namespace matmul { +namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> -using Matmul = matmul::MatmulImpl; -} //namespace matmul +using Matmul = MatmulImpl; +} //namespace Gemm #endif #endif - #endif \ No newline at end of file diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp index a9fa6cd9afd0d9bf8114a3e1f3da4a8ef46d6c6e..24dbf7438ad27c54c74eec6c9a302839454c0c49 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template @@ -82,14 +82,14 @@ private: int32_t cacheProc_ = 0; }; -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: using CubeInBufferA = CustomCubeInBuffer, MM_CFG>; using CubeInBufferB = CustomCubeInBuffer, MM_CFG>; - using CopyCubeInA = CopyCubeIn, MM_CFG>; - using CopyCubeInB = CopyCubeIn, MM_CFG>; + using CopyCubeInA = Impl::Detail::CopyCubeIn, MM_CFG>; + using CopyCubeInB = Impl::Detail::CopyCubeIn, MM_CFG>; }; template ::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; using IMPL = MatmulImpl; template @@ -145,16 +135,11 @@ public: template using CopyCubeInParams = typename AscendC::Conditional::type; template - using MatmulVar = typename AscendC::Conditional::type; - template using MatmulTensorInfo = typename AscendC::Conditional::type; - template - using MatmulShapeInfo = - typename AscendC::Conditional::type; - template - using MatmulShapeTiling = typename AscendC::Conditional::type; + template using DataCopyUtils = typename AscendC::Conditional::type; + using CallBack = MM_CB; MATMUL_USE_MODULE(CopyCubeInA); @@ -184,9 +169,9 @@ public: void RunCase(int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, bool isTranspose = false) { MATMUL_MODULE(CopyCubeInA)->Init(); GlobalTensor fakeInput; - MATMUL_MODULE(CopyCubeInA)->SetInput(fakeInput.address_, isTranspose); + MATMUL_MODULE(CopyCubeInA)->SetInput(fakeInput, isTranspose); MATMUL_MODULE(CopyCubeInA)->Reset(); - auto tensor = MATMUL_MODULE(CopyCubeInA)->LoadData(curRow, curCol, tileHeight, tileWidth); + auto tensor = MATMUL_MODULE(CopyCubeInA)->template LoadData(curRow, curCol, tileHeight, tileWidth); MATMUL_MODULE(CopyCubeInA)->ClearLoadData(EMPTY_TENSOR, 0, 0); MATMUL_MODULE(CopyCubeInA)->Destroy(); } @@ -238,15 +223,15 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using A_TYPE_TRANS = matmul::MatmulType; - using A_TYPE_VECTOR = matmul::MatmulType; - using 
A_TYPE_NZ = matmul::MatmulType; - using A_TYPE_INT8 = matmul::MatmulType; - using A_TYPE_UB = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using A_TYPE_TRANS = MatmulType; + using A_TYPE_VECTOR = MatmulType; + using A_TYPE_NZ = MatmulType; + using A_TYPE_INT8 = MatmulType; + using A_TYPE_UB = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl, CustomMatmulPolicy> mm; MatmulImpl, CustomMatmulPolicy> mm2; @@ -319,4 +304,4 @@ TEST_F(TestCopyCubeInMDL, Copy_ND_TRANS_From_UB) { mm4.InitVar(tiling); mm4.SetRuntimeParams(32, 64, true); mm4.RunCase(0, 0, 32, 64); -} \ No newline at end of file +} diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp index 3f75759d2350b8c795b36d061bee0e89b19d8736..89fde654d4aad4e860136c2d4bc53a287f7cbd61 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template @@ -82,14 +82,14 @@ private: int32_t cacheProc_ = 0; }; -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: using CubeInBufferA = CustomCubeInBuffer, MM_CFG>; using CubeInBufferB = CustomCubeInBuffer, MM_CFG>; - using CopyCubeInA = CopyCubeIn, MM_CFG>; - using CopyCubeInB = CopyCubeIn, MM_CFG>; + using CopyCubeInA = Impl::Detail::CopyCubeIn, MM_CFG>; + using CopyCubeInB = Impl::Detail::CopyCubeIn, MM_CFG>; }; template ::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; using IMPL = MatmulImpl; template @@ -252,21 +252,21 @@ protected: void TearDown() {} private: - using A_TYPE_ND = matmul::MatmulType; - using A_TYPE_NZ = matmul::MatmulType; - using A_TYPE_VECTOR = matmul::MatmulType; + using A_TYPE_ND = MatmulType; + using A_TYPE_NZ = MatmulType; + using A_TYPE_VECTOR = MatmulType; - using A_TYPE_UB_ND = matmul::MatmulType; - using A_TYPE_UB_NZ = matmul::MatmulType; - using A_TYPE_UB_VECTOR = matmul::MatmulType; + using A_TYPE_UB_ND = MatmulType; + using A_TYPE_UB_NZ = MatmulType; + using A_TYPE_UB_VECTOR = MatmulType; - using A_TYPE_INT8_NOTRANS = matmul::MatmulType; - using B_TYPE_INT8_NOTRANS = matmul::MatmulType; - using B_TYPE_INT8_TRANS = matmul::MatmulType; + using A_TYPE_INT8_NOTRANS = MatmulType; + using B_TYPE_INT8_NOTRANS = MatmulType; + using B_TYPE_INT8_TRANS = MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; // Copy_GM_ND_Half_NoTrans MatmulImpl, CustomMatmulPolicy> mm; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp index da6e5170891f4a06cd08b223d866a69fbcb6c549..15babe5d188eb020695cef589f1b5a74b1c59091 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template @@ -82,14 +82,14 @@ private: int32_t cacheProc_ = 0; }; -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class 
CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: using CubeInBufferA = CustomCubeInBuffer, MM_CFG>; using CubeInBufferB = CustomCubeInBuffer, MM_CFG>; - using CopyCubeInA = CopyCubeIn, MM_CFG>; - using CopyCubeInB = CopyCubeIn, MM_CFG>; + using CopyCubeInA = Impl::Detail::CopyCubeIn, MM_CFG>; + using CopyCubeInB = Impl::Detail::CopyCubeIn, MM_CFG>; }; template ::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; using IMPL = MatmulImpl; template @@ -145,14 +135,8 @@ public: template using CopyCubeInParams = typename AscendC::Conditional::type; template - using MatmulVar = typename AscendC::Conditional::type; - template using MatmulTensorInfo = typename AscendC::Conditional::type; - template - using MatmulShapeInfo = - typename AscendC::Conditional::type; - template - using MatmulShapeTiling = typename AscendC::Conditional::type; + template using DataCopyUtils = typename AscendC::Conditional::type; using CallBack = MM_CB; @@ -184,9 +168,9 @@ public: void RunCase(int32_t curRow, int32_t curCol, int32_t tileHeight, int32_t tileWidth, bool isTranspose = false) { MATMUL_MODULE(CopyCubeInA)->Init(); GlobalTensor fakeInput; - MATMUL_MODULE(CopyCubeInA)->SetInput(fakeInput.address_, isTranspose); + MATMUL_MODULE(CopyCubeInA)->SetInput(fakeInput, isTranspose); MATMUL_MODULE(CopyCubeInA)->Reset(); - auto tensor = MATMUL_MODULE(CopyCubeInA)->LoadData(curRow, curCol, tileHeight, tileWidth); + auto tensor = MATMUL_MODULE(CopyCubeInA)->template LoadData(curRow, curCol, tileHeight, tileWidth); MATMUL_MODULE(CopyCubeInA)->ClearLoadData(EMPTY_TENSOR, 0, 0); MATMUL_MODULE(CopyCubeInA)->Destroy(); } @@ -238,16 +222,16 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using A_TYPE_TRANS = matmul::MatmulType; - using A_TYPE_VECTOR = matmul::MatmulType; - using A_TYPE_NZ = matmul::MatmulType; - using A_TYPE_INT8 = matmul::MatmulType; - using A_TYPE_UB = matmul::MatmulType; - using A_TYPE_L1 = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using A_TYPE_TRANS = MatmulType; + using A_TYPE_VECTOR = MatmulType; + using A_TYPE_NZ = MatmulType; + using A_TYPE_INT8 = MatmulType; + using A_TYPE_UB = MatmulType; + using A_TYPE_L1 = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl, CustomMatmulPolicy> mm; MatmulImpl, CustomMatmulPolicy> mm2; @@ -329,4 +313,4 @@ TEST_F(TestCopyCubeInNorm, Copy_ND_From_L1) { mm7.InitVar(tiling); mm7.SetRuntimeParams(32, 64, true); mm7.RunCase(0, 0, 32, 64); -} \ No newline at end of file +} diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp index 41337a4ea07d8d54a465996d8bbd839cc6a5353e..5700500a7fac5ff50c1d95103e545d96bad18f5b 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template @@ -82,14 +82,14 @@ private: int32_t cacheProc_ = 0; }; -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: using CubeInBufferA = CustomCubeInBuffer, MM_CFG>; using CubeInBufferB = CustomCubeInBuffer, MM_CFG>; - using CopyCubeInA = CopyCubeIn, MM_CFG>; - using CopyCubeInB = CopyCubeIn, MM_CFG>; + using 
CopyCubeInA = Impl::Detail::CopyCubeIn, MM_CFG>; + using CopyCubeInB = Impl::Detail::CopyCubeIn, MM_CFG>; }; template ::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; using IMPL = MatmulImpl; template @@ -252,21 +252,21 @@ protected: void TearDown() {} private: - using A_TYPE_ND = matmul::MatmulType; - using A_TYPE_NZ = matmul::MatmulType; - using A_TYPE_VECTOR = matmul::MatmulType; + using A_TYPE_ND = MatmulType; + using A_TYPE_NZ = MatmulType; + using A_TYPE_VECTOR = MatmulType; - using A_TYPE_UB_ND = matmul::MatmulType; - using A_TYPE_UB_NZ = matmul::MatmulType; - using A_TYPE_UB_VECTOR = matmul::MatmulType; + using A_TYPE_UB_ND = MatmulType; + using A_TYPE_UB_NZ = MatmulType; + using A_TYPE_UB_VECTOR = MatmulType; - using A_TYPE_INT8_NOTRANS = matmul::MatmulType; - using B_TYPE_INT8_NOTRANS = matmul::MatmulType; - using B_TYPE_INT8_TRANS = matmul::MatmulType; + using A_TYPE_INT8_NOTRANS = MatmulType; + using B_TYPE_INT8_NOTRANS = MatmulType; + using B_TYPE_INT8_TRANS = MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; // Copy_GM_ND_Half_NoTrans MatmulImpl, CustomMatmulPolicy> mm; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp index 81fd0ab11f862f24860554db215f16385721fc67..3fd79c7af63d1547c7c27040760adb7b7c44896a 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp @@ -16,29 +16,22 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template class MatmulImpl: MATMUL_IMPORT_MODULE_PRIVATE(CopyCubeInParamsB) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarB) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfoA) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfoB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfo) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) { MATMUL_ALLOW_USING_PRIVATE(CopyCubeInParamsB); - MATMUL_ALLOW_USING_PRIVATE(MatmulVarB); - MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfoA); - MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfoB); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfo); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); public: using VAR_PARAMS = - typename MatmulParams::PARAMS; - template - using MatmulVar = typename AscendC::Conditional::type; - template - using MatmulShapeInfo = - typename AscendC::Conditional::type; + typename Impl::Detail::MatmulParams::PARAMS; MatmulImpl() {} @@ -95,10 +88,10 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl mm; }; @@ -116,25 +109,15 @@ TEST_F(TestCopyCubeInParams, all_interface) { EXPECT_EQ(mm.GetStepCol(), 2); EXPECT_EQ(mm.GetStepRow(), 3); EXPECT_EQ(mm.GetBufferPos(), stepKbIdx); - EXPECT_EQ(mm.GetOrgHeightAlign(), 256); - EXPECT_EQ(mm.GetOrgWidthAlign(), 64); - EXPECT_EQ(mm.GetBaseHeightAlign(), 96); - EXPECT_EQ(mm.GetBaseWidthAlign(), 64); - EXPECT_EQ(mm.GetSingleHeightAlign(), 256); - EXPECT_EQ(mm.GetSingleWidthAlign(), 64); - EXPECT_EQ(mm.GetSingleSize(), 48*256); - EXPECT_EQ(mm.GetSingleSizeAlign(), 64*256); - EXPECT_EQ(mm.GetCopyHeight(1), mm.GetVar().tailStepN_); - 
EXPECT_EQ(mm.GetCopyWidth(1, baseUseN), mm.GetVar().tailStepN_); - EXPECT_EQ(mm.GetStaticTileHeight(), 96 * 3); - EXPECT_EQ(mm.template GetStaticTileHeight(), 96); - EXPECT_EQ(mm.GetStaticTileWidth(), 96); - EXPECT_EQ(mm.template GetStaticTileWidth(), 96 * 3); + EXPECT_EQ(mm.IsKRowDirec(), true); + EXPECT_EQ(mm.GetOrgHeight(), 256); + EXPECT_EQ(mm.GetOrgWidth(), 64); + EXPECT_EQ(mm.GetBaseHeight(), 96); + EXPECT_EQ(mm.GetBaseWidth(), 48); + EXPECT_EQ(mm.GetSingleHeight(), 256); + EXPECT_EQ(mm.GetSingleWidth(), 48); + EXPECT_EQ(mm.GetTotalRow(), 3); + EXPECT_EQ(mm.GetTotalCol(), 1); EXPECT_EQ(mm.GetBufferSize(), 96 * 64); - EXPECT_TRUE(mm.IsL1KFullLoad()); - EXPECT_TRUE(mm.IsBufferPosEnd(1)); - EXPECT_FALSE(mm.IsBufferPosEnd()); - EXPECT_TRUE(mm.IsBufferKPosEnd(1)); - EXPECT_FALSE(mm.IsBufferKPosEnd()); - -} \ No newline at end of file + EXPECT_EQ(mm.GetDepth(), 4); +} diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_buffer.cpp index db747bf0f920f115d68f882a78572465aab17a21..b68149e29b7df20826b245e304142b93c8200b0e 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_buffer.cpp @@ -14,24 +14,25 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: - using CubeInBufferA = matmul::CubeInBuffer, MM_CFG>; - using CubeInBufferB = matmul::CubeInBuffer, MM_CFG>; + using CubeInBufferA = Impl::Detail::CubeInBuffer, MM_CFG>; + using CubeInBufferB = Impl::Detail::CubeInBuffer, MM_CFG>; }; template class MatmulImpl : MATMUL_IMPORT_MODULE(CubeInBufferA) -, MATMUL_IMPORT_MODULE_PRIVATE(CubeInBufferParamsA) { +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) +{ MATMUL_ALLOW_USING(CubeInBufferA); - MATMUL_ALLOW_USING_PRIVATE(CubeInBufferParamsA); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); public: using CubeInBufferA::Init; @@ -41,16 +42,12 @@ public: using CubeInBufferA::Hit; using CubeInBufferA::GetBuffer; using CubeInBufferA::Reset; - using CubeInBufferA::GetIterIndex; using CubeInBufferA::EnQue; using CubeInBufferA::DeQue; public: using VAR_PARAMS = - typename MatmulParams::PARAMS; - template - using CubeInBufferParams = - typename AscendC::Conditional::type; + typename Impl::Detail::MatmulParams::PARAMS; MatmulImpl() { InitVar(); } @@ -90,25 +87,22 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl mm; }; -TEST_F(test_cube_in_buffer_double_buffer, get_iter_index) { +TEST_F(test_cube_in_buffer_double_buffer, DISABLED_get_iter_index) { mm.SetInitParams(2, 2, 32, 32); int32_t mIter = 2; int32_t kIter = 3; mm.Init(1024, 4); - ASSERT_EQ(mm.GetIterIndex(0, 1), 1); - ASSERT_EQ(mm.GetIterIndex(1, 1), 1); - ASSERT_EQ(mm.GetIterIndex(1, 2), 0); } -TEST_F(test_cube_in_buffer_double_buffer, tiling_set_single_que) { +TEST_F(test_cube_in_buffer_double_buffer, DISABLED_tiling_set_single_que) { mm.SetInitParams(2, 2, 32, 32); int32_t mIter = 2; int32_t kIter = 2; @@ -120,7 +114,7 @@ TEST_F(test_cube_in_buffer_double_buffer, tiling_set_single_que) { for (int32_t m = 0; m < 
mIter; m=m+2) { for (int32_t n = 0; n < nIter; n++) { for (int32_t k = 0; k < kIter; k++) { - int32_t iterIndex = mm.GetIterIndex(m, k); + int32_t iterIndex = 0; if (mm.Hit(iterIndex, stepMIdx)) { fakeTensor = mm.GetBuffer(iterIndex, stepMIdx); hitCnt++; @@ -139,7 +133,7 @@ TEST_F(test_cube_in_buffer_double_buffer, tiling_set_single_que) { ASSERT_EQ(hitCnt, 2); } -TEST_F(test_cube_in_buffer_double_buffer, tiling_set_db_que) { +TEST_F(test_cube_in_buffer_double_buffer, DISABLED_tiling_set_db_que) { mm.SetInitParams(2, 2, 32, 32); int32_t mIter = 2; int32_t kIter = 2; @@ -151,7 +145,7 @@ TEST_F(test_cube_in_buffer_double_buffer, tiling_set_db_que) { for (int32_t m = 0; m < mIter; m=m+2) { for (int32_t n = 0; n < nIter; n++) { for (int32_t k = 0; k < kIter; k++) { - int32_t iterIndex = mm.GetIterIndex(m, k); + int32_t iterIndex = 0; if (mm.Hit(iterIndex, stepMIdx)) { fakeTensor = mm.GetBuffer(iterIndex, stepMIdx); hitCnt++; @@ -168,4 +162,4 @@ TEST_F(test_cube_in_buffer_double_buffer, tiling_set_db_que) { } mm.Destroy(); ASSERT_EQ(hitCnt, 2); -} \ No newline at end of file +} diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp index c8e7374ca78951aec04ffad7e2e3d8343922ce85..1b3ca1209f25e2d53f43e50c7b41387c3c6ed77e 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp @@ -14,15 +14,15 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: - using CubeInBufferA = matmul::CubeInBuffer, MM_CFG>; - using CubeInBufferB = matmul::CubeInBuffer, MM_CFG>; + using CubeInBufferA = Impl::Detail::CubeInBuffer, MM_CFG>; + using CubeInBufferB = Impl::Detail::CubeInBuffer, MM_CFG>; }; constexpr MatmulConfig CFG_IBSHARE_NORM_DB = GetIBShareNormConfig(false, false, false, BatchMode::NONE, true); @@ -30,9 +30,10 @@ template class MatmulImpl : MATMUL_IMPORT_MODULE(CubeInBufferA) -, MATMUL_IMPORT_MODULE_PRIVATE(CubeInBufferParamsA) { +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) +{ MATMUL_ALLOW_USING(CubeInBufferA); - MATMUL_ALLOW_USING_PRIVATE(CubeInBufferParamsA); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); public: using CubeInBufferA::Init; @@ -42,17 +43,13 @@ public: using CubeInBufferA::Hit; using CubeInBufferA::GetBuffer; using CubeInBufferA::Reset; - using CubeInBufferA::GetIterIndex; using CubeInBufferA::EnQue; using CubeInBufferA::DeQue; - using CubeInBufferA::SetOrgAddr; + using CubeInBufferA::SetOrgTensor; public: using VAR_PARAMS = - typename MatmulParams::PARAMS; - template - using CubeInBufferParams = - typename AscendC::Conditional::type; + typename Impl::Detail::MatmulParams::PARAMS; MatmulImpl() { InitVar(); } @@ -93,26 +90,23 @@ protected: void TearDown() {} private: - using A_TYPE_IBSHARE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE_IBSHARE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl mm; - GlobalCache gCache; + Impl::Detail::GlobalCache gCache; }; -TEST_F(test_cube_in_buffer_double_global_buffer, get_iter_index) { +TEST_F(test_cube_in_buffer_double_global_buffer, DISABLED_get_iter_index) { int32_t 
mIter = 2; int32_t kIter = 3; mm.SetInitParams(2, 2, 32, 32, kIter); mm.Init(1024, 4); - ASSERT_EQ(mm.GetIterIndex(0, 1), 1); - ASSERT_EQ(mm.GetIterIndex(1, 1), 1); - ASSERT_EQ(mm.GetIterIndex(1, 2), 2); } -TEST_F(test_cube_in_buffer_double_global_buffer, all_interface_normal) +TEST_F(test_cube_in_buffer_double_global_buffer, DISABLED_all_interface_normal) { int32_t mIter = 2; int32_t kIter = 2; @@ -121,12 +115,12 @@ TEST_F(test_cube_in_buffer_double_global_buffer, all_interface_normal) mm.SetInitParams(2, 2, 32, 32, kIter); mm.Init(1024, 4); GlobalTensor fakeInput; - mm.SetOrgAddr(fakeInput.address_); + mm.SetOrgTensor(fakeInput); LocalTensor fakeTensor; for (int32_t m = 0; m < mIter; m++) { for (int32_t n = 0; n < nIter; n++) { for (int32_t k = 0; k < kIter; k++) { - int32_t iterIndex = mm.GetIterIndex(m, n); + int32_t iterIndex = 0; if (mm.Hit(iterIndex)) { fakeTensor = mm.GetBuffer(iterIndex); hitCnt++; @@ -141,4 +135,4 @@ TEST_F(test_cube_in_buffer_double_global_buffer, all_interface_normal) } mm.Destroy(); ASSERT_EQ(hitCnt, 0); -} \ No newline at end of file +} diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp index 96a2fa6a25045d9ef72159c86bcc8f022531367f..96f5616d54494db2792b83dc8e7d98a31b288a15 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp @@ -14,15 +14,15 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: - using CubeInBufferA = matmul::CubeInBuffer, MM_CFG>; - using CubeInBufferB = matmul::CubeInBuffer, MM_CFG>; + using CubeInBufferA = Impl::Detail::CubeInBuffer, MM_CFG>; + using CubeInBufferB = Impl::Detail::CubeInBuffer, MM_CFG>; }; constexpr MatmulConfig CFG_SPECIAL_BASIC = GetSpecialBasicConfig(32, 32, 32, 32, 32, 32, 2, 2); @@ -48,7 +48,7 @@ public: public: using VAR_PARAMS = - typename MatmulParams::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; template using CubeInBufferParams = @@ -93,12 +93,12 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using A_TYPE_IBSHARE = matmul::MatmulType; - using A_TYPE_BMM = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using A_TYPE_IBSHARE = MatmulType; + using A_TYPE_BMM = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl mm; MatmulImpl mm2; diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp index 0e8b905c1a5af02c018fad8ad85e8dcf96de7a2f..0c625097787059f684183282e7ce3e3e9aadbfdb 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp @@ -14,23 +14,19 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template -class MatmulImpl: MATMUL_IMPORT_MODULE_PRIVATE(CubeInBufferParamsB) { - MATMUL_ALLOW_USING_PRIVATE(CubeInBufferParamsB); -public: - using CubeInBufferParamsB::GetMajorCacheNum; - using CubeInBufferParamsB::GetTotalCacheNum; - using CubeInBufferParamsB::GetCurKPos; - using CubeInBufferParamsB::GetCurMNPos; - using 
CubeInBufferParamsB::IsTailBlock; +class MatmulImpl +: MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) +{ + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); public: using VAR_PARAMS = - typename MatmulParams::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; MatmulImpl() { InitVar(); @@ -71,22 +67,10 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl mm; }; - -TEST_F(test_cube_in_buffer_params, all_interface) { - mm.SetInitParams(2, 2, 32, 32); - ASSERT_EQ(mm.GetMajorCacheNum(), 2); - ASSERT_EQ(mm.GetTotalCacheNum(), 4); - ASSERT_EQ(mm.GetCurKPos(0, 3), 0); - ASSERT_EQ(mm.GetCurMNPos(0, 3), 3); - mm.SetRuntimeParams(32, 32); - ASSERT_FALSE(mm.IsTailBlock()); - mm.SetRuntimeParams(32, 16); - ASSERT_TRUE(mm.IsTailBlock()); -} \ No newline at end of file diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_buffer.cpp index d8efd9c26a36539ad12cfbc7a29a8fe175f8184a..fa117425823eee798c2ddf4d1f9d3665b09345b8 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_buffer.cpp @@ -1,139 +1,134 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. - * - * @brief load data instruction ut for ascend910B1 - * - */ -#include -#include "kernel_operator.h" -#include "lib/matmul/tiling.h" -#include "impl/matmul/modules/matmul_param.h" -#include "impl/matmul/modules/matmul_policy.h" -#include "impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer.h" -#include "impl/matmul/modules/matmul_private_modules.h" - -using namespace std; -using namespace AscendC; -using namespace matmul; - -namespace { -template -class CustomMatmulPolicy : public matmul::MatmulPolicy -{ -public: - using CubeInBufferA = matmul::CubeInBuffer, MM_CFG>; - using CubeInBufferB = matmul::CubeInBuffer, MM_CFG>; -}; - -template -class MatmulImpl -: MATMUL_IMPORT_MODULE(CubeInBufferB) -, MATMUL_IMPORT_MODULE_PRIVATE(CubeInBufferParamsB) { - MATMUL_ALLOW_USING(CubeInBufferB); - MATMUL_ALLOW_USING_PRIVATE(CubeInBufferParamsB); - -public: - using CubeInBufferB::Init; - using CubeInBufferB::Destroy; - using CubeInBufferB::AllocTensor; - using CubeInBufferB::FreeTensor; - using CubeInBufferB::Hit; - using CubeInBufferB::GetBuffer; - using CubeInBufferB::Reset; - using CubeInBufferB::GetIterIndex; - using CubeInBufferB::EnQue; - using CubeInBufferB::DeQue; - -public: - using VAR_PARAMS = - typename MatmulParams::PARAMS; - template - using CubeInBufferParams = - typename AscendC::Conditional::type; - MatmulImpl() { - InitVar(); - } - - VAR_PARAMS& GetVar() { - return var; - } - - void InitVar() { - var.tiling_.SetTiling(&tiling); - var.tpipe_ = &pipe; - } - - void SetInitParams(int32_t stepN, int32_t stepKb, int32_t baseN, int32_t baseK) { - tiling.stepN = stepN; - tiling.stepKb = stepKb; - tiling.baseN = baseN; - tiling.baseK = baseK; - tiling.iterateOrder = 0; - } - - void SetRuntimeParams(int32_t baseUseN, int32_t baseUseK) { - var.baseUseN_ = baseUseN; - var.baseUseK_ = baseUseK; - } - -private: - TCubeTiling tiling; - TPipe pipe; - VAR_PARAMS var; -}; -} - -constexpr MatmulConfig MM_CFG_CUSTOM { true, false, false, 0, 0, 0, false, false, false, false, 0, 0, 0, 0, 0, 0, 0, 0, - 
false, false, false, false, false, true, BatchMode::NONE, true, true, true, true, true, true, true, - IterateMode::ITERATE_MODE_DEFAULT, false, true, false, true, IterateOrder::UNDEF, ScheduleType::INNER_PRODUCT, - false, true}; -class test_cube_in_buffer_single_buffer : public testing::Test { -protected: - void SetUp() {} - void TearDown() {} - -private: - using A_TYPE_BMM = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; - - MatmulImpl mm; -}; - -TEST_F(test_cube_in_buffer_single_buffer, get_iter_index) { - mm.SetInitParams(2, 2, 32, 32); - int32_t mIter = 2; - int32_t kIter = 3; - mm.Init(1024, 4); - ASSERT_EQ(mm.GetIterIndex(0, 1), 0); - ASSERT_EQ(mm.GetIterIndex(1, 1), 0); - ASSERT_EQ(mm.GetIterIndex(1, 2), 0); -} - -TEST_F(test_cube_in_buffer_single_buffer, all_interface_normal) { - mm.SetInitParams(2, 2, 32, 32); - int32_t mIter = 2; - int32_t kIter = 2; - int32_t hitCnt = 0; - mm.Init(1024, 4); - LocalTensor fakeTensor; - for (int32_t m = 0; m < mIter; m++) { - for (int32_t k = 0; k < kIter; k++) { - int32_t iterIndex = mm.GetIterIndex(m, k); - if (mm.Hit(iterIndex)) { - fakeTensor = mm.GetBuffer(iterIndex); - hitCnt++; - } else { - fakeTensor = mm.AllocTensor(iterIndex); - mm.EnQue(fakeTensor); - mm.DeQue(); - } - mm.FreeTensor(iterIndex, fakeTensor); - } - mm.Reset(); - } - mm.Destroy(); - ASSERT_EQ(hitCnt, 0); -} \ No newline at end of file +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/matmul_policy.h" +#include "impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer.h" +#include "impl/matmul/modules/matmul_private_modules.h" + +using namespace std; +using namespace AscendC; +using namespace Gemm; + +namespace { +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy +{ +public: + using CubeInBufferA = Impl::Detail::CubeInBuffer, MM_CFG>; + using CubeInBufferB = Impl::Detail::CubeInBuffer, MM_CFG>; +}; + +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(CubeInBufferB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) +{ + MATMUL_ALLOW_USING(CubeInBufferB); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); + +public: + using CubeInBufferB::Init; + using CubeInBufferB::Destroy; + using CubeInBufferB::AllocTensor; + using CubeInBufferB::FreeTensor; + using CubeInBufferB::Hit; + using CubeInBufferB::GetBuffer; + using CubeInBufferB::Reset; + using CubeInBufferB::EnQue; + using CubeInBufferB::DeQue; + +public: + using VAR_PARAMS = + typename Impl::Detail::MatmulParams::PARAMS; + + MatmulImpl() { + InitVar(); + } + + VAR_PARAMS& GetVar() { + return var; + } + + void InitVar() { + var.tiling_.SetTiling(&tiling); + var.tpipe_ = &pipe; + } + + void SetInitParams(int32_t stepN, int32_t stepKb, int32_t baseN, int32_t baseK) { + tiling.stepN = stepN; + tiling.stepKb = stepKb; + tiling.baseN = baseN; + tiling.baseK = baseK; + tiling.iterateOrder = 0; + } + + void SetRuntimeParams(int32_t baseUseN, int32_t baseUseK) { + var.baseUseN_ = baseUseN; + var.baseUseK_ = baseUseK; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +constexpr MatmulConfig MM_CFG_CUSTOM { true, false, false, 0, 0, 0, false, false, false, false, 0, 0, 0, 0, 0, 0, 0, 0, + false, false, false, false, false, true, BatchMode::NONE, true, true, 
true, true, true, true, true, + IterateMode::ITERATE_MODE_DEFAULT, false, true, false, true, IterateOrder::UNDEF, ScheduleType::INNER_PRODUCT, + false, true}; +class test_cube_in_buffer_single_buffer : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + using A_TYPE_BMM = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; + + MatmulImpl mm; +}; + +TEST_F(test_cube_in_buffer_single_buffer, DISABLED_get_iter_index) { + mm.SetInitParams(2, 2, 32, 32); + int32_t mIter = 2; + int32_t kIter = 3; + mm.Init(1024, 4); +} + +TEST_F(test_cube_in_buffer_single_buffer, DISABLED_all_interface_normal) { + mm.SetInitParams(2, 2, 32, 32); + int32_t mIter = 2; + int32_t kIter = 2; + int32_t hitCnt = 0; + mm.Init(1024, 4); + LocalTensor fakeTensor; + for (int32_t m = 0; m < mIter; m++) { + for (int32_t k = 0; k < kIter; k++) { + int32_t iterIndex = 0; + if (mm.Hit(iterIndex)) { + fakeTensor = mm.GetBuffer(iterIndex); + hitCnt++; + } else { + fakeTensor = mm.AllocTensor(iterIndex); + mm.EnQue(fakeTensor); + mm.DeQue(); + } + mm.FreeTensor(iterIndex, fakeTensor); + } + mm.Reset(); + } + mm.Destroy(); + ASSERT_EQ(hitCnt, 0); +} diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp index e59d0447d58e2e18f0c00e74963f985d4379b8f8..7af390d520e16d761b304fbf64fd6b20ce2c05f4 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp @@ -1,207 +1,202 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. - * - * @brief load data instruction ut for ascend910B1 - * - */ -#include -#include "kernel_operator.h" -#include "lib/matmul/tiling.h" -#include "impl/matmul/modules/matmul_param.h" -#include "impl/matmul/modules/matmul_policy.h" -#include "impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer.h" -#include "impl/matmul/modules/matmul_private_modules.h" - -using namespace std; -using namespace AscendC; -using namespace matmul; - -namespace { -template -class CustomMatmulPolicy : public matmul::MatmulPolicy -{ -public: - using CubeInBufferA = matmul::CubeInBuffer, MM_CFG>; - using CubeInBufferB = matmul::CubeInBuffer, MM_CFG>; -}; - -template -class MatmulImpl -: MATMUL_IMPORT_MODULE(CubeInBufferA) -, MATMUL_IMPORT_MODULE_PRIVATE(CubeInBufferParamsA) { - MATMUL_ALLOW_USING(CubeInBufferA); - MATMUL_ALLOW_USING_PRIVATE(CubeInBufferParamsA); - -public: - using CubeInBufferA::Init; - using CubeInBufferA::Destroy; - using CubeInBufferA::AllocTensor; - using CubeInBufferA::FreeTensor; - using CubeInBufferA::Hit; - using CubeInBufferA::GetBuffer; - using CubeInBufferA::Reset; - using CubeInBufferA::GetIterIndex; - using CubeInBufferA::EnQue; - using CubeInBufferA::DeQue; - using CubeInBufferA::SetOrgAddr; - -public: - using VAR_PARAMS = - typename MatmulParams::PARAMS; - template - using CubeInBufferParams = - typename AscendC::Conditional::type; - MatmulImpl() { - InitVar(); - } - - VAR_PARAMS& GetVar() { - return var; - } - - void InitVar() { - var.tiling_.SetTiling(&tiling); - var.tpipe_ = &pipe; - } - - void SetInitParams(int32_t stepM, int32_t stepKa, int32_t baseM, int32_t baseK, int32_t kIter) { - tiling.stepM = stepM; - tiling.stepKa = stepKa; - tiling.baseM = baseM; - tiling.baseK = baseK; - var.kIter_ = kIter; - tiling.iterateOrder = 0; - } - - void SetRuntimeParams(int32_t baseUseM, 
int32_t baseUseK) { - var.baseUseM_ = baseUseM; - var.baseUseK_ = baseUseK; - } - -private: - TCubeTiling tiling; - TPipe pipe; - VAR_PARAMS var; -}; -} - -class test_cube_in_buffer_single_global_buffer : public testing::Test { -protected: - void SetUp() {} - void TearDown() {} - -private: - using A_TYPE_IBSHARE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; - - MatmulImpl mm; - GlobalCache gCache; -}; - -TEST_F(test_cube_in_buffer_single_global_buffer, cube_in_buffer_global_get_iter_index) { - gCache.Init(); - int32_t mIter = 2; - int32_t kIter = 3; - mm.SetInitParams(2, 2, 32, 32, kIter); - mm.Init(1024, 4); - ASSERT_EQ(mm.GetIterIndex(0, 1), 1); - ASSERT_EQ(mm.GetIterIndex(1, 1), 1); - ASSERT_EQ(mm.GetIterIndex(1, 2), 2); -} - -TEST_F(test_cube_in_buffer_single_global_buffer, all_interface_hit_none) { - gCache.Init(); - GlobalTensor fakeInput; - int32_t mIter = 2; - int32_t kIter = 2; - int32_t nIter = 2; - int32_t hitCnt = 0; - mm.SetInitParams(2, 2, 32, 32, kIter); - mm.Init(1024, 4); - mm.SetOrgAddr(fakeInput.address_); - LocalTensor fakeTensor; - for (int32_t m = 0; m < mIter; m++) { - for (int32_t n = 0; n < nIter; n++) { - for (int32_t k = 0; k < kIter; k++) { - int32_t iterIndex = mm.GetIterIndex(m, k); - if (mm.Hit(iterIndex)) { - fakeTensor = mm.GetBuffer(iterIndex); - hitCnt++; - } else { - fakeTensor = mm.AllocTensor(iterIndex); - mm.EnQue(fakeTensor); - mm.DeQue(); - } - } - } - mm.Reset(); - } - mm.Destroy(); - ASSERT_EQ(hitCnt, 0); -} - -TEST_F(test_cube_in_buffer_single_global_buffer, all_interface_normal_hit) { - gCache.Init(); - GlobalTensor fakeInput; - int32_t mIter = 2; - int32_t kIter = 2; - int32_t nIter = 2; - int32_t hitCnt = 0; - mm.SetInitParams(2, 2, 32, 32, kIter); - mm.Init(1024, 4); - mm.SetOrgAddr(fakeInput.address_); - mm.SetRuntimeParams(32, 32); - LocalTensor fakeTensor; - for (int32_t m = 0; m < mIter; m++) { - for (int32_t n = 0; n < nIter; n++) { - for (int32_t k = 0; k < kIter; k++) { - int32_t iterIndex = mm.GetIterIndex(m, k); - if (mm.Hit(iterIndex)) { - fakeTensor = mm.GetBuffer(iterIndex); - hitCnt++; - } else { - fakeTensor = mm.AllocTensor(iterIndex); - mm.EnQue(fakeTensor); - mm.DeQue(); - } - } - } - mm.Reset(); - } - mm.Destroy(); - ASSERT_EQ(hitCnt, 6); -} - -TEST_F(test_cube_in_buffer_single_global_buffer, all_interface_reduce_cache) { - gCache.Init(); - int32_t mIter = 2; - int32_t kIter = 2; - int32_t nIter = 2; - int32_t hitCnt = 0; - mm.SetInitParams(2, 2, 32, 32, kIter); - mm.Init(1024, 4); - GlobalTensor fakeInput; - mm.SetOrgAddr(fakeInput.address_); - LocalTensor fakeTensor; - for (int32_t m = 0; m < mIter; m++) { - for (int32_t n = 0; n < nIter; n++) { - for (int32_t k = 0; k < kIter; k++) { - int32_t iterIndex = mm.GetIterIndex(m, n); - if (mm.Hit(iterIndex)) { - fakeTensor = mm.GetBuffer(iterIndex); - hitCnt++; - } else { - fakeTensor = mm.AllocTensor(iterIndex); - mm.EnQue(fakeTensor); - mm.DeQue(); - } - } - } - mm.Reset(); - } - mm.Destroy(); - ASSERT_EQ(hitCnt, 6); -} \ No newline at end of file +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
+ * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/matmul_policy.h" +#include "impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer.h" +#include "impl/matmul/modules/matmul_private_modules.h" + +using namespace std; +using namespace AscendC; +using namespace Gemm; + +namespace { +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy +{ +public: + using CubeInBufferA = Impl::Detail::CubeInBuffer, MM_CFG>; + using CubeInBufferB = Impl::Detail::CubeInBuffer, MM_CFG>; +}; + +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(CubeInBufferA) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) +{ + MATMUL_ALLOW_USING(CubeInBufferA); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); + +public: + using CubeInBufferA::Init; + using CubeInBufferA::Destroy; + using CubeInBufferA::AllocTensor; + using CubeInBufferA::FreeTensor; + using CubeInBufferA::Hit; + using CubeInBufferA::GetBuffer; + using CubeInBufferA::Reset; + using CubeInBufferA::EnQue; + using CubeInBufferA::DeQue; + using CubeInBufferA::SetOrgTensor; + +public: + using VAR_PARAMS = + typename Impl::Detail::MatmulParams::PARAMS; + + MatmulImpl() { + InitVar(); + } + + VAR_PARAMS& GetVar() { + return var; + } + + void InitVar() { + var.tiling_.SetTiling(&tiling); + var.tpipe_ = &pipe; + } + + void SetInitParams(int32_t stepM, int32_t stepKa, int32_t baseM, int32_t baseK, int32_t kIter) { + tiling.stepM = stepM; + tiling.stepKa = stepKa; + tiling.baseM = baseM; + tiling.baseK = baseK; + var.kIter_ = kIter; + tiling.iterateOrder = 0; + } + + void SetRuntimeParams(int32_t baseUseM, int32_t baseUseK) { + var.baseUseM_ = baseUseM; + var.baseUseK_ = baseUseK; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +class test_cube_in_buffer_single_global_buffer : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + using A_TYPE_IBSHARE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; + + MatmulImpl mm; + Impl::Detail::GlobalCache gCache; +}; + +TEST_F(test_cube_in_buffer_single_global_buffer, DISABLED_cube_in_buffer_global_get_iter_index) { + gCache.Init(); + int32_t mIter = 2; + int32_t kIter = 3; + mm.SetInitParams(2, 2, 32, 32, kIter); + mm.Init(1024, 4); +} + +TEST_F(test_cube_in_buffer_single_global_buffer, DISABLED_all_interface_hit_none) { + gCache.Init(); + GlobalTensor fakeInput; + int32_t mIter = 2; + int32_t kIter = 2; + int32_t nIter = 2; + int32_t hitCnt = 0; + mm.SetInitParams(2, 2, 32, 32, kIter); + mm.Init(1024, 4); + mm.SetOrgTensor(fakeInput); + LocalTensor fakeTensor; + for (int32_t m = 0; m < mIter; m++) { + for (int32_t n = 0; n < nIter; n++) { + for (int32_t k = 0; k < kIter; k++) { + int32_t iterIndex = 0; + if (mm.Hit(iterIndex)) { + fakeTensor = mm.GetBuffer(iterIndex); + hitCnt++; + } else { + fakeTensor = mm.AllocTensor(iterIndex); + mm.EnQue(fakeTensor); + mm.DeQue(); + } + } + } + mm.Reset(); + } + mm.Destroy(); + ASSERT_EQ(hitCnt, 0); +} + +TEST_F(test_cube_in_buffer_single_global_buffer, DISABLED_all_interface_normal_hit) { + gCache.Init(); + GlobalTensor fakeInput; + int32_t mIter = 2; + int32_t kIter = 2; + int32_t nIter = 2; + int32_t hitCnt = 0; + mm.SetInitParams(2, 2, 32, 32, kIter); + mm.Init(1024, 4); + mm.SetOrgTensor(fakeInput); + mm.SetRuntimeParams(32, 32); + LocalTensor fakeTensor; + for (int32_t m = 0; m < 
mIter; m++) { + for (int32_t n = 0; n < nIter; n++) { + for (int32_t k = 0; k < kIter; k++) { + int32_t iterIndex = 0; + if (mm.Hit(iterIndex)) { + fakeTensor = mm.GetBuffer(iterIndex); + hitCnt++; + } else { + fakeTensor = mm.AllocTensor(iterIndex); + mm.EnQue(fakeTensor); + mm.DeQue(); + } + } + } + mm.Reset(); + } + mm.Destroy(); + ASSERT_EQ(hitCnt, 6); +} + +TEST_F(test_cube_in_buffer_single_global_buffer, DISABLED_all_interface_reduce_cache) { + gCache.Init(); + int32_t mIter = 2; + int32_t kIter = 2; + int32_t nIter = 2; + int32_t hitCnt = 0; + mm.SetInitParams(2, 2, 32, 32, kIter); + mm.Init(1024, 4); + GlobalTensor fakeInput; + mm.SetOrgTensor(fakeInput); + LocalTensor fakeTensor; + for (int32_t m = 0; m < mIter; m++) { + for (int32_t n = 0; n < nIter; n++) { + for (int32_t k = 0; k < kIter; k++) { + int32_t iterIndex = 0; + if (mm.Hit(iterIndex)) { + fakeTensor = mm.GetBuffer(iterIndex); + hitCnt++; + } else { + fakeTensor = mm.AllocTensor(iterIndex); + mm.EnQue(fakeTensor); + mm.DeQue(); + } + } + } + mm.Reset(); + } + mm.Destroy(); + ASSERT_EQ(hitCnt, 6); +} diff --git a/tests/matmul/test_matmul_config.cpp b/tests/matmul/test_matmul_config.cpp index 021e210983962055bdb03f8d0dfb5253105f96c8..8b1990fc4eaf32b14c56344792d57100fb9cb69d 100644 --- a/tests/matmul/test_matmul_config.cpp +++ b/tests/matmul/test_matmul_config.cpp @@ -1,54 +1,54 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. - * - * @brief load data instruction ut for ascend910B1 - * - */ -#include -#include "kernel_operator.h" -#include "lib/matmul/tiling.h" -#include "impl/matmul/modules/matmul_param.h" - -using namespace std; -using namespace AscendC; -using namespace matmul; - -class TestMatmulConfig : public testing::Test { -protected: - static void SetUpTestCase() {} - static void TearDownTestCase() {} - virtual void SetUp() {} - void TearDown() {} -}; - -TEST_F(TestMatmulConfig, TestParamsConfig) -{ - constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; - constexpr static MatmulShapeParams shapeParams{128, 128, 128, 64, 64, 64}; - constexpr static MatmulQuantParams quantParams{1, 1}; - constexpr static MatmulBatchParams batchParams{1, BatchMode::BATCH_LARGE_THAN_L1}; - constexpr static MatmulFuncParams funcParams{1, 1, 1, 1, 1, IterateOrder::ORDER_N, ScheduleType::OUTER_PRODUCT, - 1, 1}; - constexpr MatmulConfig mmConfig = GetMMConfig(shapeParams, quantParams, batchParams, funcParams); - - EXPECT_EQ(mmConfig.doNorm, true); - EXPECT_EQ(mmConfig.singleCoreM, 128); - EXPECT_EQ(mmConfig.singleCoreN, 128); - EXPECT_EQ(mmConfig.singleCoreK, 128); - EXPECT_EQ(mmConfig.basicM, 64); - EXPECT_EQ(mmConfig.basicN, 64); - EXPECT_EQ(mmConfig.basicK, 64); - EXPECT_EQ(mmConfig.isPerTensor, true); - EXPECT_EQ(mmConfig.hasAntiQuantOffset, true); - EXPECT_EQ(mmConfig.isNBatch, true); - EXPECT_EQ(mmConfig.batchMode, BatchMode::BATCH_LARGE_THAN_L1); - EXPECT_EQ(mmConfig.intrinsicsCheck, true); - EXPECT_EQ(mmConfig.enVecND2NZ, true); - EXPECT_EQ(mmConfig.enableDoubleCache, true); - EXPECT_EQ(mmConfig.enableL1CacheUB, true); - EXPECT_EQ(mmConfig.doMTE2Preload, 1); - EXPECT_EQ(mmConfig.iterateOrder, IterateOrder::ORDER_N); - EXPECT_EQ(mmConfig.scheduleType, ScheduleType::OUTER_PRODUCT); - EXPECT_EQ(mmConfig.enableReuse, true); - EXPECT_EQ(mmConfig.enableUBReuse, true); +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
+ * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" + +using namespace std; +using namespace AscendC; +using namespace Gemm; + +class TestMatmulConfig : public testing::Test { +protected: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() {} + void TearDown() {} +}; + +TEST_F(TestMatmulConfig, TestParamsConfig) +{ + constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; + constexpr static MatmulShapeParams shapeParams{128, 128, 128, 64, 64, 64}; + constexpr static MatmulQuantParams quantParams{1, 1}; + constexpr static MatmulBatchParams batchParams{1, BatchMode::BATCH_LARGE_THAN_L1}; + constexpr static MatmulFuncParams funcParams{1, 1, 1, 1, 1, IterateOrder::ORDER_N, ScheduleType::OUTER_PRODUCT, + 1, 1}; + constexpr MatmulConfig mmConfig = GetMMConfig(shapeParams, quantParams, batchParams, funcParams); + + EXPECT_EQ(mmConfig.doNorm, true); + EXPECT_EQ(mmConfig.singleCoreM, 128); + EXPECT_EQ(mmConfig.singleCoreN, 128); + EXPECT_EQ(mmConfig.singleCoreK, 128); + EXPECT_EQ(mmConfig.basicM, 64); + EXPECT_EQ(mmConfig.basicN, 64); + EXPECT_EQ(mmConfig.basicK, 64); + EXPECT_EQ(mmConfig.isPerTensor, true); + EXPECT_EQ(mmConfig.hasAntiQuantOffset, true); + EXPECT_EQ(mmConfig.isNBatch, true); + EXPECT_EQ(mmConfig.batchMode, BatchMode::BATCH_LARGE_THAN_L1); + EXPECT_EQ(mmConfig.intrinsicsCheck, true); + EXPECT_EQ(mmConfig.enVecND2NZ, true); + EXPECT_EQ(mmConfig.enableDoubleCache, true); + EXPECT_EQ(mmConfig.enableL1CacheUB, true); + EXPECT_EQ(mmConfig.doMTE2Preload, 1); + EXPECT_EQ(mmConfig.iterateOrder, IterateOrder::ORDER_N); + EXPECT_EQ(mmConfig.scheduleType, ScheduleType::OUTER_PRODUCT); + EXPECT_EQ(mmConfig.enableReuse, true); + EXPECT_EQ(mmConfig.enableUBReuse, true); } \ No newline at end of file diff --git a/tests/matmul/test_matmul_iterate_controller.cpp b/tests/matmul/test_matmul_iterate_controller.cpp index 21e10a48ed2a9e0d08e4ba18986b550410ac5cb5..e02bb33c3ffac06fa905da7472143afc8777765c 100644 --- a/tests/matmul/test_matmul_iterate_controller.cpp +++ b/tests/matmul/test_matmul_iterate_controller.cpp @@ -12,12 +12,12 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; -using A_TYPE = matmul::MatmulType; -using B_TYPE = matmul::MatmulType; -using C_TYPE = matmul::MatmulType; -using BIAS_TYPE = matmul::MatmulType; +using A_TYPE = MatmulType; +using B_TYPE = MatmulType; +using C_TYPE = MatmulType; +using BIAS_TYPE = MatmulType; template class CustomCopyCubeIn { @@ -31,8 +31,8 @@ public: }; namespace { -template -class CustomMatmulPolicy : public matmul::MatmulPolicy +template +class CustomMatmulPolicy : public Impl::Detail::MatmulPolicy { public: using CopyCubeInA = CustomCopyCubeIn, MM_CFG>; @@ -45,7 +45,7 @@ class MatmulImpl , MATMUL_IMPORT_MODULE(CopyCubeInA) , MATMUL_IMPORT_MODULE(CopyCubeInB) { using VAR_PARAMS = - typename MatmulParams::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; MATMUL_ALLOW_USING_PRIVATE(IterateController); MATMUL_ALLOW_USING(CopyCubeInA); diff --git a/tests/matmul/test_matmul_l0c_buffer.cpp b/tests/matmul/test_matmul_l0c_buffer.cpp index 3c9852d1aef70a621d5e46ef3cc64d80b7abbc8f..3749c6d3f5df1b652c80e07c8b8d5cf4c1a5730a 100644 --- a/tests/matmul/test_matmul_l0c_buffer.cpp +++ b/tests/matmul/test_matmul_l0c_buffer.cpp @@ -13,7 +13,7 @@ using namespace std; using namespace AscendC; -using namespace matmul; +using namespace 
Gemm; namespace { @@ -23,7 +23,7 @@ class MatmulImpl : MATMUL_IMPORT_MODULE(CubeOutBuffer) { MATMUL_ALLOW_USING(CubeOutBuffer); using VAR_PARAMS = - typename MatmulParams::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; public: using CubeOutBuffer::Destroy; @@ -64,10 +64,10 @@ protected: private: using L0cT = float; - using A_TYPE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl enUnitFlagMM; MatmulImpl disUnitFlagMM; diff --git a/tests/matmul/test_matmul_shape_info.cpp b/tests/matmul/test_matmul_shape_info.cpp index f30a4c791506ce7ec1ea55864873f6115f5eeb02..7aa4e496f9f9a5c075555e0bd71a1621d05ac4bd 100644 --- a/tests/matmul/test_matmul_shape_info.cpp +++ b/tests/matmul/test_matmul_shape_info.cpp @@ -11,29 +11,28 @@ #include "impl/matmul/modules/matmul_policy.h" #include "impl/matmul/modules/matmul_private_modules.h" #define private public -#include "impl/matmul/modules/matmul_var.h" -#include "impl/matmul/modules/matmul_tensor_info.h" -#include "impl/matmul/modules/matmul_shape_tiling.h" +#include "impl/matmul/modules/param/matmul_tensor_info.h" +#include "impl/matmul/modules/param/matmul_shape_tiling.h" using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template class MatmulImpl -: MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfoB) +: MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfo) , MATMUL_IMPORT_MODULE_PRIVATE(MatmulTensorInfoB) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTilingB) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) { - MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfoB); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfo); MATMUL_ALLOW_USING_PRIVATE(MatmulTensorInfoB); - MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTilingB); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); using SrcT = typename A_TYPE::T; public: using VAR_PARAMS = - typename MatmulParams::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; MatmulImpl() { InitVar(); } @@ -68,7 +67,11 @@ public: N_ = singleN; Ka_ = singleK; Kb_ = singleK; + Kc_ = singleN; } + var.mIter_ = singleM / 16; + var.nIter_ = singleN / 16; + var.kIter_ = singleK / 16; } void SetInitAParams(int32_t stepM, int32_t stepKa, int32_t baseM, int32_t baseK, int32_t depth, @@ -157,6 +160,7 @@ private: int32_t N_; int32_t Ka_; int32_t Kb_; + int32_t Kc_; int32_t batchA_; int32_t batchB_; @@ -198,10 +202,10 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl mm; }; @@ -209,88 +213,17 @@ private: TEST_F(TestMatmulShapeInfo, test_get_singleCore_params) { mm.SetSingleCoreParams(32, 32, 16); - EXPECT_EQ(mm.GetOrgHeight(), 16); - EXPECT_EQ(mm.GetOrgWidth(), 32); - EXPECT_EQ(mm.GetSingleHeight(), 16); - EXPECT_EQ(mm.GetSingleWidth(), 32); + EXPECT_EQ(mm.GetSingleCoreM(), 32); + EXPECT_EQ(mm.GetSingleCoreN(), 32); - EXPECT_EQ(mm.template GetOrgHeight(), 32); - EXPECT_EQ(mm.template GetOrgWidth(), 16); - EXPECT_EQ(mm.template GetSingleHeight(), 32); - EXPECT_EQ(mm.template GetSingleWidth(), 16); -} - -TEST_F(TestMatmulShapeInfo, test_get_intrablock_singleCore_params) -{ - mm.SetSingleCoreParams(32, 32, 16, true); - auto res = 
mm.template GetOrgHeight(); - EXPECT_EQ(res, 16); - res = mm.template GetOrgWidth(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 32); - - res = mm.template GetOrgHeight(); - EXPECT_EQ(res, 32); - res = mm.template GetOrgWidth(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 16); -} - -TEST_F(TestMatmulShapeInfo, test_get_base_params) -{ - int32_t baseK = 16; - int32_t baseN = 32; - mm.SetInitBParams(2, 2, baseN, baseK, 1); - EXPECT_EQ(mm.GetBaseHeight(), baseK); - EXPECT_EQ(mm.GetBaseWidth(), baseN); - EXPECT_EQ(mm.GetBaseUseWidth(), baseN); - EXPECT_EQ(mm.GetBaseUseHeight(), baseK); - EXPECT_EQ(mm.GetBaseUseStepK(), 32); + EXPECT_EQ(mm.GetKIter(), 1); + EXPECT_EQ(mm.GetMIter(), 2); + EXPECT_EQ(mm.GetNIter(), 2); - EXPECT_EQ(mm.template GetBaseHeight(), baseN); - EXPECT_EQ(mm.template GetBaseWidth(), baseK); - EXPECT_EQ(mm.template GetBaseUseWidth(), baseK); - EXPECT_EQ(mm.template GetBaseUseHeight(), baseN); - EXPECT_EQ(mm.GetBaseUseStepK(), 32); -} + EXPECT_EQ(mm.GetOrgKa(), 16); + EXPECT_EQ(mm.GetOrgKa(), 16); + EXPECT_EQ(mm.GetOrgKc(), 32); + EXPECT_EQ(mm.GetOrgN(), 32); + EXPECT_EQ(mm.GetOrgM(), 32); -TEST_F(TestMatmulShapeInfo, test_get_iterate_params) -{ - int32_t baseK = 16; - mm.SetSingleCoreParams(32, 32, 32); - mm.SetInitBParams(2, 2, 32, baseK, 1); - EXPECT_EQ(mm.GetTotalRow(), 2); - EXPECT_EQ(mm.GetTotalCol(), 1); - EXPECT_EQ(mm.template GetTotalRow(), 1); - EXPECT_EQ(mm.template GetTotalCol(), 2); } - -TEST_F(TestMatmulShapeInfo, test_set_input_params) -{ - GlobalTensor fakeInput; - mm.SetGlobalAddr(fakeInput.address_, false); - EXPECT_FALSE(mm.IsTranspose()); - EXPECT_TRUE(mm.IsKRowDirec()); -} - -TEST_F(TestMatmulShapeInfo, test_set_intrablock_input_params) -{ - GlobalTensor fakeInput; - mm.template SetGlobalAddr(fakeInput.address_, false); - EXPECT_FALSE(mm.template IsTranspose()); - EXPECT_TRUE(mm.template IsKRowDirec()); -} \ No newline at end of file diff --git a/tests/matmul/test_matmul_shape_info_left.cpp b/tests/matmul/test_matmul_shape_info_left.cpp index 50fb4ac35dad0b10ce99bc938fe23ba7888e1047..53dcf48defa5af08b66eb10c18852d910e7ba735 100644 --- a/tests/matmul/test_matmul_shape_info_left.cpp +++ b/tests/matmul/test_matmul_shape_info_left.cpp @@ -11,28 +11,25 @@ #include "impl/matmul/modules/matmul_policy.h" #include "impl/matmul/modules/matmul_private_modules.h" #define private public -#include "impl/matmul/modules/matmul_var.h" -#include "impl/matmul/modules/matmul_shape_tiling.h" +#include "impl/matmul/modules/param/matmul_shape_tiling.h" using namespace std; using namespace AscendC; -using namespace matmul; +using namespace Gemm; namespace { template class MatmulImpl -: MATMUL_IMPORT_MODULE_PRIVATE(MatmulVarA) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTilingA) -, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfoA) +: MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeTiling) +, MATMUL_IMPORT_MODULE_PRIVATE(MatmulShapeInfo) { - MATMUL_ALLOW_USING_PRIVATE(MatmulVarA); - MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTilingA); - MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfoA); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeTiling); + MATMUL_ALLOW_USING_PRIVATE(MatmulShapeInfo); 
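+    // The test harness now composes the consolidated MatmulShapeTiling and MatmulShapeInfo
+    // private modules in place of the former MatmulVarA/MatmulShapeTilingA/MatmulShapeInfoA imports.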
using SrcT = typename A_TYPE::T; public: using VAR_PARAMS = - typename MatmulParams::PARAMS; + typename Impl::Detail::MatmulParams::PARAMS; MatmulImpl() { InitVar(); } @@ -67,7 +64,11 @@ public: N_ = singleN; Ka_ = singleK; Kb_ = singleK; + Kc_ = singleN; } + var.mIter_ = singleM / 16; + var.nIter_ = singleN / 16; + var.kIter_ = singleK / 16; } void SetInitAParams(int32_t stepM, int32_t stepKa, int32_t baseM, int32_t baseK, int32_t depth, bool isIntraBlock = false) { @@ -155,6 +156,7 @@ private: int32_t N_; int32_t Ka_; int32_t Kb_; + int32_t Kc_; int32_t batchA_; int32_t batchB_; @@ -196,10 +198,10 @@ protected: void TearDown() {} private: - using A_TYPE = matmul::MatmulType; - using B_TYPE = matmul::MatmulType; - using C_TYPE = matmul::MatmulType; - using BIAS_TYPE = matmul::MatmulType; + using A_TYPE = MatmulType; + using B_TYPE = MatmulType; + using C_TYPE = MatmulType; + using BIAS_TYPE = MatmulType; MatmulImpl mm; }; @@ -207,85 +209,16 @@ private: TEST_F(TestMatmulShapeInfoA, test_get_singleCore_params) { mm.SetSingleCoreParams(32, 32, 16); - EXPECT_EQ(mm.GetOrgHeight(), 32); - EXPECT_EQ(mm.GetOrgWidth(), 16); - EXPECT_EQ(mm.GetSingleHeight(), 32); - EXPECT_EQ(mm.GetSingleWidth(), 16); - - EXPECT_EQ(mm.template GetOrgHeight(), 16); - EXPECT_EQ(mm.template GetOrgWidth(), 32); - EXPECT_EQ(mm.template GetSingleHeight(), 16); - EXPECT_EQ(mm.template GetSingleWidth(), 32); -} - -TEST_F(TestMatmulShapeInfoA, test_get_intrablock_singleCore_params) -{ - mm.SetSingleCoreParams(32, 32, 16, true); - auto res = mm.template GetOrgHeight(); - EXPECT_EQ(res, 32); - res = mm.template GetOrgWidth(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 16); - - res = mm.template GetOrgHeight(); - EXPECT_EQ(res, 16); - res = mm.template GetOrgWidth(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 32); - res = mm.template GetSingleHeight(); - EXPECT_EQ(res, 16); - res = mm.template GetSingleWidth(); - EXPECT_EQ(res, 32); -} - -TEST_F(TestMatmulShapeInfoA, test_get_base_params) -{ - int32_t baseK = 16; - int32_t baseM = 32; - mm.SetInitAParams(2, 2, baseM, baseK, 1); - EXPECT_EQ(mm.GetBaseHeight(), baseM); - EXPECT_EQ(mm.GetBaseWidth(), baseK); - EXPECT_EQ(mm.GetBaseUseWidth(), baseK); - EXPECT_EQ(mm.GetBaseUseStepK(), 32); - - EXPECT_EQ(mm.template GetBaseHeight(), baseK); - EXPECT_EQ(mm.template GetBaseWidth(), baseM); - EXPECT_EQ(mm.template GetBaseUseWidth(), baseM); - EXPECT_EQ(mm.GetBaseUseStepK(), 32); - + EXPECT_EQ(mm.GetSingleCoreM(), 32); + EXPECT_EQ(mm.GetSingleCoreN(), 32); + + EXPECT_EQ(mm.GetKIter(), 1); + EXPECT_EQ(mm.GetMIter(), 2); + EXPECT_EQ(mm.GetNIter(), 2); + + EXPECT_EQ(mm.GetOrgKa(), 16); + EXPECT_EQ(mm.GetOrgKb(), 16); + EXPECT_EQ(mm.GetOrgKc(), 32); + EXPECT_EQ(mm.GetOrgN(), 32); + EXPECT_EQ(mm.GetOrgM(), 32); } - -TEST_F(TestMatmulShapeInfoA, test_get_iterate_params) -{ - int32_t baseK = 16; - mm.SetSingleCoreParams(32, 32, 32); - mm.SetInitAParams(2, 2, 32, baseK, 1); - EXPECT_EQ(mm.GetTotalRow(), 1); - EXPECT_EQ(mm.GetTotalCol(), 2); - EXPECT_EQ(mm.template GetTotalRow(), 2); - EXPECT_EQ(mm.template GetTotalCol(), 1); -} - -TEST_F(TestMatmulShapeInfoA, test_get_transpose_params) -{ - mm.SetTranspose(true, false); - EXPECT_TRUE(mm.IsTranspose()); - EXPECT_TRUE(mm.IsKRowDirec()); -} - 
-TEST_F(TestMatmulShapeInfoA, test_get_intrablock_transpose_params) -{ - mm.SetTranspose(true, false, true); - EXPECT_TRUE(mm.template IsTranspose()); - EXPECT_TRUE(mm.template IsKRowDirec()); -} \ No newline at end of file diff --git a/tests/matmul/test_operator_matmul_v200.cpp b/tests/matmul/test_operator_matmul_v200.cpp index 1b63634a675194b7663a4991cd6c338726afa455..f3efa9d000ccc107f08fd3b1659a3651bbde1c27 100644 --- a/tests/matmul/test_operator_matmul_v200.cpp +++ b/tests/matmul/test_operator_matmul_v200.cpp @@ -1,392 +1,392 @@ -/** - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -#include -#include "kernel_operator.h" -#include "kernel_event.h" -#include "kernel_tiling/kernel_tiling.h" -#include "lib/matmul/tiling.h" -#include "lib/matmul/matmul.h" - -using namespace std; -using namespace AscendC; - -struct TilingParams { - __aicore__ TilingParams() {} - __aicore__ TilingParams(uint32_t coreNum, uint32_t M, uint32_t N, uint32_t K, uint32_t singleCoreM, - uint32_t singleCoreN, uint32_t singleCoreK, uint32_t baseM, uint32_t baseN, uint32_t baseK, uint32_t depthA1, - uint32_t depthB1, uint32_t stepM, uint32_t stepN, uint32_t stepKa, uint32_t stepKb, uint32_t isbias, - uint32_t iterateOrder) : coreNum_(coreNum), M_(M), N_(N), K_(K), - singleCoreM_(singleCoreM), singleCoreN_(singleCoreN), singleCoreK_(singleCoreK), baseM_(baseM), baseN_(baseN), - baseK_(baseK), depthA1_(depthA1), depthB1_(depthB1), stepM_(stepM), stepN_(stepN), stepKa_(stepKa), - stepKb_(stepKb), isbias_(isbias), iterateOrder_(iterateOrder) {} - __aicore__ void GetTiling(TCubeTiling &tiling) - { - tiling.usedCoreNum = coreNum_; - tiling.M = M_; - tiling.N = N_; - tiling.Ka = K_; - tiling.Kb = K_; - tiling.singleCoreM = singleCoreM_; - tiling.singleCoreN = singleCoreN_; - tiling.singleCoreK = singleCoreK_; - tiling.baseM = baseM_; - tiling.baseN = baseN_; - tiling.baseK = baseK_; - tiling.depthA1 = depthA1_; - tiling.depthB1 = depthB1_; - tiling.stepM = stepM_; - tiling.stepN = stepN_; - tiling.stepKa = stepKa_; - tiling.stepKb = stepKb_; - tiling.isBias = isbias_; - tiling.iterateOrder = iterateOrder_; - } - uint32_t coreNum_; - uint32_t M_; - uint32_t N_; - uint32_t K_; - uint32_t singleCoreM_; - uint32_t singleCoreN_; - uint32_t singleCoreK_; - uint32_t baseM_; - uint32_t baseN_; - uint32_t baseK_; - uint32_t depthA1_; - uint32_t depthB1_; - uint32_t stepM_; - uint32_t stepN_; - uint32_t stepKa_; - uint32_t stepKb_; - uint32_t isbias_; - uint32_t iterateOrder_; -}; - -template -__aicore__ inline int32_t CalcGMOffset(int blockIdx, int usedCoreNum, TCubeTiling& param, int& offsetA, int& offsetB, - int& offsetC, int& offsetBias, int32_t isTransposeAIn, int32_t isTransposeBIn) -{ - auto temp0 = ConstCeil(param.M, param.singleCoreM); - auto temp1 = ConstCeil(param.N, param.singleCoreN); - auto temp2 = ConstCeil(param.Ka, param.singleCoreK); // is 1 when k-axis is full loaded - - auto divideKcoreNum = usedCoreNum / temp2; - - auto mCoreIndx = (blockIdx % divideKcoreNum) % 
temp0; - auto nCoreIndx = (blockIdx % divideKcoreNum) / temp0; - auto subKindx = blockIdx / divideKcoreNum; // default 0 - - if constexpr (A_TYPE::format == CubeFormat::ND) { - if (isTransposeAIn > 0) { - offsetA = mCoreIndx * param.singleCoreM + subKindx * param.M * param.singleCoreK; - } else { - offsetA = mCoreIndx * param.Ka * param.singleCoreM + subKindx * param.singleCoreK; - } - } else if constexpr (A_TYPE::format == CubeFormat::NZ) { - offsetA = subKindx * param.singleCoreK * param.M + mCoreIndx * param.singleCoreM * BLOCK_CUBE; - } else if constexpr (A_TYPE::format == CubeFormat::SCALAR) { - // no need offsetA - } else if constexpr (A_TYPE::format == CubeFormat::VECTOR) { - // m only support 1, no need offsetA? - } else { - return -1; - } - - if constexpr (B_TYPE::format == CubeFormat::ND) { - if (isTransposeBIn > 0) { - offsetB = subKindx * param.singleCoreK + nCoreIndx * param.Ka * param.singleCoreN; - } else { - offsetB = subKindx * param.singleCoreK * param.N + nCoreIndx * param.singleCoreN; - } - } else if constexpr (B_TYPE::format == CubeFormat::NZ) { - offsetB = param.Kb * nCoreIndx * param.singleCoreN + subKindx * param.singleCoreK * BLOCK_CUBE; - } else { - return -1; - } - - if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { - offsetC = mCoreIndx * param.N * param.singleCoreM + nCoreIndx * param.singleCoreN; - } else if constexpr (C_TYPE::format == CubeFormat::NZ) { - offsetC = param.M * nCoreIndx * param.singleCoreN + mCoreIndx * param.singleCoreM * BLOCK_CUBE; - } else { - return -1; - } - - if constexpr (BIAS_TYPE::format == CubeFormat::ND) { - offsetBias = nCoreIndx * param.singleCoreN; - } else { - return -1; - } - - // tail M - int gmUseM = param.M - mCoreIndx * param.singleCoreM; - param.singleCoreM = gmUseM < param.singleCoreM ? gmUseM : param.singleCoreM; - - // tail N - int gmUseN = param.N - nCoreIndx * param.singleCoreN; - param.singleCoreN = gmUseN < param.singleCoreN ? gmUseN : param.singleCoreN; - - // tail K - int gmUseK = param.Ka - subKindx * param.singleCoreK; - param.singleCoreK = gmUseK < param.singleCoreK ? gmUseK : param.singleCoreK; - return 0; -} - -template -__aicore__ inline void main_kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, GM_ADDR biasGM, - TilingParams &tilingParam, int32_t isTransposeAIn, int32_t isTransposeBIn, bool enSequentialWrite) -{ - using A_T = typename A_TYPE::T; - using B_T = typename B_TYPE::T; - using C_T = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - - set_atomic_none(); - TPipe que; - TCubeTiling tiling; - tilingParam.GetTiling(tiling); - -#if __CCE_AICORE__ == 200 - int A1Length = tiling.baseM * tiling.baseK * sizeof(A_T); - int B1Length = tiling.baseK * tiling.baseN * sizeof(B_T); - int CO1Length = tiling.baseM * tiling.baseN * sizeof(C_T); - tiling.transLength = std::max(std::max(A1Length, B1Length), CO1Length); -#endif - - bool isTransposeA = isTransposeAIn > 0 ? true : false; - bool isTransposeB = isTransposeBIn > 0 ? 
true : false; - if (block_idx >= tiling.usedCoreNum) { - return; - } - - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; - - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ A_T*>(aGM), tiling.M * tiling.Ka); - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ B_T*>(bGM), tiling.Kb * tiling.N); - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ C_T*>(cGM), tiling.M * tiling.N); - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasT*>(biasGM), tiling.N); - - GlobalTensor quantGlobal; - quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(biasGM), tiling.N); - - int offsetA = 0; - int offsetB = 0; - int offsetC = 0; - int offsetBias = 0; - CalcGMOffset(block_idx, tiling.usedCoreNum, tiling, offsetA, offsetB, offsetC, - offsetBias, isTransposeAIn, isTransposeBIn); - - auto gmA = aGlobal[offsetA]; - auto gmB = bGlobal[offsetB]; - auto gmC = cGlobal[offsetC]; - auto gmBias = biasGlobal[offsetBias]; - - TQue leftMatrix; - TQue rightMatrix; - TQue biasQue; - TQue resultCMatrix; - - TQue qidA1; - TQue qidB1; - - matmul::MatmulImpl mm; - mm.SetSubBlockIdx(0); - mm.Init(&tiling, &que); - - LocalTensor bufferLeft; - LocalTensor bufferRight; - LocalTensor bufferC; - LocalTensor bufferBias; - -#if __CCE_AICORE__ == 200 - TBuf<> ubBuf; - LocalTensor workspaceBuffer; - que.InitBuffer(ubBuf, 131072); - workspaceBuffer = ubBuf.template Get(); - mm.SetLocalWorkspace(workspaceBuffer); -#endif - - if constexpr (A_TYPE::pos == TPosition::VECCALC) { - que.InitBuffer(leftMatrix, 1, tiling.M * tiling.Ka * 2); - bufferLeft = leftMatrix.AllocTensor(); - DataCopy(bufferLeft, gmA, tiling.M * tiling.Ka); - pipe_barrier(PIPE_ALL); - mm.SetTensorA(bufferLeft, isTransposeA); - } else if constexpr (A_TYPE::pos == TPosition::SHM) { - que.InitBuffer(qidA1, 1, tiling.M * tiling.Ka * sizeof(A_T)); - bufferLeft = qidA1.AllocTensor(); - int c0Size = 16; - if constexpr (sizeof(A_T) == sizeof(float)) { - c0Size = 8; - } - if (!isTransposeA) { - int blockLen = tiling.M * c0Size * sizeof(A_T) / ONE_BLK_SIZE; - DataCopy(bufferLeft, gmA, - { static_cast(tiling.Ka / c0Size), static_cast(blockLen), 0, 0 }); - } else { - int blockLen = tiling.Ka * c0Size * sizeof(A_T) / ONE_BLK_SIZE; - DataCopy(bufferLeft, gmA, - { static_cast(tiling.M / c0Size), static_cast(blockLen), 0, 0 }); - } - pipe_barrier(PIPE_ALL); - mm.SetTensorA(bufferLeft, isTransposeA); - } else if constexpr (A_TYPE::pos == TPosition::GM && A_TYPE::format == CubeFormat::SCALAR) { - A_T scalar_num = 2; - mm.SetTensorA(scalar_num); - } else { - mm.SetTensorA(gmA, isTransposeA); - } - - if constexpr (B_TYPE::pos == TPosition::VECCALC) { - que.InitBuffer(rightMatrix, 1, tiling.Kb * tiling.N * 2); - bufferRight = rightMatrix.AllocTensor(); - DataCopy(bufferRight, gmB, tiling.Kb * tiling.N); - pipe_barrier(PIPE_ALL); - mm.SetTensorB(bufferRight, isTransposeB); - } else if constexpr (B_TYPE::pos == TPosition::SHM) { - que.InitBuffer(qidB1, 1, tiling.Kb * tiling.N * sizeof(B_T)); - bufferRight = qidB1.AllocTensor(); - int c0Size = 16; - if constexpr (sizeof(B_T) == sizeof(float)) { - c0Size = 8; - } - if (!isTransposeB) { - int blockLen = tiling.Kb * c0Size * sizeof(B_T) / ONE_BLK_SIZE; - DataCopy(bufferRight, gmB, - { static_cast(tiling.N / c0Size), static_cast(blockLen), 0, 0 }); - } else { - int blockLen = tiling.N * c0Size * sizeof(B_T) / ONE_BLK_SIZE; - DataCopy(bufferRight, gmB, - { static_cast(tiling.Kb / c0Size), static_cast(blockLen), 0, 0 }); - } - pipe_barrier(PIPE_ALL); - mm.SetTensorB(bufferRight, 
isTransposeB); - } else { - mm.SetTensorB(gmB, isTransposeB); - } - - if constexpr (BIAS_TYPE::pos == TPosition::VECCALC) { - que.InitBuffer(biasQue, 1, tiling.N * 4); - bufferBias = biasQue.AllocTensor(); - DataCopy(bufferBias, gmBias, tiling.N); - pipe_barrier(PIPE_ALL); - if (tiling.isBias) { - mm.SetBias(bufferBias); - } - } else { - if (tiling.isBias) { - mm.SetBias(gmBias); - } - } - if constexpr ((IsSameType::value || IsSameType::value) - && IsSameType::value) { - mm.SetQuantVector(quantGlobal); - } - - if constexpr (C_TYPE::pos == TPosition::VECCALC) { -#if __CCE_AICORE__ < 220 - que.InitBuffer(resultCMatrix, 1, tiling.M * tiling.N * 4); - bufferC = resultCMatrix.AllocTensor(); - mm.IterateAll(bufferC); - pipe_barrier(PIPE_ALL); - DataCopy(gmC, bufferC, tiling.M * tiling.N); -#endif - } else { - while (mm.Iterate()) { - mm.GetTensorC(gmC, 0, enSequentialWrite); - } - } - pipe_barrier(PIPE_ALL); - if constexpr (A_TYPE::pos == TPosition::VECCALC) { - leftMatrix.FreeTensor(bufferLeft); - } - if constexpr (A_TYPE::pos == TPosition::SHM) { - qidA1.FreeTensor(bufferLeft); - } - if constexpr (B_TYPE::pos == TPosition::VECCALC) { - rightMatrix.FreeTensor(bufferRight); - } - if constexpr (B_TYPE::pos == TPosition::SHM) { - qidB1.FreeTensor(bufferRight); - } - if constexpr (BIAS_TYPE::pos == TPosition::VECCALC) { - biasQue.FreeTensor(bufferBias); - } - if constexpr (C_TYPE::pos == TPosition::VECCALC) { - resultCMatrix.FreeTensor(bufferC); - } - - set_atomic_none(); -} - -class TEST_KERNEL_MATMUL : public testing::Test { -protected: - void SetUp() {} - void TearDown() {} -}; - -#define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ - A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling) \ - namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling{ \ - typedef matmul::MatmulType aType; \ - typedef matmul::MatmulType bType; \ - typedef matmul::MatmulType cType; \ - typedef matmul::MatmulType biasType; \ - constexpr static MatmulConfig mmCFG = CFG_Mode; \ - constexpr static MatmulApiStaticTiling mmTiling = matmul::GetMatmulApiTiling(mmCFG); \ - TEST_F(TEST_KERNEL_MATMUL, Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling) \ - { \ - const int32_t left_data_size = tilingParams.M_ * tilingParams.K_; \ - const int32_t right_data_size = tilingParams.K_ * tilingParams.N_; \ - const int32_t bias_data_size = tilingParams.N_; \ - const int32_t output_data_size = tilingParams.M_ * tilingParams.N_; \ - uint8_t left_global[left_data_size * sizeof(A_DType)] = {0}; \ - uint8_t right_global[right_data_size * sizeof(B_DType)] = {0}; \ - uint8_t bias_global[bias_data_size * sizeof(BIAS_DType)] = {0}; \ - uint8_t output_global[output_data_size * sizeof(C_DType)] = {0};\ - if (enTiling) {\ - main_kernel_matmul(left_global, right_global, output_global, bias_global, tilingParams, isTransposeA, isTransposeB, enSequentialWrite);\ - }\ - else {\ - main_kernel_matmul(left_global, right_global, output_global, bias_global, tilingParams, isTransposeA, 
isTransposeB, enSequentialWrite);\ - }\ - for (int32_t i = 0; i < output_data_size * sizeof(C_DType); i++) { \ - EXPECT_EQ(output_global[i], 0x00); \ - } \ - } \ - } - - -// coreNum, M, N, K, singleCoreM, singleCoreN, singleCoreK, baseM, baseN, baseK, depthA1, depthB1, stepM, stepN, stepKa, stepKb, isBias, iterateOrder -TilingParams tiling_params_case1_310p = {1, 1, 128, 128, 64, 128, 128, 32, 32, 128, 1, 1, 1, 1, 1, 1, 1, 0}; -TilingParams tiling_params_case2_310p = {8, 64, 256, 256, 32, 64, 256, 32, 64, 256, 1, 1, 1, 1, 1, 1, 1, 0}; -TilingParams tiling_params_case3_310p = {1, 16, 32, 768, 16, 32, 768, 16, 32, 320, 3, 3, 1, 1, 3, 3, 0, 0}; -TilingParams tiling_params_case4_310p = {1, 64, 256, 256, 64, 256, 256, 32, 64, 256, 2, 4, 1, 2, 1, 1, 1, 0}; -TilingParams tiling_params_case5_310p = {1, 64, 256, 256, 64, 256, 256, 32, 64, 256, 2, 4, 1, 2, 1, 1, 1, 1}; -TilingParams tiling_params_case6_310p = {1, 16, 64, 32, 16, 64, 32, 16, 48, 32, 1, 2, 1, 2, 1, 1, 0, 0}; -TilingParams tiling_params_case7_310p = {1, 64, 128, 32, 64, 128, 32, 32, 32, 32, 2, 1, 1, 1, 1, 1, 0, 0}; -TilingParams tiling_params_case8_310p = {1, 32, 32, 32, 32, 32, 32, 32, 32, 32, 1, 1, 1, 1, 1, 1, 0, 0}; - -constexpr MatmulConfig MM_CFG_ENVEC = GetNormalConfig(false, false, true); -// TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case1_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false); // MDL -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 1, 0, CFG_MDL, false, false); // MDL + A trans -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 1, CFG_MDL, false, false); // MDL + B trans -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_NORM, false, false); -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false); -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case3_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 1, 1, CFG_MDL, false, false); // tail K + B trans -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case3_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 1, 0, CFG_MDL, false, false); // tail K + B not trans -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case4_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false); -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case5_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false); -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case6_310p, VECCALC, VECCALC, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false); -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case6_310p, VECCALC, VECCALC, GM, GM, NZ, NZ, ND, ND, half, half, float, float, 0, 0, CFG_NORM, false, false); -KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case7_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, MM_CFG_ENVEC, false, false); \ No newline at end of file +/** + * Copyright (c) 2024 
Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#include "kernel_operator.h" +#include "kernel_event.h" +#include "kernel_tiling/kernel_tiling.h" +#include "lib/matmul/tiling.h" +#include "lib/matmul/matmul.h" + +using namespace std; +using namespace AscendC; + +struct TilingParams { + __aicore__ TilingParams() {} + __aicore__ TilingParams(uint32_t coreNum, uint32_t M, uint32_t N, uint32_t K, uint32_t singleCoreM, + uint32_t singleCoreN, uint32_t singleCoreK, uint32_t baseM, uint32_t baseN, uint32_t baseK, uint32_t depthA1, + uint32_t depthB1, uint32_t stepM, uint32_t stepN, uint32_t stepKa, uint32_t stepKb, uint32_t isbias, + uint32_t iterateOrder) : coreNum_(coreNum), M_(M), N_(N), K_(K), + singleCoreM_(singleCoreM), singleCoreN_(singleCoreN), singleCoreK_(singleCoreK), baseM_(baseM), baseN_(baseN), + baseK_(baseK), depthA1_(depthA1), depthB1_(depthB1), stepM_(stepM), stepN_(stepN), stepKa_(stepKa), + stepKb_(stepKb), isbias_(isbias), iterateOrder_(iterateOrder) {} + __aicore__ void GetTiling(TCubeTiling &tiling) + { + tiling.usedCoreNum = coreNum_; + tiling.M = M_; + tiling.N = N_; + tiling.Ka = K_; + tiling.Kb = K_; + tiling.singleCoreM = singleCoreM_; + tiling.singleCoreN = singleCoreN_; + tiling.singleCoreK = singleCoreK_; + tiling.baseM = baseM_; + tiling.baseN = baseN_; + tiling.baseK = baseK_; + tiling.depthA1 = depthA1_; + tiling.depthB1 = depthB1_; + tiling.stepM = stepM_; + tiling.stepN = stepN_; + tiling.stepKa = stepKa_; + tiling.stepKb = stepKb_; + tiling.isBias = isbias_; + tiling.iterateOrder = iterateOrder_; + } + uint32_t coreNum_; + uint32_t M_; + uint32_t N_; + uint32_t K_; + uint32_t singleCoreM_; + uint32_t singleCoreN_; + uint32_t singleCoreK_; + uint32_t baseM_; + uint32_t baseN_; + uint32_t baseK_; + uint32_t depthA1_; + uint32_t depthB1_; + uint32_t stepM_; + uint32_t stepN_; + uint32_t stepKa_; + uint32_t stepKb_; + uint32_t isbias_; + uint32_t iterateOrder_; +}; + +template +__aicore__ inline int32_t CalcGMOffset(int blockIdx, int usedCoreNum, TCubeTiling& param, int& offsetA, int& offsetB, + int& offsetC, int& offsetBias, int32_t isTransposeAIn, int32_t isTransposeBIn) +{ + auto temp0 = ConstCeil(param.M, param.singleCoreM); + auto temp1 = ConstCeil(param.N, param.singleCoreN); + auto temp2 = ConstCeil(param.Ka, param.singleCoreK); // is 1 when k-axis is full loaded + + auto divideKcoreNum = usedCoreNum / temp2; + + auto mCoreIndx = (blockIdx % divideKcoreNum) % temp0; + auto nCoreIndx = (blockIdx % divideKcoreNum) / temp0; + auto subKindx = blockIdx / divideKcoreNum; // default 0 + + if constexpr (A_TYPE::format == CubeFormat::ND) { + if (isTransposeAIn > 0) { + offsetA = mCoreIndx * param.singleCoreM + subKindx * param.M * param.singleCoreK; + } else { + offsetA = mCoreIndx * param.Ka * param.singleCoreM + subKindx * param.singleCoreK; + } + } else if constexpr (A_TYPE::format == CubeFormat::NZ) { + offsetA = subKindx * param.singleCoreK * param.M + mCoreIndx * param.singleCoreM * BLOCK_CUBE; + } else if constexpr 
(A_TYPE::format == CubeFormat::SCALAR) { + // no need offsetA + } else if constexpr (A_TYPE::format == CubeFormat::VECTOR) { + // m only support 1, no need offsetA? + } else { + return -1; + } + + if constexpr (B_TYPE::format == CubeFormat::ND) { + if (isTransposeBIn > 0) { + offsetB = subKindx * param.singleCoreK + nCoreIndx * param.Ka * param.singleCoreN; + } else { + offsetB = subKindx * param.singleCoreK * param.N + nCoreIndx * param.singleCoreN; + } + } else if constexpr (B_TYPE::format == CubeFormat::NZ) { + offsetB = param.Kb * nCoreIndx * param.singleCoreN + subKindx * param.singleCoreK * BLOCK_CUBE; + } else { + return -1; + } + + if constexpr (C_TYPE::format == CubeFormat::ND || C_TYPE::format == CubeFormat::ND_ALIGN) { + offsetC = mCoreIndx * param.N * param.singleCoreM + nCoreIndx * param.singleCoreN; + } else if constexpr (C_TYPE::format == CubeFormat::NZ) { + offsetC = param.M * nCoreIndx * param.singleCoreN + mCoreIndx * param.singleCoreM * BLOCK_CUBE; + } else { + return -1; + } + + if constexpr (BIAS_TYPE::format == CubeFormat::ND) { + offsetBias = nCoreIndx * param.singleCoreN; + } else { + return -1; + } + + // tail M + int gmUseM = param.M - mCoreIndx * param.singleCoreM; + param.singleCoreM = gmUseM < param.singleCoreM ? gmUseM : param.singleCoreM; + + // tail N + int gmUseN = param.N - nCoreIndx * param.singleCoreN; + param.singleCoreN = gmUseN < param.singleCoreN ? gmUseN : param.singleCoreN; + + // tail K + int gmUseK = param.Ka - subKindx * param.singleCoreK; + param.singleCoreK = gmUseK < param.singleCoreK ? gmUseK : param.singleCoreK; + return 0; +} + +template +__aicore__ inline void main_kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, GM_ADDR biasGM, + TilingParams &tilingParam, int32_t isTransposeAIn, int32_t isTransposeBIn, bool enSequentialWrite) +{ + using A_T = typename A_TYPE::T; + using B_T = typename B_TYPE::T; + using C_T = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + + set_atomic_none(); + TPipe que; + TCubeTiling tiling; + tilingParam.GetTiling(tiling); + +#if __CCE_AICORE__ == 200 + int A1Length = tiling.baseM * tiling.baseK * sizeof(A_T); + int B1Length = tiling.baseK * tiling.baseN * sizeof(B_T); + int CO1Length = tiling.baseM * tiling.baseN * sizeof(C_T); + tiling.transLength = std::max(std::max(A1Length, B1Length), CO1Length); +#endif + + bool isTransposeA = isTransposeAIn > 0 ? true : false; + bool isTransposeB = isTransposeBIn > 0 ? 
true : false; + if (block_idx >= tiling.usedCoreNum) { + return; + } + + GlobalTensor aGlobal; + GlobalTensor bGlobal; + GlobalTensor cGlobal; + GlobalTensor biasGlobal; + + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ A_T*>(aGM), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ B_T*>(bGM), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ C_T*>(cGM), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasT*>(biasGM), tiling.N); + + GlobalTensor quantGlobal; + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(biasGM), tiling.N); + + int offsetA = 0; + int offsetB = 0; + int offsetC = 0; + int offsetBias = 0; + CalcGMOffset(block_idx, tiling.usedCoreNum, tiling, offsetA, offsetB, offsetC, + offsetBias, isTransposeAIn, isTransposeBIn); + + auto gmA = aGlobal[offsetA]; + auto gmB = bGlobal[offsetB]; + auto gmC = cGlobal[offsetC]; + auto gmBias = biasGlobal[offsetBias]; + + TQue leftMatrix; + TQue rightMatrix; + TQue biasQue; + TQue resultCMatrix; + + TQue qidA1; + TQue qidB1; + + Gemm::MatmulImpl mm; + mm.SetSubBlockIdx(0); + mm.Init(&tiling, &que); + + LocalTensor bufferLeft; + LocalTensor bufferRight; + LocalTensor bufferC; + LocalTensor bufferBias; + +#if __CCE_AICORE__ == 200 + TBuf<> ubBuf; + LocalTensor workspaceBuffer; + que.InitBuffer(ubBuf, 131072); + workspaceBuffer = ubBuf.template Get(); + mm.SetLocalWorkspace(workspaceBuffer); +#endif + + if constexpr (A_TYPE::pos == TPosition::VECCALC) { + que.InitBuffer(leftMatrix, 1, tiling.M * tiling.Ka * 2); + bufferLeft = leftMatrix.AllocTensor(); + DataCopy(bufferLeft, gmA, tiling.M * tiling.Ka); + pipe_barrier(PIPE_ALL); + mm.SetTensorA(bufferLeft, isTransposeA); + } else if constexpr (A_TYPE::pos == TPosition::SHM) { + que.InitBuffer(qidA1, 1, tiling.M * tiling.Ka * sizeof(A_T)); + bufferLeft = qidA1.AllocTensor(); + int c0Size = 16; + if constexpr (sizeof(A_T) == sizeof(float)) { + c0Size = 8; + } + if (!isTransposeA) { + int blockLen = tiling.M * c0Size * sizeof(A_T) / ONE_BLK_SIZE; + DataCopy(bufferLeft, gmA, + { static_cast(tiling.Ka / c0Size), static_cast(blockLen), 0, 0 }); + } else { + int blockLen = tiling.Ka * c0Size * sizeof(A_T) / ONE_BLK_SIZE; + DataCopy(bufferLeft, gmA, + { static_cast(tiling.M / c0Size), static_cast(blockLen), 0, 0 }); + } + pipe_barrier(PIPE_ALL); + mm.SetTensorA(bufferLeft, isTransposeA); + } else if constexpr (A_TYPE::pos == TPosition::GM && A_TYPE::format == CubeFormat::SCALAR) { + A_T scalar_num = 2; + mm.SetTensorA(scalar_num); + } else { + mm.SetTensorA(gmA, isTransposeA); + } + + if constexpr (B_TYPE::pos == TPosition::VECCALC) { + que.InitBuffer(rightMatrix, 1, tiling.Kb * tiling.N * 2); + bufferRight = rightMatrix.AllocTensor(); + DataCopy(bufferRight, gmB, tiling.Kb * tiling.N); + pipe_barrier(PIPE_ALL); + mm.SetTensorB(bufferRight, isTransposeB); + } else if constexpr (B_TYPE::pos == TPosition::SHM) { + que.InitBuffer(qidB1, 1, tiling.Kb * tiling.N * sizeof(B_T)); + bufferRight = qidB1.AllocTensor(); + int c0Size = 16; + if constexpr (sizeof(B_T) == sizeof(float)) { + c0Size = 8; + } + if (!isTransposeB) { + int blockLen = tiling.Kb * c0Size * sizeof(B_T) / ONE_BLK_SIZE; + DataCopy(bufferRight, gmB, + { static_cast(tiling.N / c0Size), static_cast(blockLen), 0, 0 }); + } else { + int blockLen = tiling.N * c0Size * sizeof(B_T) / ONE_BLK_SIZE; + DataCopy(bufferRight, gmB, + { static_cast(tiling.Kb / c0Size), static_cast(blockLen), 0, 0 }); + } + pipe_barrier(PIPE_ALL); + mm.SetTensorB(bufferRight, isTransposeB); 
+ } else { + mm.SetTensorB(gmB, isTransposeB); + } + + if constexpr (BIAS_TYPE::pos == TPosition::VECCALC) { + que.InitBuffer(biasQue, 1, tiling.N * 4); + bufferBias = biasQue.AllocTensor(); + DataCopy(bufferBias, gmBias, tiling.N); + pipe_barrier(PIPE_ALL); + if (tiling.isBias) { + mm.SetBias(bufferBias); + } + } else { + if (tiling.isBias) { + mm.SetBias(gmBias); + } + } + if constexpr ((IsSameType::value || IsSameType::value) + && IsSameType::value) { + mm.SetQuantVector(quantGlobal); + } + + if constexpr (C_TYPE::pos == TPosition::VECCALC) { +#if __CCE_AICORE__ < 220 + que.InitBuffer(resultCMatrix, 1, tiling.M * tiling.N * 4); + bufferC = resultCMatrix.AllocTensor(); + mm.IterateAll(bufferC); + pipe_barrier(PIPE_ALL); + DataCopy(gmC, bufferC, tiling.M * tiling.N); +#endif + } else { + while (mm.Iterate()) { + mm.GetTensorC(gmC, 0, enSequentialWrite); + } + } + pipe_barrier(PIPE_ALL); + if constexpr (A_TYPE::pos == TPosition::VECCALC) { + leftMatrix.FreeTensor(bufferLeft); + } + if constexpr (A_TYPE::pos == TPosition::SHM) { + qidA1.FreeTensor(bufferLeft); + } + if constexpr (B_TYPE::pos == TPosition::VECCALC) { + rightMatrix.FreeTensor(bufferRight); + } + if constexpr (B_TYPE::pos == TPosition::SHM) { + qidB1.FreeTensor(bufferRight); + } + if constexpr (BIAS_TYPE::pos == TPosition::VECCALC) { + biasQue.FreeTensor(bufferBias); + } + if constexpr (C_TYPE::pos == TPosition::VECCALC) { + resultCMatrix.FreeTensor(bufferC); + } + + set_atomic_none(); +} + +class TEST_KERNEL_MATMUL : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} +}; + +#define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ + A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling) \ + namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling{ \ + typedef Gemm::MatmulType aType; \ + typedef Gemm::MatmulType bType; \ + typedef Gemm::MatmulType cType; \ + typedef Gemm::MatmulType biasType; \ + constexpr static MatmulConfig mmCFG = CFG_Mode; \ + constexpr static MatmulApiStaticTiling mmTiling = Gemm::GetMatmulApiTiling(mmCFG); \ + TEST_F(TEST_KERNEL_MATMUL, Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling) \ + { \ + const int32_t left_data_size = tilingParams.M_ * tilingParams.K_; \ + const int32_t right_data_size = tilingParams.K_ * tilingParams.N_; \ + const int32_t bias_data_size = tilingParams.N_; \ + const int32_t output_data_size = tilingParams.M_ * tilingParams.N_; \ + uint8_t left_global[left_data_size * sizeof(A_DType)] = {0}; \ + uint8_t right_global[right_data_size * sizeof(B_DType)] = {0}; \ + uint8_t bias_global[bias_data_size * sizeof(BIAS_DType)] = {0}; \ + uint8_t output_global[output_data_size * sizeof(C_DType)] = {0};\ + if (enTiling) {\ + main_kernel_matmul(left_global, right_global, output_global, bias_global, tilingParams, isTransposeA, isTransposeB, enSequentialWrite);\ + }\ + else {\ + main_kernel_matmul(left_global, right_global, output_global, bias_global, tilingParams, isTransposeA, isTransposeB, enSequentialWrite);\ + 
}\
+ for (int32_t i = 0; i < output_data_size * sizeof(C_DType); i++) { \
+ EXPECT_EQ(output_global[i], 0x00); \
+ } \
+ } \
+ }
+
+
+// coreNum, M, N, K, singleCoreM, singleCoreN, singleCoreK, baseM, baseN, baseK, depthA1, depthB1, stepM, stepN, stepKa, stepKb, isBias, iterateOrder
+TilingParams tiling_params_case1_310p = {1, 1, 128, 128, 64, 128, 128, 32, 32, 128, 1, 1, 1, 1, 1, 1, 1, 0};
+TilingParams tiling_params_case2_310p = {8, 64, 256, 256, 32, 64, 256, 32, 64, 256, 1, 1, 1, 1, 1, 1, 1, 0};
+TilingParams tiling_params_case3_310p = {1, 16, 32, 768, 16, 32, 768, 16, 32, 320, 3, 3, 1, 1, 3, 3, 0, 0};
+TilingParams tiling_params_case4_310p = {1, 64, 256, 256, 64, 256, 256, 32, 64, 256, 2, 4, 1, 2, 1, 1, 1, 0};
+TilingParams tiling_params_case5_310p = {1, 64, 256, 256, 64, 256, 256, 32, 64, 256, 2, 4, 1, 2, 1, 1, 1, 1};
+TilingParams tiling_params_case6_310p = {1, 16, 64, 32, 16, 64, 32, 16, 48, 32, 1, 2, 1, 2, 1, 1, 0, 0};
+TilingParams tiling_params_case7_310p = {1, 64, 128, 32, 64, 128, 32, 32, 32, 32, 2, 1, 1, 1, 1, 1, 0, 0};
+TilingParams tiling_params_case8_310p = {1, 32, 32, 32, 32, 32, 32, 32, 32, 32, 1, 1, 1, 1, 1, 1, 0, 0};
+
+constexpr MatmulConfig MM_CFG_ENVEC = GetNormalConfig(false, false, true);
+// TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case1_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false); // MDL
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 1, 0, CFG_MDL, false, false); // MDL + A trans
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 1, CFG_MDL, false, false); // MDL + B trans
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_NORM, false, false);
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case2_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false);
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case3_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 1, 1, CFG_MDL, false, false); // tail K + B trans
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case3_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 1, 0, CFG_MDL, false, false); // tail K + B not trans
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case4_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false);
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case5_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false);
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case6_310p, VECCALC, VECCALC, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, CFG_MDL, false, false);
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case6_310p, VECCALC, VECCALC, GM, GM, NZ, NZ, ND, ND, half, half, float, float, 0, 0, CFG_NORM, false, false);
+KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tiling_params_case7_310p, GM, GM, GM, GM, ND, ND, ND, ND, half, half, float, float, 0, 0, MM_CFG_ENVEC, false, false);
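The renamed 310p test above drives CalcGMOffset, whose per-core offset arithmetic is easiest to check with concrete numbers. The standalone C++ sketch below is illustrative only and not part of the patch: it replays the ND, non-transposed branch of CalcGMOffset for tiling_params_case2_310p, with blockIdx = 5 chosen purely as an example value.

// Illustrative sketch (not part of the patch): replays the ND, non-transposed
// branch of CalcGMOffset for tiling_params_case2_310p with an example blockIdx.
#include <cstdio>

int main()
{
    // tiling_params_case2_310p: coreNum = 8, M = 64, N = 256, K = 256,
    // singleCoreM = 32, singleCoreN = 64, singleCoreK = 256
    const int usedCoreNum = 8, M = 64, N = 256, Ka = 256;
    const int singleCoreM = 32, singleCoreN = 64, singleCoreK = 256;

    const int mBlocks = (M + singleCoreM - 1) / singleCoreM;      // temp0 = 2
    const int kBlocks = (Ka + singleCoreK - 1) / singleCoreK;     // temp2 = 1: K fully loaded per core
    const int divideKcoreNum = usedCoreNum / kBlocks;             // 8

    const int blockIdx = 5;                                       // example core index
    const int mCoreIndx = (blockIdx % divideKcoreNum) % mBlocks;  // 1
    const int nCoreIndx = (blockIdx % divideKcoreNum) / mBlocks;  // 2
    const int subKindx = blockIdx / divideKcoreNum;               // 0

    // Same expressions as the ND / non-transposed paths in CalcGMOffset.
    const int offsetA = mCoreIndx * Ka * singleCoreM + subKindx * singleCoreK;  // 8192
    const int offsetB = subKindx * singleCoreK * N + nCoreIndx * singleCoreN;   // 128
    const int offsetC = mCoreIndx * N * singleCoreM + nCoreIndx * singleCoreN;  // 8320
    const int offsetBias = nCoreIndx * singleCoreN;                             // 128
    std::printf("offsetA=%d offsetB=%d offsetC=%d offsetBias=%d\n",
        offsetA, offsetB, offsetC, offsetBias);
    return 0;
}

So core 5 of the eight used cores works on M-block 1 and N-block 2 and, because K is fully loaded, never splits the K axis; for this shape the tail-M/N/K clamping at the end of CalcGMOffset leaves singleCoreM, singleCoreN, and singleCoreK unchanged.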
diff --git a/tests/matmul/test_operator_matmul_v220.cpp b/tests/matmul/test_operator_matmul_v220.cpp
index fe18d98d07dbb9d324680ffabb1e084383938f76..c4d0f34d4b933960f2de3bcb1d204d84eea20366 100644
--- a/tests/matmul/test_operator_matmul_v220.cpp
+++ b/tests/matmul/test_operator_matmul_v220.cpp
@@ -201,7 +201,7 @@ __aicore__ inline void main_kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM,
 TQue qidA1;
 TQue qidB1;
- matmul::MatmulImpl mm;
+ Gemm::MatmulImpl mm;
 mm.SetSubBlockIdx(0);
 mm.Init(&tiling, &que);
@@ -375,12 +375,12 @@ protected:
 #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \
 A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling) \
 namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling{ \
- typedef matmul::MatmulType aType; \
- typedef matmul::MatmulType bType; \
- typedef matmul::MatmulType cType; \
- typedef matmul::MatmulType biasType; \
+ typedef Gemm::MatmulType aType; \
+ typedef Gemm::MatmulType bType; \
+ typedef Gemm::MatmulType cType; \
+ typedef Gemm::MatmulType biasType; \
 constexpr static MatmulConfig mmCFG = CFG_Mode; \
- constexpr static MatmulApiStaticTiling mmTiling = matmul::GetMatmulApiTiling(mmCFG); \
+ constexpr static MatmulApiStaticTiling mmTiling = Gemm::GetMatmulApiTiling(mmCFG); \
 TEST_F(TEST_KERNEL_MATMUL, Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling) \
 { \
 const int32_t left_data_size = tilingParams.M_ * tilingParams.K_; \
diff --git a/tests/matmul/test_operator_matmul_v300.cpp b/tests/matmul/test_operator_matmul_v300.cpp
index 9f74ab819c69cc51ef71e9fa037bbe4abfd881e0..3dae363500f2aab4b7e548a1885c7f3e3da8e85c 100644
--- a/tests/matmul/test_operator_matmul_v300.cpp
+++ b/tests/matmul/test_operator_matmul_v300.cpp
@@ -229,7 +229,7 @@ __aicore__ inline void kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, GM_A
 set_atomic_none();
- matmul::MatmulImpl mm;
+ Gemm::MatmulImpl mm;
 if constexpr(mmMatmul) {
 REGIST_MATMUL_OBJ(&que, GetSysWorkSpacePtr(), mm);
 mm.Init(&tiling);
@@ -309,10 +309,10 @@ TilingParams g_tilingParams = { 1, 16, 32, 32, 16, 32, 32, 16, 32, 32, 1, 1, 1,
 TEST_F(TEST_KERNEL_MATMUL, \
 Kernel_Matmul_Case##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##MM_Matmul) \
 { \
- typedef matmul::MatmulType aType; \
- typedef matmul::MatmulType bType; \
- typedef matmul::MatmulType cType; \
- typedef matmul::MatmulType biasType; \
+ typedef Gemm::MatmulType aType; \
+ typedef Gemm::MatmulType bType; \
+ typedef Gemm::MatmulType cType; \
+ typedef Gemm::MatmulType biasType; \
 TilingParams tilingParam = tilingParams; \
 const int32_t left_data_size = tilingParam.M_ * tilingParam.K_; \
 const int32_t right_data_size = tilingParam.K_ * tilingParam.N_; \