From fde91fde4e8bbd8b141b07e19e42f1f690d316d9 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 03:46:22 +0000 Subject: [PATCH 1/8] matmul api iterateNorm refactory && fix int4 size Signed-off-by: jiangchengcheng-on --- impl/matmul/matmul_impl.h | 293 +++++++------ impl/matmul/matmul_macro_def.h | 89 ++++ impl/matmul/matmul_server.h | 6 +- impl/matmul/matmul_tiling_algorithm.cpp | 10 +- lib/matmul/matmul.h | 550 ++---------------------- lib/matmul/tiling.h | 94 ++++ 6 files changed, 372 insertions(+), 670 deletions(-) create mode 100644 impl/matmul/matmul_macro_def.h diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index e61f9a91..2642d7df 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -262,8 +262,10 @@ __aicore__ inline void MatmulImpl= 220 - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && IsSameType::value)) { + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value)) { var.quantScalar_ = quantScalar; if constexpr (IsSameType::value) { var.quantMode_ = 1; @@ -303,8 +305,10 @@ __aicore__ inline void MatmulImpl& quantTensor) { #if __CCE_AICORE__ >= 220 - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && IsSameType::value)) { + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value)) { var.quantTensor_ = quantTensor; if constexpr (IsSameType::value) { var.quantMode_ = 2; @@ -1022,6 +1026,11 @@ __aicore__ inline void MatmulImpl::value) { + aMatrixByteSize = aMatrixByteSize / 2; + bMatrixByteSize = bMatrixByteSize / 2; + } if constexpr (!PhyPosIsL1(A_TYPE::pos)) { if (var.tiling_->depthA1 > DB_FACTOR) { if (var.tiling_->depthA1 < var.kIter_ * var.tiling_->stepM) { @@ -1124,7 +1133,8 @@ __aicore__ inline 
void MatmulImplInitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } #endif - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); @@ -1248,9 +1258,10 @@ __aicore__ inline void MatmulImpl(var.tiling_->shareUbSize); #if __CCE_AICORE__ == 200 - shareUbSize = 0; + shareUbSize = 0; #endif uint32_t shareLens[3] = {static_cast(var.tiling_->shareL1Size), static_cast(var.tiling_->shareL0CSize), shareUbSize}; @@ -1285,6 +1296,11 @@ __aicore__ inline void MatmulImpl::value) { + aMatrixByteSize = aMatrixByteSize / 2; + bMatrixByteSize = bMatrixByteSize / 2; + } if constexpr (!PhyPosIsL1(A_TYPE::pos)) { uint32_t cacheA1Size = var.tiling_->stepM * var.tiling_->stepKa; @@ -1345,7 +1361,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { @@ -1449,6 +1466,11 @@ __aicore__ inline void MatmulImpl::value) { + aMatrixByteSize = aMatrixByteSize / 2; + bMatrixByteSize = bMatrixByteSize / 2; + } if constexpr (A_TYPE::ibShare) { ASCENDC_ASSERT((B_TYPE::ibShare == false), { @@ -1572,7 +1594,8 @@ __aicore__ inline void MatmulImplisBias) { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); @@ -1714,7 +1737,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr 
(((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.qidFixPipe_.FreeAllEvent(); @@ -1749,7 +1773,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.qidFixPipe_.FreeAllEvent(); @@ -1796,7 +1821,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.qidFixPipe_.FreeAllEvent(); @@ -1925,7 +1951,11 @@ __aicore__ inline void MatmulImpl @@ -1944,7 +1974,11 @@ __aicore__ inline void MatmulImpl= 220 @@ -2055,7 +2089,11 @@ __aicore__ inline void MatmulImpl @@ -2074,7 +2112,11 @@ __aicore__ inline void MatmulImpl @@ -2087,7 +2129,11 @@ __aicore__ inline void MatmulImpl @@ -2100,7 +2146,11 @@ __aicore__ inline void MatmulImpl @@ -3672,7 +3722,8 @@ __aicore__ inline void MatmulImpl l1TmpForQuant; - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value)) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ % 2 == 0) { @@ -3739,7 +3790,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -3777,7 +3829,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = { QuantMode_t::DEQF16, var.quantScalar_ }; 
Fixpipe(gm, co1Local, fixpipeParams); @@ -3841,7 +3894,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -3888,7 +3942,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -3962,7 +4017,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -4010,7 +4066,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -4083,7 +4140,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -4122,7 +4180,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = { QuantMode_t::DEQF16, var.quantScalar_ }; Fixpipe(gm[dstOffset], co1Local, fixpipeParams); @@ -4160,7 +4219,8 @@ __aicore__ inline void MatmulImpl::value && (IsSameType::value || IsSameType::value || IsSameType::value)) || ((IsSameType::value || IsSameType::value) - && IsSameType::value)) { + && IsSameType::value) || + (IsSameType::value && IsSameType::value)) { if 
(var.quantMode_ == 2 || var.quantMode_ == 4 || var.quantMode_ == 6) { var.qidFixPipe_.FreeTensor(l1TmpForQuant); } @@ -4608,49 +4668,51 @@ __aicore__ inline void MatmulImpl -__aicore__ inline TBufHandle MatmulImpl::GetCacheA1Buf(bool isPong) +__aicore__ inline auto& MatmulImpl::GetCacheA1(bool isPong) { - return isPong ? var.cacheA1BufPong_ : var.cacheA1BufPing_; + return isPong ? var.cacheA1Pong_ : var.cacheA1Ping_; }; template -__aicore__ inline TBufHandle MatmulImpl::GetCacheB1Buf(bool isPong) +__aicore__ inline auto& MatmulImpl::GetCacheB1(bool isPong) { - return isPong ? var.cacheB1BufPong_ : var.cacheB1BufPing_; + return isPong ? var.cacheB1Pong_ : var.cacheB1Ping_; }; template __aicore__ inline bool MatmulImpl::GetCacheA1IsCaching(bool isPong) + const { return isPong ? var.cacheA1IsCachingPong_ : var.cacheA1IsCachingPing_; }; template __aicore__ inline bool MatmulImpl::GetCacheB1IsCaching(bool isPong) + const { return isPong ? var.cacheB1IsCachingPong_ : var.cacheB1IsCachingPing_; }; template -__aicore__ inline void MatmulImpl::SetCacheA1Buf( - bool isPong, TBufHandle buf) +__aicore__ inline void MatmulImpl::SetCacheA1( + bool isPong, const LocalTensor& a1) { if (isPong) { - var.cacheA1BufPong_ = buf; + var.cacheA1Pong_ = a1; } else { - var.cacheA1BufPing_ = buf; + var.cacheA1Ping_ = a1; } return; }; template -__aicore__ inline void MatmulImpl::SetCacheB1Buf( - bool isPong, TBufHandle buf) +__aicore__ inline void MatmulImpl::SetCacheB1( + bool isPong, const LocalTensor& b1) { if (isPong) { - var.cacheB1BufPong_ = buf; + var.cacheB1Pong_ = b1; } else { - var.cacheB1BufPing_ = buf; + var.cacheB1Ping_ = b1; } return; }; @@ -5953,7 +6015,7 @@ __aicore__ inline void MatmulImpl var.stepKaIdx_) { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; if (!var.isA1KFullLoad_ && GetCacheA1IsCaching(cachePosKa)) { - var.qidA1_.FreeBuffer(GetCacheA1Buf(cachePosKa)); + var.qidA1_.FreeTensor(GetCacheA1(cachePosKa)); SetCacheA1IsCaching(cachePosKa, false); } } @@ -5962,7 
+6024,7 @@ __aicore__ inline void MatmulImpl var.stepKbIdx_) { int cachePosKb = var.stepKbIdx_ & var.cacheB1Factor_; if (!var.isB1KFullLoad_ && GetCacheB1IsCaching(cachePosKb)) { - var.qidB1_.FreeBuffer(GetCacheB1Buf(cachePosKb)); + var.qidB1_.FreeTensor(GetCacheB1(cachePosKb)); SetCacheB1IsCaching(cachePosKb, false); } } @@ -6165,21 +6227,21 @@ __aicore__ inline void MatmulImpl var.stepKaIdx_) { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; if (!var.isA1KFullLoad_ && GetCacheA1IsCaching(cachePosKa)) { - var.qidA1_.FreeBuffer(GetCacheA1Buf(cachePosKa)); + var.qidA1_.FreeTensor(GetCacheA1(cachePosKa)); SetCacheA1IsCaching(cachePosKa, false); } } @@ -6315,7 +6377,7 @@ __aicore__ inline void MatmulImpl var.stepKbIdx_) { int cachePosKb = var.stepKbIdx_ & var.cacheB1Factor_; if (!var.isB1KFullLoad_ && GetCacheB1IsCaching(cachePosKb)) { - var.qidB1_.FreeBuffer(GetCacheB1Buf(cachePosKb)); + var.qidB1_.FreeTensor(GetCacheB1(cachePosKb)); SetCacheB1IsCaching(cachePosKb, false); } } @@ -6479,21 +6541,21 @@ __aicore__ inline void MatmulImpl var.tiling_->stepM ? var.tiling_->stepM : (var.mIter_ - var.curM_); - var.curStepN_ = (var.nIter_ - var.curN_) > var.tiling_->stepN ? var.tiling_->stepN : (var.nIter_ - var.curN_); - } else if (likely(var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_M))) { // Output along M axis - if (++var.curN_ >= var.stepNIdx_ + var.curStepN_) { - if constexpr (!PhyPosIsL1(A_TYPE::pos)) { - if (var.cacheProcA_ > 0) { - var.qidA1Cache_.FreeTensor(var.cacheHeadA1_); - var.cacheProcA_ = 0; - } - } - var.curN_ = var.stepNIdx_; - if (++var.curM_ >= var.mIter_) { - if constexpr (!PhyPosIsL1(B_TYPE::pos)) { - if (var.cacheProcB_ > 0) { - var.qidB1Cache_.FreeTensor(var.cacheHeadB1_); - var.cacheProcB_ = 0; - } - } - var.curM_ = 0; - var.stepNIdx_ += var.curStepN_; - if (var.stepNIdx_ >= var.nIter_) { - return false; - } - var.curN_ = var.stepNIdx_; - var.curStepN_ = - (var.nIter_ - var.curN_) > var.tiling_->stepN ? 
var.tiling_->stepN : (var.nIter_ - var.curN_); - } - } - } else { - ASCENDC_ASSERT((var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_N)), { - KERNEL_LOG(KERNEL_ERROR, "iterateOrder is %d , which should be ORDER_N", var.tiling_->iterateOrder); - }); - if (++var.curM_ >= var.stepMIdx_ + var.curStepM_) { - if constexpr (!PhyPosIsL1(B_TYPE::pos)) { - if (var.cacheProcB_ > 0) { - var.qidB1Cache_.FreeTensor(var.cacheHeadB1_); - var.cacheProcB_ = 0; - } - } - var.curM_ = var.stepMIdx_; - if (++var.curN_ >= var.nIter_) { - if constexpr (!PhyPosIsL1(A_TYPE::pos)) { - if (var.cacheProcA_ > 0) { - var.qidA1Cache_.FreeTensor(var.cacheHeadA1_); - var.cacheProcA_ = 0; - } - } - var.curN_ = 0; - var.stepMIdx_ += var.curStepM_; - if (var.stepMIdx_ >= var.mIter_) { - return false; - } - var.curM_ = var.stepMIdx_; - var.curStepM_ = - (var.mIter_ - var.curM_) > var.tiling_->stepM ? var.tiling_->stepM : (var.mIter_ - var.curM_); - } - } + + if (!MATMUL_MODULE(MatmulIterateController).MoveNext()) { + return false; } // Initializing variables var.baseUseM_ = (var.curM_ + 1 == var.mIter_) ? 
var.tailM_ : var.tiling_->baseM; @@ -7015,14 +7017,14 @@ __aicore__ inline bool MatmulImpl::LoadToAL1MDL(int r if (insertDeQue) { var.qidA1_.DeQue(); } - SetCacheA1Buf(cachePosA, a1.GetBufferHandle()); + SetCacheA1(cachePosA, a1); SetCacheA1IsCaching(cachePosA, true); } else { DEBUG_CODE(++a1LoadCacheCount_); - a1.SetAddr(var.qidA1_.GetBufferAddr(GetCacheA1Buf(cachePosA))); + a1 = GetCacheA1(cachePosA); } } else { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; @@ -7652,11 +7654,11 @@ MatmulImpl::LoadToAL1MDL(int r OnCopyInA1(a1, row, col, var.baseUseStepM_, var.baseUseStepKa_); var.qidA1_.EnQue(a1); var.qidA1_.DeQue(); - SetCacheA1Buf(cachePosKa, a1.GetBufferHandle()); + SetCacheA1(cachePosKa, a1); SetCacheA1IsCaching(cachePosKa, true); } else { DEBUG_CODE(++a1LoadCacheCount_); - a1.SetAddr(var.qidA1_.GetBufferAddr(GetCacheA1Buf(cachePosKa))); + a1 = GetCacheA1(cachePosKa); } } return a1; @@ -7871,14 +7873,14 @@ MatmulImpl::LoadToBL1MDL(int r if (insertDeQue) { var.qidB1_.DeQue(); } - SetCacheB1Buf(cachePosB, b1.GetBufferHandle()); + SetCacheB1(cachePosB, b1); SetCacheB1IsCaching(cachePosB, true); } else { #if __CCE_AICORE__ == 200 SetTransposeB(true); #endif DEBUG_CODE(++b1LoadCacheCount_); - b1.SetAddr(var.qidB1_.GetBufferAddr(GetCacheB1Buf(cachePosB))); + b1 = GetCacheB1(cachePosB); } } else { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; @@ -7891,14 +7893,14 @@ MatmulImpl::LoadToBL1MDL(int r if (insertDeQue) { var.qidB1_.DeQue(); } - SetCacheB1Buf(cachePosKb, b1.GetBufferHandle()); + SetCacheB1(cachePosKb, b1); SetCacheB1IsCaching(cachePosKb, true); } else { #if __CCE_AICORE__ == 200 SetTransposeB(true); #endif DEBUG_CODE(++b1LoadCacheCount_); - b1.SetAddr(var.qidB1_.GetBufferAddr(GetCacheB1Buf(cachePosKb))); + b1 = GetCacheB1(cachePosKb); } } return b1; @@ -8158,11 +8160,11 @@ __aicore__ inline void MatmulImpl l1TmpForQuant; - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + 
IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value)) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ % 2 == 0) { @@ -10005,7 +10008,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -10045,7 +10049,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = {QuantMode_t::DEQF16, var.quantScalar_}; Fixpipe(gm[dstOffset], co1Local, fixpipeParams); @@ -10090,7 +10095,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -10131,7 +10137,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = {QuantMode_t::DEQF16, var.quantScalar_}; Fixpipe(gm[dstOffset], co1Local, fixpipeParams); diff --git a/impl/matmul/matmul_macro_def.h b/impl/matmul/matmul_macro_def.h new file mode 100644 index 00000000..bf86e3c4 --- /dev/null +++ b/impl/matmul/matmul_macro_def.h @@ -0,0 +1,89 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. 
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_macro_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MATMUL_MACRO_DEF_H +#define IMPL_MATMUL_MATMUL_MACRO_DEF_H +#include "matmul_utils.h" +#include "matmul_macro_v220_impl.h" +#include "matmul_macro_v220_basic_impl.h" +#include "matmul_macro_v200_impl.h" +#include "modules/matmul_param.h" + +namespace matmul { + +/* ************************************************************************************************** + * MatmulMacroImpl * + * ************************************************************************************************* */ +template +struct MatmulMacroImpl { + __aicore__ inline MatmulMacroImpl() {}; +}; + +#if __CCE_AICORE__ >= 220 +// CFG_NORM +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); + using PARAMS = MacroMatmul; +}; +// CFG_MDL +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); + using PARAMS = MacroMatmul; +}; +// CFG_IBSHARE_NORM +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 
2 : 0); + using PARAMS = MacroMatmul; +}; +#elif __CCE_AICORE__ == 200 +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulV200; +}; +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulV200; +}; +#endif + +// MM_CFG_BB +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulBasic; +}; + +} +#endif // _MATMUL_MACRO_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/matmul_server.h b/impl/matmul/matmul_server.h index caf8a3de..327cc390 100644 --- a/impl/matmul/matmul_server.h +++ b/impl/matmul/matmul_server.h @@ -601,7 +601,8 @@ public: } else { ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); } - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value) || (IsSameType::value && (IsSameType::value || @@ -706,7 +707,8 @@ public: if constexpr (A_TYPE::layout != LayoutMode::NONE) { return true; } - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value) || (IsSameType::value && (IsSameType::value || diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index 61ad6fc3..c9871710 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -1450,6 +1450,7 @@ void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const biasLength = max(quantLength, biasLength); } } + transLength = max(max(a1Length, b1Length), max(c1Length, biasLength)); } @@ -2278,7 +2279,12 @@ int64_t 
MatmulTilingAlgorithm::Process() tilingIns_->tiling_.set_baseM(singleCoreStatus.l0Status.mL0 * C0_SIZE); tilingIns_->tiling_.set_baseN(singleCoreStatus.l0Status.nL0 * C0_SIZE); const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BYTE_TAB.at(tilingIns_->aType_.dataType); - tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); + // int4 baseK should be 64 align + if ((tilingIns_->aType_.dataType == DataType::DT_INT4) && (singleCoreStatus.l0Status.kL0 % NUM_TWO != 0)) { + tilingIns_->tiling_.set_baseK((singleCoreStatus.l0Status.kL0 + 1) * reduceSize); + } else { + tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); + } tilingIns_->baseM = tilingIns_->tiling_.get_baseM(); tilingIns_->baseN = tilingIns_->tiling_.get_baseN(); tilingIns_->baseK = tilingIns_->tiling_.get_baseK(); @@ -2343,4 +2349,4 @@ int64_t MatmulTilingAlgorithm::Process() const bool ans = CheckFinaleParams(coreStatus); return ans ? 0 : -1; } -} // namespace matmul_tiling +} // namespace matmul_tiling \ No newline at end of file diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 9377f7b7..20703311 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -17,26 +17,17 @@ #include #include "lib/matmul/tiling.h" -#include "../../impl/matmul/matmul_macro_v220_impl.h" -#include "../../impl//matmul/matmul_macro_v220_basic_impl.h" -#include "../../impl//matmul/matmul_macro_v200_impl.h" #include "../../impl/matmul/matmul_utils.h" #include "../../impl/matmul/matmul_call_back.h" +#include "../../impl/matmul/modules/matmul_module.h" +#include "../../impl/matmul/modules/matmul_param.h" +#include "../../impl/matmul/modules/iterator/matmul_iterate_controller.h" +#include "../../impl/matmul/modules/feature_trait/matmul_feature_trait.h" +#include "../../impl/matmul/matmul_macro_def.h" namespace matmul { using namespace AscendC; -template -struct MatmulType { - constexpr static TPosition pos = POSITION; - constexpr static CubeFormat format = FORMAT; - using T = TYPE; 
- constexpr static bool isTrans = ISTRANS; - constexpr static LayoutMode layout = LAYOUT; - constexpr static bool ibShare = IBSHARE; -}; - template struct MatmulApiConfig { using AType = A_TYPE; @@ -46,511 +37,18 @@ struct MatmulApiConfig { constexpr static MatmulConfig Config = MM_CFG; }; -/* ************************************************************************************************** - * MatmulParamsBase * - * ************************************************************************************************* */ -template -struct MatmulParamsBase { - __aicore__ inline MatmulParamsBase() {}; -}; - -template -struct MatmulParamsNorm : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulParamsNorm() {}; - using SrcT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - TQue qidBias_; - typename L0cType::BUFFER CO1_; -#if __CCE_AICORE__ < 220 - TQue qidA2_; - TQue qidB2_; - TQue qidVecIn_; - TQue qidCO2_; - - typename QidType::QUE qidA1_; - typename QidType::QUE qidB1_; - typename QidType::QUE qidA1Cache_; - typename QidType::QUE qidB1Cache_; -#else - TQue qidA1_; - TQue qidB1_; - TQue qidA1Cache_; - TQue qidB1Cache_; -#endif - - LocalTensor cMatrix_; - - LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ - LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ - LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ - - SrcT aScalar_; - SrcT bScalar_; - DEBUG_CODE(int calCount_ = 0); - - TBuffAddr leftMatrix_; - TBuffAddr rightMatrix_; - TBuffAddr inputBias_; - - __gm__ SrcT* aGlobal_; - __gm__ SrcBT* bGlobal_; - __gm__ BiasT* biasGlobal_; - - TPipe* tpipe_; - const TCubeTiling* __restrict tiling_; - __gm__ uint8_t* cacheWorkspaceAddr; - -#if __CCE_AICORE__ < 220 - __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; - LocalTensor localWorkspace; - int nd2nz0ffset = 0; - int transOffset = 0; - int 
co2Offset = 0; -#endif - - int singleCoreM_; - int singleCoreN_; - int singleCoreK_; - // iterate nums in mnk axis - int mIter_; - int nIter_; - int kIter_; - - // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases - // measured in element - int baseUseM_; - int baseUseK_; - int baseUseN_; - // measured in cube block - int blockUseM_; - int blockUseK_; - int blockUseN_; - - int32_t cacheProcA_, cacheProcB_; - bool isFirstIter_; - bool isTransposeA_; // whether A matrix need to transpose - bool isTransposeB_; // whether B matrix need to transpose - // whether enbale bias, default value is false - bool enableBias_; - - int tailM_, tailK_, tailN_; - // current c matrix coordinate - int curM_, curN_; - // current c matrix step size, there could be tail steps - int curStepM_, curStepN_; - // current c matrix step block coordinate - int stepMIdx_, stepNIdx_; - - bool enHF32Mode_; - int32_t hf32TransMode_; - uint8_t subBlockIdx_; - - int baseMK_; - int baseKN_; - int baseMN_; - - int cacheA1Size_, cacheB1Size_; - int depthA1_, depthB1_; -#if __CCE_AICORE__ >= 220 - int sMadMStep_ = 0; - int sMadNStep_ = 0; -#endif - uint64_t dataPtr_; - uint64_t tilingPtr_; -}; - -template -struct MatmulParamsNormQuant : public MatmulParamsNorm { - __aicore__ inline MatmulParamsNormQuant() {}; - TQue qidFixPipe_; - uint64_t quantScalar_ = 0; - GlobalTensor quantTensor_; - // 0: no quant, 1: deqf16, 2: vdeqf16, 3: QF322B8_PRE, 4: VQF322B8_PRE, 5: REQ8(s32->u8/s8), 6: VREQ8(s32->u8/s8) - uint8_t quantMode_ = 0; -}; - -template -struct MatmulParamsMDL : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulParamsMDL() {}; - using SrcT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - - TQue qidBias_; - TQue qidFixPipe_; - typename L0cType::BUFFER CO1_; - TQue qidA1_; - TQue qidB1_; -#if __CCE_AICORE__ < 220 - 
TQue qidA2_; - TQue qidB2_; - TQue qidVecIn_; - TQue qidCO2_; - - typename QidType::QUE qidA12UBCache_; - typename QidType::QUE qidB12UBCache_; -#endif - - LocalTensor cMatrix_; - - TBufHandle cacheA1BufPing_; - TBufHandle cacheA1BufPong_; - TBufHandle cacheB1BufPing_; - TBufHandle cacheB1BufPong_; - bool cacheA1IsCachingPing_; - bool cacheA1IsCachingPong_; - bool cacheB1IsCachingPing_; - bool cacheB1IsCachingPong_; - - DEBUG_CODE(int calCount_ = 0); - - TBuffAddr leftMatrix_; - TBuffAddr rightMatrix_; - TBuffAddr inputBias_; - - __gm__ SrcT* aGlobal_; - __gm__ SrcBT* bGlobal_; - __gm__ BiasT* biasGlobal_; - - TPipe* tpipe_; - const TCubeTiling* __restrict tiling_; - __gm__ uint8_t* cacheWorkspaceAddr; - -#if __CCE_AICORE__ < 220 - __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; - LocalTensor localWorkspace; - LocalTensor cacheHeadA12UB_; // Allocate and release using qidA12UBCache_ - LocalTensor cacheHeadB12UB_; // Allocate and release using qidB12UBCache_ - int nd2nz0ffset = 0; - int transOffset = 0; - int co2Offset = 0; - int32_t cacheA12UBProcA_ = 0; - int32_t cacheB12UBProcB_ = 0; -#endif - - int singleCoreM_; - int singleCoreN_; - int singleCoreK_; - // iterate nums in mnk axis - int mIter_; - int nIter_; - int kIter_; - // iterate nums in mn step axis - int mStepIter_; - int nStepIter_; - int kaStepIter_; - int kbStepIter_; - int kStepIter_; - int minStepK_; - int kaStepFactor_; - int kbStepFactor_; - - // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases - // in unit of element - int baseUseM_; - int baseUseK_; - int baseUseN_; - // in unit of cube block - int blockUseM_; - int blockUseK_; - int blockUseN_; - - // in unit of element - int baseUseStepM_; - int baseUseStepN_; - int baseUseStepKa_; - int baseUseStepKb_; - // in unit of cube block - int blockUseStepM_; - int blockUseStepN_; - int blockUseStepKa_; - int blockUseStepKb_; - - bool isFirstIter_; - bool isTransposeA_; // whether A matrix 
need to transpose - bool isTransposeB_; // whether B matrix need to transpose - // whether enbale bias, default value is false - bool enableBias_; - - // in unit of element - int tailM_, tailK_, tailN_; - // in unit of element - int tailStepM_, tailStepN_, tailStepKa_, tailStepKb_; - // current c matrix coordinate, in unit of baseMN - int curM_, curN_; - // current c matrix step size, in unit of baseMNK , there could be tail steps - int curStepM_, curStepN_; - // current c matrix step block coordinate, in unit of stepMNK - int stepMIdx_, stepNIdx_, stepKaIdx_, stepKbIdx_; - - // stepKa == kIter - bool isA1KFullLoad_, isB1KFullLoad_; - - bool enHF32Mode_; - int32_t hf32TransMode_; - uint8_t subBlockIdx_; - - int baseMK_; - int baseKN_; - int baseMN_; - int cacheA1Factor_, cacheB1Factor_; - uint64_t quantScalar_ = 0; - uint64_t dataPtr_; - uint64_t tilingPtr_; - GlobalTensor quantTensor_; - // 0: no quant, 1: deqf16, 2: vdeqf16; - uint8_t quantMode_ = 0; - // anti quant param. - SrcT antiQuantOffsetScalar_; - SrcT antiQuantScaleScalar_; - LocalTensor antiQuantOffsetTensor_; - LocalTensor antiQuantScaleTensor_; -}; - -template -struct MatmulParamsBasicBlock : public MatmulParamsNorm { - __aicore__ inline MatmulParamsBasicBlock() {}; -}; - -template -struct MatmulParamsIBShareNorm : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulParamsIBShareNorm() {}; - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - TQue qidBias_; - typename L0cType::BUFFER CO1_; - - TQue qidA2_; - TQue qidB2_; - TQue qidVecIn_; - TQue qidCO2_; - - typename QidType::QUE qidA1_; - typename QidType::QUE qidA1Cache_; - typename QidType::QUE qidB1_; - typename QidType::QUE qidB1Cache_; - - LocalTensor cMatrix_; - - LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ - LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ - LocalTensor cacheHeadBias_; // Allocate and 
release using qidBias_ - - SrcT aScalar_; - SrcT bScalar_; - DEBUG_CODE(int calCount_ = 0); - - TBuffAddr leftMatrix_; - TBuffAddr rightMatrix_; - TBuffAddr inputBias_; - - __gm__ SrcT* aGlobal_; - __gm__ SrcT* bGlobal_; - __gm__ BiasT* biasGlobal_; - - TPipe* tpipe_; - const TCubeTiling* __restrict tiling_; - __gm__ uint8_t* cacheWorkspaceAddr; - - int singleCoreM_; - int singleCoreN_; - int singleCoreK_; - // iterate nums in mnk axis - int mIter_; - int nIter_; - int kIter_; - - // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases - // measured in element - int baseUseM_; - int baseUseK_; - int baseUseN_; - // measured in cube block - int blockUseM_; - int blockUseK_; - int blockUseN_; - - int32_t cacheProcA_, cacheProcB_; - bool isFirstIter_; - bool isTransposeA_; // whether A matrix need to transpose - bool isTransposeB_; // whether B matrix need to transpose - // whether enbale bias, default value is false - bool enableBias_; - - int tailM_, tailK_, tailN_; - // current c matrix coordinate - int curM_, curN_; - // current c matrix step size, there could be tail steps - int curStepM_, curStepN_; - // current c matrix step block coordinate - int stepMIdx_, stepNIdx_; - - bool enHF32Mode_; - int32_t hf32TransMode_; - uint8_t subBlockIdx_; - - int baseMK_; - int baseKN_; - int baseMN_; - - int cacheA1Size_, cacheB1Size_; - int depthA1_, depthB1_; - uint64_t dataPtr_; - uint64_t tilingPtr_; - - int curCacheIdx_; - GlobalCache gL1GroupCache0_; - GlobalCache gL1GroupCache1_; -}; - -/* ************************************************************************************************** - * MatmulParams * - * ************************************************************************************************* */ -template -struct MatmulParams { - __aicore__ inline MatmulParams(){}; -}; - -// CFG_NORM -#if __CCE_AICORE__ >= 220 -template -struct MatmulParams::value && IsSameType::value) || - (IsSameType::value && - 
(IsSameType::value || - IsSameType::value)))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNorm; -}; -#else -template -struct MatmulParams::value && IsSameType::value) || - (IsSameType::value && IsSameType::value))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNorm; -}; -#endif - -#if __CCE_AICORE__ >= 220 -template -struct MatmulParams::value && - IsSameType::value) || - (IsSameType::value && - (IsSameType::value || - IsSameType::value)))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNormQuant; -}; -#else -template -struct MatmulParams::value && IsSameType::value) || - (IsSameType::value && IsSameType::value))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNormQuant; -}; -#endif - -// CFG_MDL -template -struct MatmulParams { - __aicore__ inline MatmulParams() {}; - using PARAMS = MatmulParamsMDL; -}; - -// MM_CFG_BB -template -struct MatmulParams { - __aicore__ inline MatmulParams() {}; - using PARAMS = MatmulParamsBasicBlock; -}; - -// CFG_IBSHARE_NORM -template -struct MatmulParams { - __aicore__ inline MatmulParams() {}; - using PARAMS = MatmulParamsIBShareNorm; -}; - -/* ************************************************************************************************** - * MatmulMacroImpl * - * ************************************************************************************************* */ -template -struct MatmulMacroImpl { - __aicore__ inline MatmulMacroImpl() {}; -}; - -#if __CCE_AICORE__ >= 220 -// CFG_NORM -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 
2 : 0); - using PARAMS = MacroMatmul; -}; -// CFG_MDL -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); - using PARAMS = MacroMatmul; -}; -// CFG_IBSHARE_NORM -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); - using PARAMS = MacroMatmul; -}; -#elif __CCE_AICORE__ == 200 -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulV200; -}; -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulV200; -}; -#endif - -// MM_CFG_BB -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulBasic; -}; - template > -class MatmulImpl { +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulIterateController, A_TYPE, B_TYPE, MatmulFeatureTrait::iterCtrlCfg) +, MATMUL_IMPORT_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE) +{ +public: + using AType = A_TYPE; + using BType = B_TYPE; + using CType = C_TYPE; + using BiasType = BIAS_TYPE; +private: using L0cT = typename GetDstType::Type; using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; @@ -697,6 +195,12 @@ public: uint32_t b1LoadCacheCount_ = 0; #endif +public: + using ENTITY = MatmulImpl; + + MATMUL_USE_IMPORTED_MODULE(MatmulIterateController, A_TYPE, B_TYPE, MatmulFeatureTrait::iterCtrlCfg); + MATMUL_USE_IMPORTED_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE); + private: template friend __aicore__ inline void SetTPipe( @@ -833,12 +337,12 @@ private: const 
int row, const int col, const int height, const int width, const int gCol); // do ping when isPong = flase, do pong when isPong = true - __aicore__ inline TBufHandle GetCacheA1Buf(bool isPong); - __aicore__ inline TBufHandle GetCacheB1Buf(bool isPong); - __aicore__ inline bool GetCacheA1IsCaching(bool isPong); - __aicore__ inline bool GetCacheB1IsCaching(bool isPong); - __aicore__ inline void SetCacheA1Buf(bool isPong, TBufHandle buf); - __aicore__ inline void SetCacheB1Buf(bool isPong, TBufHandle buf); + __aicore__ inline auto& GetCacheA1(bool isPong); + __aicore__ inline auto& GetCacheB1(bool isPong); + __aicore__ inline bool GetCacheA1IsCaching(bool isPong) const; + __aicore__ inline bool GetCacheB1IsCaching(bool isPong) const; + __aicore__ inline void SetCacheA1(bool isPong, const LocalTensor& a1); + __aicore__ inline void SetCacheB1(bool isPong, const LocalTensor& b1); __aicore__ inline void SetCacheA1IsCaching(bool isPong, bool isCaching); __aicore__ inline void SetCacheB1IsCaching(bool isPong, bool isCaching); diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h index 609f1753..7e42d91f 100644 --- a/lib/matmul/tiling.h +++ b/lib/matmul/tiling.h @@ -128,6 +128,48 @@ struct MatmulConfig { bool enableDoubleCache; }; +enum class MatmulConfigMode { + CONFIG_NORM, + CONFIG_MDL, + CONFIG_SPECIALMDL, + CONFIG_IBSHARE +}; + +struct MatmulShapeParams { + uint32_t singleCoreM; + uint32_t singleCoreN; + uint32_t singleCoreK; + uint32_t basicM; + uint32_t basicN; + uint32_t basicK; +}; + +struct MatmulQuantParams { + bool isPerTensor; + bool hasAntiQuantOffset; +}; + +struct MatmulBatchParams { + bool isNBatch; + BatchMode batchMode; +}; + +struct MatmulFuncParams { + bool intrinsicsCheck; + bool enVecND2NZ; + uint32_t doMTE2Preload; + bool enableQuantVector = true; + bool enableSetDefineData = true; + uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT; + bool enableReuse = true; + bool enableUBReuse; + bool enableL1CacheUB; + bool intraBlockPartSum = false; + 
IterateOrder iterateOrder; + ScheduleType scheduleType; + bool enableDoubleCache; +}; + __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF, @@ -424,6 +466,58 @@ constexpr MatmulConfig CFG_MDL = GetMDLConfig(); constexpr MatmulConfig MM_CFG_BB = GetBasicConfig(128, 128, 128); constexpr MatmulConfig CFG_IBSHARE_NORM = GetIBShareNormConfig(); +template +__aicore__ inline constexpr MatmulConfig GetMMConfig(ArgTypes&&... args) { + MatmulConfig mmConfig = CFG_NORM; + if constexpr (configMode == MatmulConfigMode::CONFIG_MDL) { + mmConfig = CFG_MDL; + } else if constexpr (configMode == MatmulConfigMode::CONFIG_SPECIALMDL) { + mmConfig = GetSpecialMDLConfig(); + } else if constexpr (configMode == MatmulConfigMode::CONFIG_IBSHARE) { + mmConfig = CFG_IBSHARE_NORM; + } + GetMMConfigImpl(mmConfig, args...); + return mmConfig; +} + +template +__aicore__ inline constexpr void GetMMConfigImpl(MatmulConfig& cfg, T arg, ArgTypes&&... 
args) { + GetMMConfigImpl(cfg, arg); + GetMMConfigImpl(cfg, args...); +} + +template +__aicore__ inline constexpr void GetMMConfigImpl(MatmulConfig& cfg, ArgType arg) { + if constexpr (AscendC::IsSameType::value) { + cfg.singleCoreM = arg.singleCoreM; + cfg.singleCoreN = arg.singleCoreN; + cfg.singleCoreK = arg.singleCoreK; + cfg.basicM = arg.basicM; + cfg.basicN = arg.basicN; + cfg.basicK = arg.basicK; + } else if constexpr (AscendC::IsSameType::value) { + cfg.isPerTensor = arg.isPerTensor; + cfg.hasAntiQuantOffset = arg.hasAntiQuantOffset; + } else if constexpr (AscendC::IsSameType::value) { + cfg.isNBatch = arg.isNBatch; + cfg.batchMode = arg.batchMode; + } else if constexpr (AscendC::IsSameType::value) { + cfg.intrinsicsCheck = arg.intrinsicsCheck; + cfg.enVecND2NZ = arg.enVecND2NZ; + cfg.doMTE2Preload = arg.doMTE2Preload; + cfg.enableQuantVector = arg.enableQuantVector; + cfg.enableSetDefineData = arg.enableSetDefineData; + cfg.iterateMode = arg.iterateMode; + cfg.enableReuse = arg.enableReuse; + cfg.enableUBReuse = arg.enableUBReuse; + cfg.enableL1CacheUB = arg.enableL1CacheUB; + cfg.intraBlockPartSum = arg.intraBlockPartSum; + cfg.iterateOrder = arg.iterateOrder; + cfg.scheduleType = arg.scheduleType; + cfg.enableDoubleCache = arg.enableDoubleCache; + } +} + struct MatrixOffset { int32_t offset; int32_t row, col; -- Gitee From eb2fbd0b807b706500c81fe25ee61aaf2f176047 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 04:16:58 +0000 Subject: [PATCH 2/8] add llt case adapt code hub Signed-off-by: jiangchengcheng-on --- tests/CMakeLists.txt | 2 + tests/main_global.cpp | 2 +- tests/matmul/test_matmul_config.cpp | 61 + tests/matmul/test_matmul_input_l1_cache.cpp | 73 + .../matmul/test_matmul_iterate_controller.cpp | 220 ++ tests/matmul/test_matmul_l0c_buffer.cpp | 106 + tests/tiling/test_tiling.cpp | 2086 ++++++++++++++++- 7 files changed, 2465 insertions(+), 85 deletions(-) create mode 100644 tests/matmul/test_matmul_config.cpp create mode 
100644 tests/matmul/test_matmul_input_l1_cache.cpp create mode 100644 tests/matmul/test_matmul_iterate_controller.cpp create mode 100644 tests/matmul/test_matmul_l0c_buffer.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6387a70c..57d343e0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -49,6 +49,7 @@ file(GLOB ASCENDC_TEST_ascend310p_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_quant.cpp ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_quant_per_channel.cpp # ${ASCENDC_TESTS_DIR}/quantization/dequant/test_operator_dequant_v200.cpp + ${ASCENDC_TESTS_DIR}/matmul/test_matmul_config.cpp ) # ascend910B1 aiv test cases @@ -81,6 +82,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES # ascend910B1 aic test cases file(GLOB ASCENDC_TEST_ascend910B1_AIC_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/matmul/test_operator_matmul_v220.cpp + ${ASCENDC_TESTS_DIR}/matmul/test_matmul_config.cpp ) # ascend310B1 test cases diff --git a/tests/main_global.cpp b/tests/main_global.cpp index 167def52..16b1b816 100644 --- a/tests/main_global.cpp +++ b/tests/main_global.cpp @@ -33,6 +33,6 @@ __aicore__ AscendC::TPipe* GetTPipePtr() #else return g_tPipePtr; #endif - ASSERT(false && "Only supported ascend910B1, ascend910, ascend310P"); + ASSERT(false && "Only supported ascend910B1, ascend910, ascend310p"); return nullptr; } \ No newline at end of file diff --git a/tests/matmul/test_matmul_config.cpp b/tests/matmul/test_matmul_config.cpp new file mode 100644 index 00000000..33dce3b3 --- /dev/null +++ b/tests/matmul/test_matmul_config.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
+ * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/input_cache/matmul_input_l1_cache.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +class TestMatmulConfig : public testing::Test { +protected: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() {} + void TearDown() {} +}; + +TEST_F(TestMatmulConfig, TestParamsConfig) +{ + constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; + constexpr static MatmulShapeParams shapeParams{128, 128, 128, 64, 64, 64}; + constexpr static MatmulQuantParams quantParams{1, 1}; + constexpr static MatmulBatchParams batchParams{1, BatchMode::BATCH_LARGE_THAN_L1}; + constexpr static MatmulFuncParams funcParams{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + IterateOrder::ORDER_N, ScheduleType::OUTER_PRODUCT, 1}; + constexpr MatmulConfig mmConfig = GetMMConfig(shapeParams, quantParams, batchParams, funcParams); + + EXPECT_EQ((uint32_t)mmConfig.doNorm, 1); + EXPECT_EQ((uint32_t)mmConfig.singleCoreM, 128); + EXPECT_EQ((uint32_t)mmConfig.singleCoreN, 128); + EXPECT_EQ((uint32_t)mmConfig.singleCoreK, 128); + EXPECT_EQ((uint32_t)mmConfig.basicM, 64); + EXPECT_EQ((uint32_t)mmConfig.basicN, 64); + EXPECT_EQ((uint32_t)mmConfig.basicK, 64); + EXPECT_EQ((uint32_t)mmConfig.isPerTensor, 1); + EXPECT_EQ((uint32_t)mmConfig.hasAntiQuantOffset, 1); + EXPECT_EQ((uint32_t)mmConfig.isNBatch, 1); + EXPECT_EQ((uint32_t)mmConfig.batchMode, 2); + EXPECT_EQ((uint32_t)mmConfig.intrinsicsCheck, 1); + EXPECT_EQ((uint32_t)mmConfig.enVecND2NZ, 1); + EXPECT_EQ((uint32_t)mmConfig.doMTE2Preload, 1); + EXPECT_EQ((uint32_t)mmConfig.enableQuantVector, 1); + EXPECT_EQ((uint32_t)mmConfig.enableSetDefineData, 1); + EXPECT_EQ((uint32_t)mmConfig.iterateMode, 1); + 
EXPECT_EQ((uint32_t)mmConfig.enableReuse, 1); + EXPECT_EQ((uint32_t)mmConfig.enableUBReuse, 1); + EXPECT_EQ((uint32_t)mmConfig.enableL1CacheUB, 1); + EXPECT_EQ((uint32_t)mmConfig.intraBlockPartSum, 1); + EXPECT_EQ((uint32_t)mmConfig.iterateOrder, 1); + EXPECT_EQ((uint32_t)mmConfig.scheduleType, 1); + EXPECT_EQ((uint32_t)mmConfig.enableDoubleCache, 1); +} \ No newline at end of file diff --git a/tests/matmul/test_matmul_input_l1_cache.cpp b/tests/matmul/test_matmul_input_l1_cache.cpp new file mode 100644 index 00000000..5f406f60 --- /dev/null +++ b/tests/matmul/test_matmul_input_l1_cache.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/input_cache/matmul_input_l1_cache.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +namespace { + +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE) { +public: + using VAR_PARAMS = + typename MatmulParams::PARAMS; + + MatmulImpl() { + InitVar(); + } + + VAR_PARAMS& GetVar() { + return var; + } + + void AllocATensor() { + var.cacheHeadA1_ = var.qidA1Cache_.template AllocTensor(); + } + + void AllocBTensor() { + var.cacheHeadB1_ = var.qidB1Cache_.template AllocTensor(); + } + + void InitVar() { + var.tiling_ = &tiling; + var.tpipe_ = &pipe; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +class test_matmul_input_l1_cache : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + using A_TYPE = matmul::MatmulType; + using B_TYPE = matmul::MatmulType; + using C_TYPE = matmul::MatmulType; + using BIAS_TYPE = matmul::MatmulType; + + MatmulImpl mm; +}; + +TEST_F(test_matmul_input_l1_cache, first_iter_order_M) { + 
mm.ClearAL1Cache(); + mm.ClearBL1Cache(); +} \ No newline at end of file diff --git a/tests/matmul/test_matmul_iterate_controller.cpp b/tests/matmul/test_matmul_iterate_controller.cpp new file mode 100644 index 00000000..0abf29d8 --- /dev/null +++ b/tests/matmul/test_matmul_iterate_controller.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/feature_trait/matmul_feature_trait.h" +#include "impl/matmul/modules/iterator/matmul_iterate_controller.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +using A_TYPE = matmul::MatmulType; +using B_TYPE = matmul::MatmulType; +using C_TYPE = matmul::MatmulType; +using BIAS_TYPE = matmul::MatmulType; + +template +class MatmulInputL1Cache { +public: + void ClearAL1Cache() { + clearedACount++; + } + + void ClearBL1Cache() { + clearedBCount++; + } + +public: + uint32_t clearedACount {0}; + uint32_t clearedBCount {0}; +}; + +namespace { +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulIterateController, A_TYPE, B_TYPE, MatmulFeatureTrait::iterCtrlCfg)//MatmulFeatureTrait::iterCtrlCfg) +, MATMUL_IMPORT_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE) { + using VAR_PARAMS = + typename MatmulParams::PARAMS; + +public: + MatmulImpl() { + InitVar(); + } + + void SetTiling(IterateOrder order, int32_t stepM, uint32_t stepN) { + tiling.iterateOrder = static_cast(order); + tiling.stepM = stepM; + tiling.stepN = stepN; + + this->Reset(); + } + + void SetMParams(int32_t curPos, int32_t iter, int32_t stepIdx, int32_t curStep) { + var.curM_ = curPos; + var.mIter_ = iter; + var.stepMIdx_ = stepIdx; + var.curStepM_ = curStep; + } + + void SetNParams(int32_t curPos, int32_t iter, int32_t stepIdx, int32_t curStep) { + var.curN_ = curPos; + var.nIter_ = iter; + 
var.stepNIdx_ = stepIdx; + var.curStepN_ = curStep; + } + + VAR_PARAMS& GetVar() { + return var; + } + + void AllocATensor() { + var.cacheHeadA1_ = var.qidA1Cache_.template AllocTensor(); + } + + void AllocBTensor() { + var.cacheHeadB1_ = var.qidB1Cache_.template AllocTensor(); + } + + void InitVar() { + var.tiling_ = &tiling; + var.tpipe_ = &pipe; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +class test_matmul_iterator_controller : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + MatmulImpl mm; +}; + +TEST_F(test_matmul_iterator_controller, first_iter_order_M) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + + bool isFinished = mm.MoveNext(); + + ASSERT_TRUE(isFinished); + + ASSERT_EQ(mm.GetVar().curStepM_, 4); + ASSERT_EQ(mm.GetVar().curM_, 0); + ASSERT_EQ(mm.GetVar().curN_, 0); +} + +TEST_F(test_matmul_iterator_controller, first_iter_order_N) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + + bool isFinished = mm.MoveNext(); + + ASSERT_TRUE(isFinished); + + ASSERT_EQ(mm.GetVar().curStepN_, 2); + ASSERT_EQ(mm.GetVar().curN_, 0); +} + +TEST_F(test_matmul_iterator_controller, order_M_iter_four_times) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + int32_t cnt = 0; + while(mm.MoveNext()) { + cnt++; + } + + ASSERT_EQ(cnt, 8); +} + +TEST_F(test_matmul_iterator_controller, order_N_iter_four_times) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + int32_t cnt = 0; + while(mm.MoveNext()) { + cnt++; + } + + ASSERT_EQ(cnt, 8); +} + + +TEST_F(test_matmul_iterator_controller, order_M_iter_twice) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 1, 0, 0); + mm.SetNParams(0, 2, 0, 0); + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 0); + 
isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 1); + ASSERT_TRUE(isFinished); + isFinished = mm.MoveNext(); + ASSERT_FALSE(isFinished); + ASSERT_EQ(mm.GetVar().curM_, 0); +} + +TEST_F(test_matmul_iterator_controller, order_N_iter_twice) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 2, 0, 0); + mm.SetNParams(0, 1, 0, 0); + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 0); + isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 1); + ASSERT_TRUE(isFinished); + isFinished = mm.MoveNext(); + ASSERT_FALSE(isFinished); + ASSERT_EQ(mm.GetVar().curN_, 0); +} + +// test when n-dimension is finished in OrderM case +TEST_F(test_matmul_iterator_controller, order_M_n_is_finished) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 2, 0, 0); + mm.SetNParams(0, 2, 0, 0); + // first iter + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 0); + // n-dimension is finished + isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 1); + ASSERT_TRUE(isFinished); + (void)mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 0); + ASSERT_EQ(mm.GetVar().curM_, 1); +} + +// test when m-dimension is finished in OrderN case +TEST_F(test_matmul_iterator_controller, order_N_m_is_finished) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 2, 0, 0); + mm.SetNParams(0, 2, 0, 0); + // first iter + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 0); + // n-dimension is finished + isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 1); + ASSERT_TRUE(isFinished); + (void)mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 0); + ASSERT_EQ(mm.GetVar().curN_, 1); +} \ No newline at end of file diff --git a/tests/matmul/test_matmul_l0c_buffer.cpp b/tests/matmul/test_matmul_l0c_buffer.cpp new file mode 100644 index 00000000..67e8c242 --- /dev/null +++ b/tests/matmul/test_matmul_l0c_buffer.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
+ * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +namespace { + +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulL0CBuffer, EnUnitFlag(MM_CFG)) { + using VAR_PARAMS = + typename MatmulParams::PARAMS; + +public: + MatmulImpl() { + InitVar(); + } + + VAR_PARAMS& GetVar() { + return var; + } + + void SetParamKIter(int k) { + var.kIter_ = k; + } + + void InitVar() { + var.tiling_ = &tiling; + var.tpipe_ = &pipe; + var.baseMN_ = 1024; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +class test_matmul_l0c_buffer : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + using L0cT = float; + + using A_TYPE = matmul::MatmulType; + using B_TYPE = matmul::MatmulType; + using C_TYPE = matmul::MatmulType; + using BIAS_TYPE = matmul::MatmulType; + + MatmulImpl enUnitFlagMM; + MatmulImpl disUnitFlagMM; +}; + +TEST_F(test_matmul_l0c_buffer, enable_unit_flag) { + + enUnitFlagMM.InitL0CBuffer(); + auto co1Local = enUnitFlagMM.LoadL0CTensor(); + auto co1Local1 = enUnitFlagMM.AllocL0CLocalTensor(); + enUnitFlagMM.FreeL0CLocalTensor(co1Local1); + enUnitFlagMM.ResetL0CEventStatus(); + + enUnitFlagMM.SetParamKIter(1); + ASSERT_TRUE(enUnitFlagMM.IsL0CLastIter(0)); + + ASSERT_EQ(enUnitFlagMM.GetMmadUnitFlagCtrl(true), 3); + ASSERT_EQ(enUnitFlagMM.GetMmadUnitFlagCtrl(false), 2); + + FixpipeParams fixpipeParams; + enUnitFlagMM.SetFixpipeUnitFlag(fixpipeParams); + ASSERT_EQ(fixpipeParams.unitFlag, 3); +} + +TEST_F(test_matmul_l0c_buffer, disable_unit_flag) { + + disUnitFlagMM.InitL0CBuffer(); + disUnitFlagMM.GetVar().cMatrix_ = disUnitFlagMM.LoadL0CTensor(); + auto co1Local1 = 
disUnitFlagMM.AllocL0CLocalTensor(); + disUnitFlagMM.FreeL0CLocalTensor(co1Local1); + disUnitFlagMM.ResetL0CEventStatus(); + + disUnitFlagMM.SetParamKIter(1); + ASSERT_FALSE(disUnitFlagMM.IsL0CLastIter(0)); + + ASSERT_EQ(disUnitFlagMM.GetMmadUnitFlagCtrl(true), 0); + ASSERT_EQ(disUnitFlagMM.GetMmadUnitFlagCtrl(false), 0); + + FixpipeParams fixpipeParams; + disUnitFlagMM.SetFixpipeUnitFlag(fixpipeParams); + ASSERT_EQ(fixpipeParams.unitFlag, 0); +} \ No newline at end of file diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 219b8605..d9cfa951 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -77,6 +77,25 @@ TEST_F(TestTiling, PlatformConstructor) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestInt4BaseK) +{ + matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, + .l0CSize = 131072, .ubSize = 196608, .l0ASize = 65536, .l0BSize = 65536}; + MatmulApiTiling tiling(plat); + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); + tiling.SetBias(false); + tiling.SetShape(144, 256, 32); + tiling.SetOrgShape(144, 256, 32); + tiling.SetBufferSpace(256 * 1024, 128 * 1024, -1); + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(tilingData.get_baseK() % 64, 0); + EXPECT_EQ(ret, 0); +} + TEST_F(TestTiling, Tiling_310p_NotAligned) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND310P, .l1Size = 1048576, @@ -723,6 +742,105 @@ TEST_F(TestTiling, TestSetBufferSpace) EXPECT_EQ(tiling.bufferPool_.l1Size, 1024); } +TEST_F(TestTiling, TestCosTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 
0; + AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 3); + AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetCosTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestCosTilingFloat512) +{ + std::vector shapeDims = { 512 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestCosTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetCosMaxMinTmpSize(cosShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); + EXPECT_EQ(minValue, 256 * 8); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetCosTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAtanTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAtanMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 5); + EXPECT_EQ(minValue, 256 * 5); +} + +TEST_F(TestTiling, TestAtanTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAtanMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 12); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 12); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + 
GetAtanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 12); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestClampTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetClampMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 1); + EXPECT_EQ(minValue, 64 * 1); +} + +TEST_F(TestTiling, TestClampTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetClampMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetClampTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TestSoftMaxTiling) { std::vector shapeDims = { 128, 128 }; @@ -836,7 +954,6 @@ TEST_F(TestTiling, TestSoftMaxFlashV2Tiling) SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); EXPECT_EQ(tilingData.get_reduceM(), 64); } - TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) { std::vector shapeDims = { 8, 1024 }; @@ -865,6 +982,163 @@ TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) EXPECT_EQ(tilingData.get_reduceM(), 8); } +TEST_F(TestTiling, TestAsinTmpBufferFacotrHalfWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAsinTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 6); + EXPECT_EQ(extraBuffer, 0); +} + +TEST_F(TestTiling, TestAsinTmpBufferFacotrFloatWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAsinTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 2); + EXPECT_EQ(extraBuffer, 0); +} + +TEST_F(TestTiling, 
TestAsinTilingHalf128) +{ + std::vector shapeDims = { 128 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 256 * 6); + EXPECT_EQ(minValue, 256 * 6); +} + +TEST_F(TestTiling, TestAsinTilingFloat) +{ + std::vector shapeDims = { 32 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 256 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestAsinTilingHalf16K) +{ + std::vector shapeDims = { 128, 128 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); + EXPECT_EQ(minValue, 256 * 6); +} + +TEST_F(TestTiling, TestAsinTilingFloat16K) +{ + std::vector shapeDims = { 128, 128 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestSinhTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto sinhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinhMaxMinTmpSize(sinhShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSinhTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto sinhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinhMaxMinTmpSize(sinhShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 4); + 
EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestRoundTiling) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + std::vector shapeDims = { 128, 128 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + platform_ascendc::SocVersion socVersion = plat.GetSocVersion(); + GetRoundMaxMinTmpSize(plat, tanShape, 4, false, maxValue, minValue); + GetRoundTmpBufferFactorSize(plat, 4, maxLiveNodeCnt, extraBuf); + GetRoundMaxMinTmpSize(plat, tanShape, 2, false, maxValue, minValue); + GetRoundTmpBufferFactorSize(plat, 2, maxLiveNodeCnt, extraBuf); +} + +TEST_F(TestTiling, TestTanTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 4); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestTanTilingFloat512) +{ + std::vector shapeDims = { 512 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 4); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestTanTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTanMaxMinTmpSize(tanShape, 2, false, maxValue, minValue); 
+ EXPECT_EQ(maxValue, 128 * 128 * 10 * 2); + EXPECT_EQ(minValue, 256 * 10); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 10); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TEstSwiGLUTilingHalf) { std::vector shapeDims = {10, 512}; @@ -920,103 +1194,530 @@ TEST_F(TestTiling, TestSwiGLUFactorHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestSigmoidTiling) +TEST_F(TestTiling, TestFmodTilingFloat) { - std::vector shapeDims = { 128 }; - auto sigmoidShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - GetSigmoidMaxMinTmpSize(sigmoidShape, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 128 * 4); - EXPECT_EQ(minVal, 256); + std::vector shapeDims = { 128, 128 }; + auto fmodShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFmodMaxMinTmpSize(fmodShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); } -TEST_F(TestTiling, TestLayernormTiling) +TEST_F(TestTiling, TestFmodTilingHalf) { - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; - auto layernormShape = ge::Shape(shapeDims); - const bool isReuseSource = false; - optiling::LayerNormTiling tilling; - - uint32_t minValue = 0; + std::vector shapeDims = { 128, 128 }; + auto fmodShape = ge::Shape(shapeDims); uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFmodMaxMinTmpSize(fmodShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 128 * 3 * 4); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); +} - AscendC::GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, maxValue, minValue); - EXPECT_EQ(maxValue, 3 * (128 * 128 * 128) * typeSize + 2 * (128 * 128) * typeSize); - EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); +TEST_F(TestTiling, TestTruncTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + 
auto truncShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTruncMaxMinTmpSize(truncShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); - EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTruncTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestRmsnormTiling) +TEST_F(TestTiling, TestTruncTilingHalf) { - constexpr uint32_t bLength = 4; - constexpr uint32_t sLength = 32; - constexpr uint32_t hLength = 16; - constexpr uint32_t bsLength = bLength * sLength; - constexpr uint32_t bshLength = bLength * sLength * hLength; - std::vector shapeDims = {bLength, sLength, hLength}; - auto shape = ge::Shape(shapeDims); - constexpr uint32_t typeSize = 4; - constexpr uint32_t ONE_BLK_FLOAT = 8; - + std::vector shapeDims = { 128, 128 }; + auto truncShape = ge::Shape(shapeDims); uint32_t maxValue = 0; uint32_t minValue = 0; - // common scene - bool res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue); - const uint32_t goldenMax = (bshLength + bsLength) * typeSize; - uint32_t goldenMin = (hLength + ONE_BLK_FLOAT) * typeSize; - EXPECT_EQ(res, true); - EXPECT_EQ(maxValue, goldenMax); - EXPECT_EQ(minValue, goldenMin); + GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - // basic block scene 1: input shape is illegal, fail to get minSize - res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue, true); - EXPECT_EQ(res, false); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTruncTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} - constexpr 
uint32_t BASIC_BLK_HLENGTH = 64; - constexpr uint32_t BASIC_BLK_BSLENGTH = 8; - shapeDims[2] = BASIC_BLK_HLENGTH; - auto shape_basic_blk = ge::Shape(shapeDims);// 4,32,64 - // basic block scene 2: get minSize successfully - res = AscendC::GetRmsNormMaxMinTmpSize(shape_basic_blk, typeSize, maxValue, minValue, true); - goldenMin = (64 + 8) * typeSize; - EXPECT_EQ(res, true); - EXPECT_EQ(minValue, goldenMin); +TEST_F(TestTiling, TestTruncTilingHalf512) +{ + std::vector shapeDims = { 512 }; + auto truncShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 2); + EXPECT_EQ(minValue, 256 * 2); +} - // basic block scene: get basic block using minTmpSize - // goldenMin should be (BASIC_BLK_HLENGTH(64) * BASIC_BLK_BSLENGTH(8) + bsLength) * typeSize - optiling::RmsNormTiling tiling; - uint32_t tmpSize = (64 + 8) * 4; // shape: 4,32,64 - res = AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, minValue, typeSize, tiling, true); - EXPECT_EQ(res, true); - EXPECT_EQ(tiling.get_mainBshLength(), 64); - EXPECT_EQ(tiling.get_mainBsLength(), 1); +TEST_F(TestTiling, TestAcosTmpBufferFacotrHalfWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAcosTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 6); + EXPECT_EQ(extraBuffer, 0); +} - auto shape1 = ge::Shape({1,7,16}); - res = AscendC::GetRmsNormMaxMinTmpSize(shape1, typeSize, maxValue, minValue); - goldenMin = (8 + 16) * typeSize; - EXPECT_EQ(minValue, goldenMin); - uint32_t stackBufferSize = 100 * 1024; - // common scene: get tiling info successfully, shape: 1,7,16 - res = AscendC::GetRmsNormTilingInfo(shape1, shape1, stackBufferSize, typeSize, tiling); - EXPECT_EQ(res, true); - EXPECT_EQ(tiling.get_mainBshLength(), 1*7*16); - EXPECT_EQ(tiling.get_mainBsLength(), 7); +TEST_F(TestTiling, 
TestAcosTmpBufferFacotrFloatWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAcosTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 2); + EXPECT_EQ(extraBuffer, 0); +} - stackBufferSize = hLength; - // common scene: fail to get tiling info because of small stack buffer - res = AscendC::GetRmsNormTilingInfo(shape, shape, stackBufferSize, typeSize, tiling); - EXPECT_EQ(res, false); +TEST_F(TestTiling, TestAcosTilingHalf128) +{ + std::vector shapeDims = { 128 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 6); + EXPECT_EQ(maxValue, 256 * 6); +} - // basic block scene: get basic block tiling info successfully - stackBufferSize = 100 * 1024; // shape: 4,32,64 - res = AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, stackBufferSize, typeSize, tiling, true); +TEST_F(TestTiling, TestAcosTilingFloat) +{ + std::vector shapeDims = { 32 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 256 * 2); +} + +TEST_F(TestTiling, TestTanhTiling) +{ + uint32_t maxVal = 0; + uint32_t minVal = 0; + GetTanhMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 128 * 4 * 1); + EXPECT_EQ(minVal, 256 * 1); + GetTanhMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4); + EXPECT_EQ(minVal, 256 * 4); + uint32_t extraBuf = 123; + uint32_t maxLivedNodesCnt = 123; + GetTanhTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 1); + GetTanhTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 4); +} + +TEST_F(TestTiling, TestSigmoidTiling) +{ + 
std::vector shapeDims = { 128 }; + auto sigmoidShape = ge::Shape(shapeDims); + uint32_t maxVal; + uint32_t minVal; + GetSigmoidMaxMinTmpSize(sigmoidShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 128 * 4); + EXPECT_EQ(minVal, 256); +} + +TEST_F(TestTiling, TestLogTilingMaxMin) +{ + std::vector shapeDims = { 128 }; + auto logShape = ge::Shape(shapeDims); + uint32_t maxVal; + uint32_t minVal; + GetLogMaxMinTmpSize(logShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); + GetLog2MaxMinTmpSize(logShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); + GetLog2MaxMinTmpSize(logShape, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 4 * 128); + EXPECT_EQ(minVal, 256); + GetLog10MaxMinTmpSize(logShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); +} + +TEST_F(TestTiling, TestLogTilingFactor) +{ + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetLogTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetLog10TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetLog2TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); + GetLog2TmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestPowerTiling) +{ + std::vector shapeDims = { 512 }; + auto powerShape = ge::Shape(shapeDims); + uint32_t maxVal; + uint32_t minVal; + GetPowerMaxMinTmpSize(powerShape, powerShape, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 4 + 256); + EXPECT_EQ(minVal, 256 * 4 + 256); + GetPowerMaxMinTmpSize(powerShape, powerShape, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 6); + EXPECT_EQ(minVal, 256 * 6); + GetPowerMaxMinTmpSize(powerShape, powerShape, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); + 
EXPECT_EQ(minVal, 256 * 7 + 256); + std::vector scalar_shape = { 1 }; + auto scalarShape = ge::Shape(scalar_shape); + GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); + EXPECT_EQ(minVal, 256 * 7 + 256); + GetPowerMaxMinTmpSize(powerShape, scalarShape, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 7); + EXPECT_EQ(minVal, 256 * 7); + GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 5 + 256); + EXPECT_EQ(minVal, 256 * 5 + 256); + + std::vector shape1 = { 16 }; + auto powerShape1 = ge::Shape( shape1 ); + GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 5 + 256); + EXPECT_EQ(minVal, 256 * 5 + 256); + GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 7 + 256); + EXPECT_EQ(minVal, 256 * 7 + 256); + GetPowerMaxMinTmpSize(powerShape1, scalarShape, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 7); + EXPECT_EQ(minVal, 256 * 7); + GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4 + 256); + EXPECT_EQ(minVal, 256 * 4 + 256); + GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 7 + 256); + EXPECT_EQ(minVal, 256 * 7 + 256); + GetPowerMaxMinTmpSize(powerShape1, powerShape1, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 6); + EXPECT_EQ(minVal, 256 * 6); +} + +TEST_F(TestTiling, TestPowerTilingFactorSize) +{ + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetPowerTmpBufferFactorSize(false, true, false, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 5); + EXPECT_EQ(extraBuf, 256); + GetPowerTmpBufferFactorSize(false, true, true, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 7); + EXPECT_EQ(extraBuf, 0); + 
GetPowerTmpBufferFactorSize(false, true, false, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 14); + EXPECT_EQ(extraBuf, 256); + GetPowerTmpBufferFactorSize(true, true, false, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 256); + GetPowerTmpBufferFactorSize(true, true, true, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 6); + EXPECT_EQ(extraBuf, 0); + GetPowerTmpBufferFactorSize(true, true, false, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 14); + EXPECT_EQ(extraBuf, 256); +} + +TEST_F(TestTiling, TestAcosTilingHalf16K) +{ + std::vector shapeDims = { 128, 128 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); + EXPECT_EQ(minValue, 256 * 6); +} + +TEST_F(TestTiling, TestAcosTilingFloat16K) +{ + std::vector shapeDims = { 128, 128 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestAsinhTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto asinhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAsinhMaxMinTmpSize(asinhShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); + + AscendC::GetAsinhMaxMinTmpSize(ge::Shape({32}), 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + EXPECT_EQ(maxValue, 256 * 3); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAsinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAsinhTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto asinhShape = ge::Shape(shapeDims); + uint32_t 
maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAsinhMaxMinTmpSize(asinhShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAsinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAcoshTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto acoshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAcoshMaxMinTmpSize(acoshShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetAcoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAcoshTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto acoshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAcoshMaxMinTmpSize(acoshShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAcoshTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxis) +{ + const auto shape = ge::Shape({ 8, 128 }); + const auto scalarShape = ge::Shape({1}); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); +} + +TEST_F(TestTiling, 
TestSelectWithBytesMaskTilingSameAxisLargeShape) +{ + const auto shape = ge::Shape({ 128, 128 }); + const auto scalarShape = ge::Shape({1}); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxisSmallShape) +{ + const auto shape = ge::Shape({ 1, 16 }); + const auto scalarShape = ge::Shape({1}); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxis) +{ + const auto srcShape = ge::Shape({ 8, 128 }); + const auto scalarShape = ge::Shape({1}); + const auto maskShape = ge::Shape({ 8, 160 }); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512 + 8 * 128); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512 + 8 * 128); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxisLargeShape) +{ + const auto srcShape = 
ge::Shape({ 128, 128 }); + const auto scalarShape = ge::Shape({1}); + const auto maskShape = ge::Shape({ 128, 160 }); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512 + 128 * 128); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512 + 128 * 128); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxisSmallShape) +{ + const auto srcShape = ge::Shape({ 1, 16 }); + const auto scalarShape = ge::Shape({1}); + const auto maskShape = ge::Shape({ 1, 32 }); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 1024 + 32); + EXPECT_EQ(maxValue, 1024 + 32); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); +} + +TEST_F(TestTiling, TestLayernormTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; + + std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; + auto layernormShape = ge::Shape(shapeDims); + const bool isReuseSource = false; + optiling::LayerNormTiling tilling; + + uint32_t minValue = 0; + uint32_t maxValue = 0; + + AscendC::GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, maxValue, minValue); + EXPECT_EQ(maxValue, 3 * (128 * 
128 * 128) * typeSize + 2 * (128 * 128) * typeSize); + EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); + + AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); + EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); +} + +TEST_F(TestTiling, TestRmsnormTiling) +{ + constexpr uint32_t bLength = 4; + constexpr uint32_t sLength = 32; + constexpr uint32_t hLength = 16; + constexpr uint32_t bsLength = bLength * sLength; + constexpr uint32_t bshLength = bLength * sLength * hLength; + std::vector shapeDims = {bLength, sLength, hLength}; + auto shape = ge::Shape(shapeDims); + constexpr uint32_t typeSize = 4; + constexpr uint32_t ONE_BLK_FLOAT = 8; + + uint32_t maxValue = 0; + uint32_t minValue = 0; + // common scene + bool res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue); + const uint32_t goldenMax = (bshLength + bsLength) * typeSize; + uint32_t goldenMin = (hLength + ONE_BLK_FLOAT) * typeSize; + EXPECT_EQ(res, true); + EXPECT_EQ(maxValue, goldenMax); + EXPECT_EQ(minValue, goldenMin); + + // basic block scene 1: input shape is illegal, fail to get minSize + res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue, true); + EXPECT_EQ(res, false); + + constexpr uint32_t BASIC_BLK_HLENGTH = 64; + constexpr uint32_t BASIC_BLK_BSLENGTH = 8; + shapeDims[2] = BASIC_BLK_HLENGTH; + auto shape_basic_blk = ge::Shape(shapeDims);// 4,32,64 + // basic block scene 2: get minSize successfully + res = AscendC::GetRmsNormMaxMinTmpSize(shape_basic_blk, typeSize, maxValue, minValue, true); + goldenMin = (64 + 8) * typeSize; + EXPECT_EQ(res, true); + EXPECT_EQ(minValue, goldenMin); + + // basic block scene: get basic block using minTmpSize + // goldenMin should be (BASIC_BLK_HLENGTH(64) * BASIC_BLK_BSLENGTH(8) + bsLength) * typeSize + optiling::RmsNormTiling tiling; + uint32_t tmpSize = (64 + 8) * 4; // shape: 4,32,64 + res = 
AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, minValue, typeSize, tiling, true); + EXPECT_EQ(res, true); + EXPECT_EQ(tiling.get_mainBshLength(), 64); + EXPECT_EQ(tiling.get_mainBsLength(), 1); + + auto shape1 = ge::Shape({1,7,16}); + res = AscendC::GetRmsNormMaxMinTmpSize(shape1, typeSize, maxValue, minValue); + goldenMin = (8 + 16) * typeSize; + EXPECT_EQ(minValue, goldenMin); + + uint32_t stackBufferSize = 100 * 1024; + // common scene: get tiling info successfully, shape: 1,7,16 + res = AscendC::GetRmsNormTilingInfo(shape1, shape1, stackBufferSize, typeSize, tiling); + EXPECT_EQ(res, true); + EXPECT_EQ(tiling.get_mainBshLength(), 1*7*16); + EXPECT_EQ(tiling.get_mainBsLength(), 7); + + stackBufferSize = hLength; + // common scene: fail to get tiling info because of small stack buffer + res = AscendC::GetRmsNormTilingInfo(shape, shape, stackBufferSize, typeSize, tiling); + EXPECT_EQ(res, false); + + // basic block scene: get basic block tiling info successfully + stackBufferSize = 100 * 1024; // shape: 4,32,64 + res = AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, stackBufferSize, typeSize, tiling, true); EXPECT_EQ(res, true); EXPECT_EQ(tiling.get_mainBshLength(), 4*32*64); @@ -1232,6 +1933,39 @@ TEST_F(TestTiling, TestDeepnormTiling) EXPECT_EQ(tiling.get_oneTmpSize(), 512); } +TEST_F(TestTiling, TestExpTiling) +{ + std::vector shapeDims = {128, 128}; + auto expShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + + // float isReuseSrc = false 3 tmpBuffer + AscendC::GetExpMaxMinTmpSize(expShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 3 * 256); + EXPECT_EQ(maxValue, 3 * 128 * 128 * 4); + // float isReuseSrc = true 2 tmpBuffer + AscendC::GetExpMaxMinTmpSize(expShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 2 * 256); + EXPECT_EQ(maxValue, 2 * 128 * 128 * 4); + // half 4 tmpBuffer + AscendC::GetExpMaxMinTmpSize(expShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 4 * 
256); + EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); + AscendC::GetExpMaxMinTmpSize(expShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 4 * 256); + EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetExpTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); + AscendC::GetExpTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TestMatmulApiTilngFactorSplit1) { MatmulApiTiling tiling; @@ -1573,6 +2307,7 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM4) EXPECT_EQ(res, 0); } + TEST_F(TestTiling, TestMatmulApiTilngSingleCoreFullLoadCase) { optiling::TCubeTiling tilingData; @@ -1614,13 +2349,66 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM5) EXPECT_EQ(res, 0); } -TEST_F(TestTiling, TestLayernormGradTiling) +TEST_F(TestTiling, TestConcatTiling) { - const uint32_t stackBufferSize = 100 * 1024; + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + const uint32_t elemCount = 128; - std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; - auto layernormgradShape = ge::Shape(shapeDims); - optiling::LayerNormGradTiling tiling; + AscendC::GetConcatTmpSize(plat, elemCount, 2); +} + +TEST_F(TestTiling, TestSortTiling) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + const uint32_t elemCount = 128; + + AscendC::GetSortTmpSize(plat, elemCount, 4); +} + +TEST_F(TestTiling, TestUnPadTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; + + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + optiling::UnPadTiling tiling; + + AscendC::UnPadTilingFunc(srcShape, 0, typeSize, tiling); + AscendC::UnPadTilingFunc(srcShape, stackBufferSize, typeSize, tiling); + fe::PlatFormInfos platform_info; + auto plat = 
platform_ascendc::PlatformAscendC(&platform_info); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetUnPadMaxMinTmpSize(plat, srcShape, typeSize, maxValue, minValue); +} + +TEST_F(TestTiling, TestPadTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; + + std::vector shapeDims = { 32, 32}; + std::vector ori_shape_dims = { 32, 31 }; + auto srcShape = ge::Shape(shapeDims); + auto oriSrcShape = ge::Shape(ori_shape_dims); + optiling::PadTiling tiling; + + AscendC::PadTilingFunc(srcShape, oriSrcShape, stackBufferSize, typeSize, tiling); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetPadMaxMinTmpSize(srcShape, typeSize, maxValue, minValue); +} + +TEST_F(TestTiling, TestLayernormGradTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + + std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; + auto layernormgradShape = ge::Shape(shapeDims); + optiling::LayerNormGradTiling tiling; AscendC::GetLayerNormGradNDTilingInfo(layernormgradShape, stackBufferSize, 4, false, tiling); EXPECT_EQ(tiling.get_stackBufferSize(), stackBufferSize); @@ -1665,6 +2453,23 @@ TEST_F(TestTiling, TestLayernormGradBetaTiling) EXPECT_EQ(tiling.get_stackBufferSize(), stackBufferSize / sizeof(float)); } +TEST_F(TestTiling, TestConfusionTransposeTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 2; + + std::vector shapeDims = { 1, 2, 64, 32 }; + auto srcShape = ge::Shape(shapeDims); + optiling::ConfusionTransposeTiling tiling; + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 1, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 2, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 3, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 4, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 5, tiling); + 
AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 6, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 7, tiling); +} + TEST_F(TestTiling, TestMatmulApiTilngL0BNoDB) { MatmulApiTiling tiling; @@ -2001,6 +2806,129 @@ TEST_F(TestTiling, TestMatmulApiTilngSetShapeZero) EXPECT_EQ(ret, -1); } +// #if __CCE_AICORE__ == 200 +// TEST_F(TestTiling, TestPlatformAscendC) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); +// EXPECT_EQ(plat.GetCoreNumVector(), 8); +// EXPECT_EQ(plat.GetCoreNumVector() + plat.GetCoreNumAic() , 18); +// } +// #endif + +// #if __CCE_AICORE__ == 220 +// extern void platfrom_stub_set_num_aic(const char *num); +// extern void platfrom_stub_set_num_aiv(const char *num); +// extern void platfrom_stub_set_num_cub(const char *num); +// extern void platfrom_stub_set_ctl(const char *num); +// extern void platfrom_stub_set_chip_version(const char *num); +// extern void platfrom_stub_set_num(uint32_t num); +// TEST_F(TestTiling, TestPlatformAscendC) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); +// uint64_t ub_size, l1_size, l0; +// uint64_t l2_bw, hbm_bw, bw; +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); +// EXPECT_EQ(ub_size, 196352); +// EXPECT_EQ(l1_size, 524032); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); +// EXPECT_EQ(l0, 65536 * 2); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); +// EXPECT_EQ(l2_bw, 110); +// EXPECT_EQ(hbm_bw, 32); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, 
bw); +// EXPECT_EQ(plat.GetCoreNum(), 48); +// EXPECT_EQ(plat.GetCoreNumAic(), 24); +// EXPECT_EQ(plat.GetCoreNumAiv(), 48); +// platfrom_stub_set_num_cub("20"); +// EXPECT_EQ(plat.GetCoreNumAic(), 20); +// platfrom_stub_set_num_aiv("40"); +// EXPECT_EQ(plat.GetCoreNumAiv(), 40); +// platfrom_stub_set_ctl("AICore"); +// EXPECT_EQ(plat.GetCoreNumAic(), 24); +// EXPECT_EQ(plat.GetCoreNumAiv(), 24); +// platfrom_stub_set_num_aic("20"); +// EXPECT_EQ(plat.GetCoreNumAic(), 20); +// EXPECT_EQ(plat.GetCoreNumAiv(), 20); +// EXPECT_EQ(bw, 0); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); + +// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 16 * 1024 * 1024); +// platfrom_stub_set_chip_version("Ascend910"); +// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2 * 1024 * 1024); +// EXPECT_EQ(plat.GetSocVersion(), platform_ascendc::SocVersion::ASCEND910); +// EXPECT_EQ(plat.GetCoreNumVector(), 0); +// } +// #endif + +// #if __CCE_AICORE__ == 300 +// extern void platfrom_stub_set_num_aic(const char *num); +// extern void platfrom_stub_set_num_aiv(const char *num); +// extern void platfrom_stub_set_num_cub(const char *num); +// extern void platfrom_stub_set_ctl(const char *num); +// extern void platfrom_stub_set_chip_version(const char *num); +// extern void platfrom_stub_set_num(uint32_t num); +// TEST_F(TestTiling, TestPlatformAscendC) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); +// uint64_t ub_size, l1_size, l0; +// uint64_t l2_bw, hbm_bw, bw; +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); +// EXPECT_EQ(ub_size, 248 * 1024); +// 
EXPECT_EQ(l1_size, 1024 * 1024); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); +// EXPECT_EQ(l0, 65536 * 2); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); +// EXPECT_EQ(l2_bw, 256); +// EXPECT_EQ(hbm_bw, 17); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, bw); +// EXPECT_EQ(plat.GetCoreNum(), 1); +// EXPECT_EQ(plat.GetCoreNumAic(), 1); +// EXPECT_EQ(plat.GetCoreNumAiv(), 1); +// platfrom_stub_set_num_cub("1"); +// EXPECT_EQ(plat.GetCoreNumAic(), 1); +// platfrom_stub_set_num_aiv("1"); +// EXPECT_EQ(plat.GetCoreNumAiv(), 1); +// platfrom_stub_set_ctl("AICore"); +// EXPECT_EQ(plat.GetCoreNumAic(), 1); +// EXPECT_EQ(plat.GetCoreNumAiv(), 1); +// platfrom_stub_set_num_aic("2"); +// EXPECT_EQ(plat.GetCoreNumAic(), 2); +// EXPECT_EQ(plat.GetCoreNumAiv(), 2); +// EXPECT_EQ(bw, 0); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); + +// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2097152); +// EXPECT_EQ(plat.GetCoreNumVector(), 0); +// } +// #endif + TEST_F(TestTiling, TestMatmulApiTilngInt8Case1) { MatmulApiTiling tiling; @@ -2279,6 +3207,60 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case9) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestErfTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto erfShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfMaxMinTmpSize(erfShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); + 
EXPECT_EQ(minValue, 256 * 3); +} + +TEST_F(TestTiling, TestErfTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto erfShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfMaxMinTmpSize(erfShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 8); + EXPECT_EQ(minValue, 256 * 8); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetErfTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestErfcTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto erfcShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfcMaxMinTmpSize(erfcShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 7 * 4); + EXPECT_EQ(minValue, 256 * 7); +} + +TEST_F(TestTiling, TestErfcTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto erfcShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfcMaxMinTmpSize(erfcShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 16); + EXPECT_EQ(minValue, 256 * 16); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetErfcTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 16); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TestMatmulApiTilngInt8Case10) { MultiCoreMatmulTiling tiling; @@ -2357,6 +3339,95 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case13) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestCoshTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto coshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestCoshTilingFloat512) +{ + std::vector shapeDims = { 512 }; + auto coshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + 
GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 4 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestCoshTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto coshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCoshMaxMinTmpSize(coshShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 6); + EXPECT_EQ(minValue, 256 * 6); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetCoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 6); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSinTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto sinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinMaxMinTmpSize(sinShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 2 * 256); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + GetSinMaxMinTmpSize(sinShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 3 * 256); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSinTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto sinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinMaxMinTmpSize(sinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); + EXPECT_EQ(minValue, 8 * 256); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAscendSumTiling) +{ + uint32_t n = 8; + uint32_t maxValue; + uint32_t minValue; + GetSumMaxMinTmpSize(n, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); + + maxValue = 0; + minValue = 0; + GetSumMaxMinTmpSize(n, 4, false, 
maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); +} + TEST_F(TestTiling, TestAscendSiluTiling) { std::vector shapeDims = { 512 }; @@ -2379,6 +3450,54 @@ TEST_F(TestTiling, TestAscendSwishTiling) EXPECT_EQ(maxValue, 0); } +TEST_F(TestTiling, TestAscendXorTiling) +{ + std::vector shapeDims = { 128, 128 }; + auto xorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetXorMaxMinTmpSize(xorShape, 2, true, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 2); + EXPECT_EQ(minValue, 1 * 256); + GetXorTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFracTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto fracShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFracMaxMinTmpSize(fracShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFracTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFracTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto fracShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFracMaxMinTmpSize(fracShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 131072); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFracTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + #if __CCE_AICORE__ == 220 extern void platfrom_stub_set_chip_version(const char *num); TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Float_Inner64) @@ -2861,6 +3980,15 @@ TEST_F(TestTiling, TestTopkTiling_TopKModeSmall310P_HALF) } #endif +TEST_F(TestTiling, TestArithProgression) +{ + uint32_t maxValue; + uint32_t minValue; + 
GetArithProgressionMaxMinTmpSize(maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); +} + TEST_F(TestTiling, TestGeGLUTilingFloat) { std::vector shapeDims = { 128, 128 }; @@ -2895,6 +4023,566 @@ TEST_F(TestTiling, TestGeGLUTilingHalf) EXPECT_EQ(extraBuf, 0); } +TEST_F(TestTiling, TestLgammaTilingFp32) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + GetLgammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); + EXPECT_EQ(maxSize, 458752); + EXPECT_EQ(minSize, 1792); + + GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 524288); + EXPECT_EQ(minSize, 2048); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 2048); + EXPECT_EQ(minSize, 2048); + + GetLgammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); + EXPECT_EQ(maxSize, 1792); + EXPECT_EQ(minSize, 1792); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetLgammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestLgammaTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + + GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 128 * 128 * 2 * 13 * 2); + EXPECT_EQ(minSize, 13 * 2 * 256); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 256 * 13 * 2); + EXPECT_EQ(minSize, 256 * 13 * 2); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetLgammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 13); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestDigammaTilingFp32) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + 
uint32_t minSize; + GetDigammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); + EXPECT_EQ(maxSize, 393216); + EXPECT_EQ(minSize, 1536); + + GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 458752); + EXPECT_EQ(minSize, 1792); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 1792); + EXPECT_EQ(minSize, 1792); + + GetDigammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); + EXPECT_EQ(maxSize, 1536); + EXPECT_EQ(minSize, 1536); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetDigammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 7); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestDigammaTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + + GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 128 * 128 * 2 * 8 * 2); + EXPECT_EQ(minSize, 8 * 2 * 256); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 256 * 8 * 2); + EXPECT_EQ(minSize, 256 * 8 * 2); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetDigammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 16); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAtanhTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto aTanhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 1); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAtanhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAtanhTilingHalf) +{ + std::vector 
shapeDims = { 128, 128 }; + auto aTanhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 4); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAtanhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSignTiling) +{ + std::vector shapeDims = { 128, 128 }; + auto signShape = ge::Shape(shapeDims); + uint32_t signNeedMaxSize; + uint32_t signNeedMinSize; + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSignMaxMinTmpSize(signShape, 2, false, signNeedMaxSize, signNeedMinSize); + EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 2); + EXPECT_EQ(signNeedMinSize, 3 * 256); + + GetSignMaxMinTmpSize(signShape, 4, false, signNeedMaxSize, signNeedMinSize); + EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 4); + EXPECT_EQ(signNeedMinSize, 3 * 256); + + GetSignTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAscendMeanTiling) +{ + uint32_t n = 8; + uint32_t maxValue; + uint32_t minValue; + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + + GetMeanMaxMinTmpSize(n, 2, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); + + maxValue = 0; + minValue = 0; + GetMeanMaxMinTmpSize(n, 4, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); + + GetMeanMaxMinTmpSize(n, 2, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 96); + EXPECT_EQ(maxValue, 96); + + GetMeanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +// TEST_F(TestTiling, TestKernelContextBuildBase) +// { +// auto builder = context_ascendc::BuildKernelRunContext(2, 2); +// EXPECT_EQ(builder.kernelInputNum, 2); +// } + + +// TEST_F(TestTiling, 
TestKernelContextBuild) +// { +// gert::Shape input1_shape = {2, 1, 1, 1, 1, 1, 1, 2, 2}; +// int32_t input1_tensor_buffer[] = {0, 2, 3, 3, 1, 0, 0, 1}; +// gert::TensorData input1_tensor_data{(void*)input1_tensor_buffer, nullptr}; +// gert::Shape output_shape = {5, 3}; +// int64_t output_tensor_buffer[15]; +// gert::TensorData output_tensor_data{(void*)output_tensor_buffer, nullptr}; +// auto kernelHolder = +// context_ascendc::KernelRunContextBuilder() +// .KernelIONum(2, 2) +// .Inputs({reinterpret_cast(&input1_shape), +// reinterpret_cast(&input1_tensor_data)}) +// .Outputs({reinterpret_cast(&output_shape), reinterpret_cast(&output_tensor_data)}) +// .NodeIoNum(1, 1) +// .IrInputNum(1) +// .NodeInputTd(0, ge::DT_INT32, ge::FORMAT_ND, ge::FORMAT_ND) +// .NodeOutputTd(0, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND) +// .Build(); +// auto context = kernelHolder.GetContext(); +// EXPECT_NE(context, nullptr); +// } + +// TEST_F(TestTiling, TestTilingContextBuildWithConstValue) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; +// gert::StorageShape expert_tokens_shape = {{16}, {16}}; +// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; +// gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; +// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; +// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; + +// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; + +// std::vector expert_tokens_const_value (16, 1); +// std::vector x_const_value (1024 * 5120, 2.f); +// std::vector bias2_value (16 * 5120, 3.f); +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .SetOpNameType("name", "tpye") +// .NodeIoNum(6, 1) +// .IrInstanceNum({1, 1, 1, 1, 1, 1}) +// .AddInputTd(0, 
ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) +// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) +// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) +// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, reinterpret_cast(expert_tokens_const_value.data())) +// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) +// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) +// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) +// .AddAttrs({ +// {"activation", ge::AnyValue::CreateFrom(active_type)}, +// {"inner_precise", ge::AnyValue::CreateFrom(1)} +// }) +// .TilingData(param.get()) +// .Workspace(ws_size) +// .Build(); + +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_NE(tiling_context, nullptr); + +// } + +// TEST_F(TestTiling, TestTilingContextBuildAddInputs) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; +// std::vector inputs; +// std::vector outputs; +// context_ascendc::TensorInfo input; +// input.shape = x_shape; +// input.dType = ge::DT_FLOAT16; +// input.oriFormat = ge::FORMAT_ND; +// input.format = ge::FORMAT_ND; +// input.dataPath = "1111"; +// inputs.push_back(input); +// context_ascendc::TensorInfo output; +// output.shape = x_shape; +// output.dType = ge::DT_FLOAT16; +// output.oriFormat = ge::FORMAT_ND; +// output.format = ge::FORMAT_ND; +// output.dataPath = "222"; +// outputs.push_back(output); + +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .SetOpNameType("name", "tpye") +// .NodeIoNum(1, 1) +// .IrInstanceNum({1}) +// 
.AddInputs(inputs) +// .AddOutputs(outputs) +// .AddAttrs({ +// {"activation", ge::AnyValue::CreateFrom(active_type)}, +// {"inner_precise", ge::AnyValue::CreateFrom(1)} +// }) +// .TilingData(param.get()) +// .Workspace(ws_size) +// .Build(); + +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_NE(tiling_context, nullptr); +// } + +// TEST_F(TestTiling, TestTilingContextBuildFailed) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{-1, 5120}, {-1, 5120}}; +// std::vector x_const_value (1024 * 5120, 2.f); +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .NodeIoNum(1, 1) +// .IrInstanceNum({1, 1}) +// .CompileInfo(nullptr) +// .PlatformInfo(nullptr) +// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) +// .Workspace(ws_size) +// .Build(); + +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_EQ(tiling_context, nullptr); +// } + +// TEST_F(TestTiling, TestTilingContextBuildWithBinFile) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; +// gert::StorageShape expert_tokens_shape = {{16}, {16}}; +// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; +// gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; +// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; +// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; +// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; + +// std::vector expert_tokens_const_value (16, 1); + +// std::vector x_const_value (1024 * 5120, 2.f); +// std::vector bias2_value (16 * 5120, 3.f); +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// 
auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .SetOpNameType("name", "tpye") +// .NodeIoNum(6, 1) +// .IrInstanceNum({1, 1, 1, 1, 1, 1}) +// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) +// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) +// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) +// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, "./expert_tokens_data.bin") +// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) +// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) +// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) +// .AddAttrs({ +// {"activation", ge::AnyValue::CreateFrom(active_type)}, +// {"inner_precise", ge::AnyValue::CreateFrom(1)} +// }) +// .TilingData(param.get()) +// .Workspace(ws_size) +// .Build(); +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_EQ(tiling_context, nullptr); +// } + +TEST_F(TestTiling, TestAxpyTiling) +{ + uint32_t maxVal = 0; + uint32_t minVal = 0; + GetAxpyMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); + GetAxpyMaxMinTmpSize(ge::Shape({256}), 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4 * 2); + EXPECT_EQ(minVal, 256 * 4); + GetAxpyMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4); + EXPECT_EQ(minVal, 256 * 4); + uint32_t extraBuf = 123; + uint32_t maxLivedNodesCnt = 123; + GetAxpyTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 1); + GetAxpyTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 4); +} + +TEST_F(TestTiling, TestCeilTilingFloat) 
+{ + std::vector shapeDims = { 128, 128 }; + auto ceilShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCeilMaxMinTmpSize(ceilShape, sizeof(float), false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetCeilTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestCeilTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto ceilShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetCeilTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestCeilTilingHalf512) +{ + std::vector shapeDims = { 512 }; + auto ceilShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestFloorTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto floorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFloorMaxMinTmpSize(floorShape, sizeof(float), false, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFloorTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFloorTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto floorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); 
+ EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFloorTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFloorTilingHalf512) +{ + std::vector shapeDims = { 512 }; + auto floorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +// TEST_F(TestTiling, TestGetSocVersion) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// platform_ascendc::SocVersion ret = plat.GetSocVersion(); +// EXPECT_EQ(ret, platform_ascendc::SocVersion::RESERVED_VERSION); +// } + +// TEST_F(TestTiling, TestCoreNum) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// uint32_t ret1 = plat.GetCoreNumAic(); +// uint32_t ret2 = plat.GetCoreNumAiv(); +// EXPECT_EQ(ret1, 0); +// EXPECT_EQ(ret2, 0); +// } + +// TEST_F(TestTiling, TestGetLibApiWorkSpaceSize) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// uint32_t ret1 = plat.GetLibApiWorkSpaceSize(); +// EXPECT_EQ(ret1, static_cast(-1)); +// } +// 
TEST_F(TestTiling, TestPlatformAscendCManager) +// { +// void *handle; +// int a = 7; +// handle = &a; + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// auto ret2 = platform_ascendc::PlatformAscendCManager::GetInstance(); +// } + +// TEST_F(TestTiling, TestGetVectorCoreNum) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); +// MOCKER_CPP(&platform_ascendc::PlatformAscendC::GetSocVersion, +// platform_ascendc::SocVersion(platform_ascendc::PlatformAscendC::*)(void) const) +// .stubs() +// .will(returnValue(platform_ascendc::SocVersion::ASCEND310P)); + +// uint32_t ret1 = plat.GetCoreNumVector(); +// EXPECT_EQ(ret1, static_cast(0)); +// MOCKER_CPP(&platform_ascendc::PlatformAscendCManager::PlatformAscendCInit) +// .stubs() +// .will(returnValue(platform_info)); +// auto ret2 = platform_ascendc::PlatformAscendCManager::GetInstance(); + +// } + TEST_F(TestTiling, TestReGluFloat16OrBf16) { const std::vector srcShapeDims = { 8, 128 }; @@ -2915,4 +4603,234 @@ TEST_F(TestTiling, TestReGluFloat32) GetReGluMaxMinTmpSize(srcShape, 4, false, maxValue, minValue); EXPECT_EQ(minValue, 256); EXPECT_EQ(maxValue, 256); +} + +#if __CCE_AICORE__ == 220 +extern void platfrom_stub_set_chip_version(const char *num); +TEST_F(TestTiling, TestBroadCast220) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend910B"); + uint32_t firstDim = 32; + uint32_t lastDim = 32; + std::vector srcShapeDims = {firstDim, 1}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {firstDim, lastDim}; + auto 
dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t halfOneBlockElementNum = 16; + constexpr uint32_t minHalfAlignSize = halfOneBlockElementNum * halfOneBlockElementNum * halfSize; + constexpr uint32_t BRCB_ONE_SIZE = 8; + uint32_t firstDimAlignNum = (firstDim + BRCB_ONE_SIZE - 1) / BRCB_ONE_SIZE * BRCB_ONE_SIZE; + uint32_t maxHalfAlignSize = firstDimAlignNum * halfOneBlockElementNum * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize); + EXPECT_EQ(maxValue, maxHalfAlignSize); + + srcShapeDims = {firstDim, 1}; + srcShape = ge::Shape(srcShapeDims); + uint32_t lastDimNotAlign = 31; + dstShapeDims = {firstDim, lastDimNotAlign}; + dstShape = ge::Shape(dstShapeDims); + + uint32_t blockDimAlignBlockNum = (lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum; + uint32_t blockDimAlign = blockDimAlignBlockNum * halfOneBlockElementNum; + uint32_t minCopyTempBufferSize = halfOneBlockElementNum * blockDimAlign * halfSize; + auto minHalfNotAlignSize = minHalfAlignSize + minCopyTempBufferSize; + + uint32_t maxCopyTempBufferSize = firstDim * blockDimAlign * halfSize; + uint32_t maxHalfNotAlignValue = maxHalfAlignSize + maxCopyTempBufferSize; + + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfNotAlignSize); + EXPECT_EQ(maxValue, maxHalfNotAlignValue); + + constexpr uint32_t int8Size = 1; + srcShapeDims = {firstDim, 1}; + srcShape = ge::Shape(srcShapeDims); + dstShapeDims = {firstDim, lastDim}; + dstShape = ge::Shape(dstShapeDims); + const uint32_t alignSrcSize = + ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + uint32_t alignDstSize = + ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + uint32_t castTempBufferSize = 
(alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); + EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); + + srcShapeDims = {firstDim, 1}; + srcShape = ge::Shape(srcShapeDims); + dstShapeDims = {firstDim, lastDimNotAlign}; + dstShape = ge::Shape(dstShapeDims); + alignDstSize = + ((firstDim * lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfNotAlignSize + castTempBufferSize); + EXPECT_EQ(maxValue, maxHalfNotAlignValue + castTempBufferSize); +} +#endif + +#if __CCE_AICORE__ == 200 +extern void platfrom_stub_set_chip_version(const char *num); +TEST_F(TestTiling, TestLastBroadCast200) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend310P"); + uint32_t firstDim = 32; + uint32_t lastDim = 32; + std::vector srcShapeDims = {firstDim, 1}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {firstDim, lastDim}; + auto dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t halfOneBlockElementNum = 16; + constexpr uint32_t MAX_BLOCK_NUM = 8; + constexpr uint32_t ONE_BLOCK_SIZE = 32; + uint32_t minTmpBufferSize = + halfOneBlockElementNum * ((lastDim + MAX_BLOCK_NUM - 1) / MAX_BLOCK_NUM) * halfSize; + uint32_t minHalfAlignSize = ONE_BLOCK_SIZE + + minTmpBufferSize; + uint32_t maxHalfAlignSize = ONE_BLOCK_SIZE + firstDim * lastDim * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize); + EXPECT_EQ(maxValue, 
maxHalfAlignSize); + + constexpr uint32_t int8Size = 1; + const uint32_t alignSrcSize = + ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + const uint32_t alignDstSize = + ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); + EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); +} + +TEST_F(TestTiling, TestFirstBroadCast200) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend310P"); + uint32_t firstDim = 32; + uint32_t lastDim = 32; + std::vector srcShapeDims = {1, lastDim}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {firstDim, lastDim}; + auto dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t ONE_BLOCK_SIZE = 32; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, ONE_BLOCK_SIZE); + EXPECT_EQ(maxValue, ONE_BLOCK_SIZE); + + constexpr uint32_t int8Size = 1; + constexpr uint32_t HALF_ONE_BLK_SIZE = 16; + const uint32_t alignSrcSize = ((lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t alignDstSize = + ((firstDim * lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, ONE_BLOCK_SIZE + castTempBufferSize); + EXPECT_EQ(maxValue, ONE_BLOCK_SIZE + castTempBufferSize); +} + +TEST_F(TestTiling, TestOneElementBroadCast200) 
+{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend310P"); + uint32_t srcDim = 1; + uint32_t dstDim = 32; + std::vector srcShapeDims = {srcDim}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {dstDim}; + auto dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); + + constexpr uint32_t int8Size = 1; + constexpr uint32_t HALF_ONE_BLK_SIZE = 16; + constexpr uint32_t ONE_BLOCK_SIZE = 32; + const uint32_t alignSrcSize = ((srcDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t alignDstSize = ((dstDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, castTempBufferSize + ONE_BLOCK_SIZE); + EXPECT_EQ(maxValue, castTempBufferSize + ONE_BLOCK_SIZE); +} +#endif + +TEST_F(TestTiling, TestReduceXorSumTilingInt16) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + GetReduceXorSumMaxMinTmpSize(shape, 2, true, maxSize, minSize); + EXPECT_EQ(maxSize, 65536); + EXPECT_EQ(minSize, 65536); + + GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 98304); + EXPECT_EQ(minSize, 98304); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 768); + EXPECT_EQ(minSize, 768); + + GetReduceXorSumMaxMinTmpSize(shape, 2, true,maxSize, minSize); + EXPECT_EQ(maxSize, 512); + EXPECT_EQ(minSize, 512); +} + +TEST_F(TestTiling, TestCumSum) +{ + 
uint32_t firstDim = 32; + uint32_t lastDim = 16; + std::vector srcShapeDims = {firstDim, lastDim}; + auto srcShape = ge::Shape(srcShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t transDataTo5HDAddrListSize = 16; + uint32_t minHalfSize = transDataTo5HDAddrListSize * lastDim * 3 * sizeof(uint16_t); + uint32_t alignOutter = (firstDim + transDataTo5HDAddrListSize - 1) / transDataTo5HDAddrListSize * transDataTo5HDAddrListSize; + uint32_t maxHalfSize = alignOutter * lastDim * 3 * sizeof(uint16_t); + + GetCumSumMaxMinTmpSize(srcShape, halfSize, true, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfSize); + EXPECT_EQ(maxValue, maxHalfSize); + + constexpr uint32_t floatSize = 4; + uint32_t minFloatSize = transDataTo5HDAddrListSize * lastDim * 2 * sizeof(float); + uint32_t maxFloatSize = alignOutter * lastDim * 2 * sizeof(float); + + GetCumSumMaxMinTmpSize(srcShape, floatSize, true, false, maxValue, minValue); + EXPECT_EQ(minValue, minFloatSize); + EXPECT_EQ(maxValue, maxFloatSize); + + maxHalfSize = minHalfSize = firstDim * lastDim * sizeof(float); + GetCumSumMaxMinTmpSize(srcShape, halfSize, false, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfSize); + EXPECT_EQ(maxValue, maxHalfSize); + + + GetCumSumMaxMinTmpSize(srcShape, floatSize, false, false, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); } \ No newline at end of file -- Gitee From ea034752877efc88756265abb69ff848d210b53b Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 06:26:26 +0000 Subject: [PATCH 3/8] TBuf related api add Signed-off-by: jiangchengcheng-on --- .../cmatrix_buffer/matmul_l0c_buffer.h | 143 ++++++ .../modules/feature_trait/matmul_chip_cap.h | 71 +++ .../feature_trait/matmul_feature_trait.h | 40 ++ .../feature_trait/matmul_iter_ctrl_cfg.h | 30 ++ .../input_cache/matmul_input_l1_cache.h | 48 ++ .../iterator/matmul_iterate_controller.h | 124 +++++ 
impl/matmul/modules/matmul_module.h | 41 ++ impl/matmul/modules/matmul_params.h | 470 ++++++++++++++++++ impl/matmul/modules/matmul_type_def.h | 33 ++ 9 files changed, 1000 insertions(+) create mode 100644 impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h create mode 100644 impl/matmul/modules/feature_trait/matmul_chip_cap.h create mode 100644 impl/matmul/modules/feature_trait/matmul_feature_trait.h create mode 100644 impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h create mode 100644 impl/matmul/modules/input_cache/matmul_input_l1_cache.h create mode 100644 impl/matmul/modules/iterator/matmul_iterate_controller.h create mode 100644 impl/matmul/modules/matmul_module.h create mode 100644 impl/matmul/modules/matmul_params.h create mode 100644 impl/matmul/modules/matmul_type_def.h diff --git a/impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h b/impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h new file mode 100644 index 00000000..90f55078 --- /dev/null +++ b/impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h @@ -0,0 +1,143 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! 
+* \file matmul_unit_flag_params.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_L0C_BUFFER_H +#define IMPL_MATMUL_MODULES_MATMUL_L0C_BUFFER_H + +#include "../matmul_module.h" +#include "../matmul_param.h" + +namespace matmul { + +enum class UNIT_FLAG_CTRL : uint8_t { + DISABLE, + RESERVED, + CHECK, + SET, +}; + +template +class MatmulL0CBuffer +{ +public: + template + __aicore__ inline void InitL0CBuffer(uint32_t lenFactor = 1) + { + MATMUL_PARAM_VAR.tpipe_->InitBuffer(MATMUL_PARAM_VAR.CO1_, lenFactor * MATMUL_PARAM_VAR.baseMN_ * sizeof(L0cT)); + } + + template + __aicore__ inline LocalTensor LoadL0CTensor() + { + return MATMUL_PARAM_VAR.CO1_.template Get(); + } + + template + __aicore__ inline LocalTensor AllocL0CLocalTensor() + { + LocalTensor co1Local; + co1Local = MATMUL_PARAM_VAR.cMatrix_; + return co1Local; + } + + template + __aicore__ inline void FreeL0CLocalTensor(LocalTensor &co1Local) + {} + + __aicore__ inline void ResetL0CEventStatus() + { + event_t eventIDFixToM = static_cast(GetTPipePtr()->FetchEventID(HardEvent::FIX_M)); + SetFlag(eventIDFixToM); + WaitFlag(eventIDFixToM); + } + + __aicore__ inline bool IsL0CLastIter(int l0CIterNum) const + { + return l0CIterNum == MATMUL_CONST_PARAM_VAR.kIter_ - 1; + } + + __aicore__ inline uint8_t GetMmadUnitFlagCtrl(bool isFinalCompute) const + { + return static_cast(isFinalCompute ? 
UNIT_FLAG_CTRL::SET : UNIT_FLAG_CTRL::CHECK); + } + + template + __aicore__ inline void SetFixpipeUnitFlag(FIX_PIPE_PARAMS& fixpipeParams) + { + fixpipeParams.unitFlag = FIX_PIPE_UNIT_FLAG; + } + +private: + constexpr static uint8_t FIX_PIPE_UNIT_FLAG = 3; +}; + +template +class MatmulL0CBuffer +{ +public: + template + __aicore__ inline void InitL0CBuffer(uint32_t lenFactor = 1) + { + if (MATMUL_PARAM_VAR.tiling_->dbL0C == 2) { + MATMUL_PARAM_VAR.tpipe_->InitBuffer( + MATMUL_PARAM_VAR.CO1_, 2, lenFactor * MATMUL_PARAM_VAR.baseMN_ * sizeof(L0cT)); + } else { + MATMUL_PARAM_VAR.tpipe_->InitBuffer( + MATMUL_PARAM_VAR.CO1_, 1, lenFactor * MATMUL_PARAM_VAR.baseMN_ * sizeof(L0cT)); + } + } + + template + __aicore__ inline LocalTensor LoadL0CTensor() + { + return MATMUL_PARAM_VAR.CO1_.template AllocTensor(); + } + + template + __aicore__ inline LocalTensor AllocL0CLocalTensor() + { + LocalTensor co1Local; + MATMUL_PARAM_VAR.CO1_.EnQue(MATMUL_PARAM_VAR.cMatrix_); + co1Local = MATMUL_PARAM_VAR.CO1_.template DeQue(); + return co1Local; + } + + template + __aicore__ inline void FreeL0CLocalTensor(LocalTensor &co1Local) + { + MATMUL_PARAM_VAR.CO1_.FreeTensor(co1Local); + } + + __aicore__ inline void ResetL0CEventStatus() + { + MATMUL_PARAM_VAR.CO1_.FreeAllEvent(); + } + + __aicore__ inline bool IsL0CLastIter(int l0CIterNum) const + { + return false; + } + + __aicore__ inline uint8_t GetMmadUnitFlagCtrl(bool isFinalCompute) const + { + return static_cast(UNIT_FLAG_CTRL::DISABLE); + } + + template + __aicore__ inline void SetFixpipeUnitFlag(FIX_PIPE_PARAMS& fixpipeParams) + {} +}; + +} +#endif // _MATMUL_L0C_BUFFER_H_ \ No newline at end of file diff --git a/impl/matmul/modules/feature_trait/matmul_chip_cap.h b/impl/matmul/modules/feature_trait/matmul_chip_cap.h new file mode 100644 index 00000000..f88bc3e5 --- /dev/null +++ b/impl/matmul/modules/feature_trait/matmul_chip_cap.h @@ -0,0 +1,71 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_chip_cap.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H +#define IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H + +namespace matmul { + +class MatmulChipCap +{ +public: + struct Feature { + bool supportUnitFlag; + }; + + __aicore__ constexpr static const Feature& GetFeatures() + { + return features[GetChipType()]; + } + +private: + enum { + CHIP_TYPE_100, + CHIP_TYPE_200, + CHIP_TYPE_220, + CHIP_TYPE_300, + CHIP_TYPE_310, + CHIP_TYPE_MAX, + }; + + __aicore__ inline constexpr static uint8_t GetChipType() + { + #if __CCE_AICORE__ == 100 + return CHIP_TYPE_100; + #elif __CCE_AICORE__ == 200 + return CHIP_TYPE_200; + #elif __CCE_AICORE__ == 220 + return CHIP_TYPE_220; + #elif __CCE_AICORE__ == 300 + return CHIP_TYPE_300; + #elif __CCE_AICORE__ == 310 + return CHIP_TYPE_310; + #else + return CHIP_TYPE_MAX; + #endif + } + +private: + constexpr static Feature features[CHIP_TYPE_MAX] = { + /* supportUnitFlag */ + /*100*/ {false,}, + /*200*/ {false,}, + /*220*/ {true,}, + /*300*/ {true,}, + /*310*/ {true,} + }; +}; + +} +#endif // _MATMUL_CHIP_CAP_H_ \ No newline at end of file diff --git a/impl/matmul/modules/feature_trait/matmul_feature_trait.h b/impl/matmul/modules/feature_trait/matmul_feature_trait.h new file mode 100644 index 00000000..e69100f7 --- /dev/null +++ b/impl/matmul/modules/feature_trait/matmul_feature_trait.h @@ -0,0 +1,40 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_feature.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H +#define IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H + +#include "../../matmul_utils.h" +#include "matmul_chip_cap.h" +#include "matmul_iter_ctrl_cfg.h" + +namespace matmul { + +template +class MatmulFeatureTrait { +public: + static constexpr MatmulIterCtrlCfg iterCtrlCfg { + .isFixedStep = DoMatmulSpecialBasicBlock(MM_CFG), + .stepM = MM_CFG.stepM, + .stepN = MM_CFG.stepN, + .iterOrder = IterateOrder::UNDEF, + }; + + __aicore__ inline constexpr static bool IsUnitFlagEnabled() + { + return EnUnitFlag(MM_CFG) && MatmulChipCap::GetFeatures().supportUnitFlag; + } +}; +} +#endif // _MATMUL_FEATURE_TRAIT_H_ \ No newline at end of file diff --git a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h new file mode 100644 index 00000000..12a7c679 --- /dev/null +++ b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h @@ -0,0 +1,30 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. 
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_iter_ctrl_cfg.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H +#define IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H + +#include "../../../../lib/matmul/tiling.h" + +namespace matmul { + +struct MatmulIterCtrlCfg { + bool isFixedStep; + int32_t stepM; + int32_t stepN; + IterateOrder iterOrder; +}; + +} +#endif // _MATMUL_ITER_CTRL_CFG_H_ \ No newline at end of file diff --git a/impl/matmul/modules/input_cache/matmul_input_l1_cache.h b/impl/matmul/modules/input_cache/matmul_input_l1_cache.h new file mode 100644 index 00000000..763c7f71 --- /dev/null +++ b/impl/matmul/modules/input_cache/matmul_input_l1_cache.h @@ -0,0 +1,48 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! 
+* \file matmul_input_l1_cache.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_INPUT_L1_CACHE_H +#define IMPL_MATMUL_MODULES_MATMUL_INPUT_L1_CACHE_H + +#include "../matmul_module.h" + +namespace matmul { + +template +class MatmulInputL1Cache +{ +public: + __aicore__ inline void ClearAL1Cache() + { + if constexpr (!PhyPosIsL1(A_TYPE::pos)) { + if (MATMUL_PARAM_VAR.cacheProcA_ > 0) { + MATMUL_PARAM_VAR.qidA1Cache_.FreeTensor(MATMUL_PARAM_VAR.cacheHeadA1_); + MATMUL_PARAM_VAR.cacheProcA_ = 0; + } + } + } + + __aicore__ inline void ClearBL1Cache() + { + if constexpr (!PhyPosIsL1(B_TYPE::pos)) { + if (MATMUL_PARAM_VAR.cacheProcB_ > 0) { + MATMUL_PARAM_VAR.qidB1Cache_.FreeTensor(MATMUL_PARAM_VAR.cacheHeadB1_); + MATMUL_PARAM_VAR.cacheProcB_ = 0; + } + } + } +}; + +} +#endif // _MATMUL_INPUT_L1_CACHE_H_ \ No newline at end of file diff --git a/impl/matmul/modules/iterator/matmul_iterate_controller.h b/impl/matmul/modules/iterator/matmul_iterate_controller.h new file mode 100644 index 00000000..24403997 --- /dev/null +++ b/impl/matmul/modules/iterator/matmul_iterate_controller.h @@ -0,0 +1,124 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_iterate_controller.h + * \brief + */ +#ifndef IMPL_MATMUL_MODULES_MATMUL_ITERATOR_CONTROLLER_H +#define IMPL_MATMUL_MODULES_MATMUL_ITERATOR_CONTROLLER_H + +#include "../matmul_module.h" +#include "../matmul_param.h" +#include "../feature_trait/matmul_iter_ctrl_cfg.h" +#include "../input_cache/matmul_input_l1_cache.h" + +namespace matmul { + +template +class MatmulIterateController +{ + MATMUL_USE_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE); + +public: + __aicore__ inline bool MoveNext() + { + if (unlikely(MATMUL_PARAM_VAR.isFirstIter_)) { + return MoveOnFirstIterate(); + } + if constexpr (ITER_CFG.iterOrder == IterateOrder::UNDEF) { + auto& var = MATMUL_PARAM_VAR; + if (likely(var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_M))) { + return MoveOnIterateOrderM(); + } else { + ASCENDC_ASSERT((var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_N)), { + KERNEL_LOG(KERNEL_ERROR, "iterateOrder is %d , which should be ORDER_N", + var.tiling_->iterateOrder); + }); + return MoveOnIterateOrderN(); + } + } else if (ITER_CFG.iterOrder == IterateOrder::ORDER_M) { + return MoveOnIterateOrderM(); + } else { + return MoveOnIterateOrderN(); + } + } + + __aicore__ inline void Reset() { + MATMUL_PARAM_VAR.isFirstIter_ = true; + } + +private: + __aicore__ inline bool MoveOnFirstIterate() + { + auto& var = MATMUL_PARAM_VAR; + var.isFirstIter_ = false; + var.curM_ = 0; + var.curN_ = 0; + var.stepMIdx_ = 0; + var.stepNIdx_ = 0; + var.curStepM_ = + (var.mIter_ - var.curM_) > var.tiling_->stepM ? + var.tiling_->stepM : (var.mIter_ - var.curM_); + var.curStepN_ = + (var.nIter_ - var.curN_) > var.tiling_->stepN ? 
+ var.tiling_->stepN : (var.nIter_ - var.curN_); + return true; + } + + __aicore__ inline bool MoveOnIterateOrderM() + { + auto& var = MATMUL_PARAM_VAR; + // Output along M axis + if (++var.curN_ >= var.stepNIdx_ + var.curStepN_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearAL1Cache(); + var.curN_ = var.stepNIdx_; + if (++var.curM_ >= var.mIter_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearBL1Cache(); + var.curM_ = 0; + var.stepNIdx_ += var.curStepN_; + if (var.stepNIdx_ >= var.nIter_) { + return false; + } + var.curN_ = var.stepNIdx_; + var.curStepN_ = + (var.nIter_ - var.curN_) > var.tiling_->stepN ? + var.tiling_->stepN : (var.nIter_ - var.curN_); + } + } + return true; + } + + __aicore__ inline bool MoveOnIterateOrderN() + { + auto& var = MATMUL_PARAM_VAR; + if (++var.curM_ >= var.stepMIdx_ + var.curStepM_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearBL1Cache(); + var.curM_ = var.stepMIdx_; + if (++var.curN_ >= var.nIter_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearAL1Cache(); + var.curN_ = 0; + var.stepMIdx_ += var.curStepM_; + if (var.stepMIdx_ >= var.mIter_) { + return false; + } + var.curM_ = var.stepMIdx_; + var.curStepM_ = + (var.mIter_ - var.curM_) > var.tiling_->stepM ? + var.tiling_->stepM : (var.mIter_ - var.curM_); + } + } + return true; + } +}; + +} + +#endif \ No newline at end of file diff --git a/impl/matmul/modules/matmul_module.h b/impl/matmul/modules/matmul_module.h new file mode 100644 index 00000000..dd151420 --- /dev/null +++ b/impl/matmul/modules/matmul_module.h @@ -0,0 +1,41 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. 
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_entity_macro.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_ENTITY_MACRO_H +#define IMPL_MATMUL_MODULES_MATMUL_ENTITY_MACRO_H + +#define MATMUL_ENTITY ENTITY + +#define MATMUL_ENTITY_IMPL_TYPE MatmulImpl + +#define MATMUL_IMPORT_MODULE(NAME, ...) private NAME + +#define MATMUL_MODULE_NAME(NAME) NAME##Module + +#define MATMUL_USE_MODULE_OF(NAME, ENTITY, ...) using MATMUL_MODULE_NAME(NAME) = NAME +#define MATMUL_USE_MODULE(NAME, ...) MATMUL_USE_MODULE_OF(NAME, MATMUL_ENTITY, ##__VA_ARGS__) + +#define MATMUL_USE_IMPORTED_MODULE(NAME, ...) \ +MATMUL_USE_MODULE_OF(NAME, MATMUL_ENTITY_IMPL_TYPE, ##__VA_ARGS__); \ +friend class NAME + +#define MATMUL_PARAMS_OF(ENTITY) static_cast(this)->var +#define MATMUL_PARAM_VAR MATMUL_PARAMS_OF(MATMUL_ENTITY) + +#define MATMUL_CONST_PARAMS_OF(ENTITY) ((const ENTITY*)(this))->var +#define MATMUL_CONST_PARAM_VAR MATMUL_CONST_PARAMS_OF(MATMUL_ENTITY) + +#define MATMUL_MODULE(NAME) (*static_cast(static_cast(this))) + +#endif // _MATMUL_ENTITY_MACRO_H_ \ No newline at end of file diff --git a/impl/matmul/modules/matmul_params.h b/impl/matmul/modules/matmul_params.h new file mode 100644 index 00000000..d114f1b8 --- /dev/null +++ b/impl/matmul/modules/matmul_params.h @@ -0,0 +1,470 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_impl.h + * \brief + */ +#ifndef IMPL_MATMUL_MODULES_PARAMS_H +#define IMPL_MATMUL_MODULES_PARAMS_H + +#include "kernel_macros.h" + +#include "lib/matmul/tiling.h" +#include "kernel_operator.h" +#include "../matmul_utils.h" +#include "matmul_type_def.h" + +namespace matmul { +/* ************************************************************************************************** + * MatmulParamsBase * + * ************************************************************************************************* */ +template +struct MatmulParamsBase { + __aicore__ inline MatmulParamsBase() {}; +}; + +template +struct MatmulParamsNorm : public MatmulParamsBase { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulParamsNorm() {}; + using SrcT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + TQue qidBias_; + typename L0cType::BUFFER CO1_; +#if __CCE_AICORE__ < 220 + TQue qidA2_; + TQue qidB2_; + TQue qidVecIn_; + TQue qidCO2_; + + typename QidType::QUE qidA1_; + typename QidType::QUE qidB1_; + typename QidType::QUE qidA1Cache_; + typename QidType::QUE qidB1Cache_; +#else + TQue qidA1_; + TQue qidB1_; + TQue qidA1Cache_; + TQue qidB1Cache_; +#endif + + LocalTensor cMatrix_; + + LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ + LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ + LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ + + SrcT aScalar_; + SrcT bScalar_; + DEBUG_CODE(int calCount_ = 0); + + TBuffAddr leftMatrix_; + TBuffAddr rightMatrix_; + TBuffAddr inputBias_; + + __gm__ SrcT* aGlobal_; + 
__gm__ SrcBT* bGlobal_; + __gm__ BiasT* biasGlobal_; + + TPipe* tpipe_; + const TCubeTiling* __restrict tiling_; + __gm__ uint8_t* cacheWorkspaceAddr; + +#if __CCE_AICORE__ < 220 + __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; + LocalTensor localWorkspace; + int nd2nz0ffset = 0; + int transOffset = 0; + int co2Offset = 0; +#endif + + int singleCoreM_; + int singleCoreN_; + int singleCoreK_; + // iterate nums in mnk axis + int mIter_; + int nIter_; + int kIter_; + + // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases + // measured in element + int baseUseM_; + int baseUseK_; + int baseUseN_; + // measured in cube block + int blockUseM_; + int blockUseK_; + int blockUseN_; + + int32_t cacheProcA_, cacheProcB_; + bool isFirstIter_; + bool isTransposeA_; // whether A matrix need to transpose + bool isTransposeB_; // whether B matrix need to transpose + // whether enbale bias, default value is false + bool enableBias_; + + int tailM_, tailK_, tailN_; + // current c matrix coordinate + int curM_, curN_; + // current c matrix step size, there could be tail steps + int curStepM_, curStepN_; + // current c matrix step block coordinate + int stepMIdx_, stepNIdx_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + uint8_t subBlockIdx_; + + int baseMK_; + int baseKN_; + int baseMN_; + + int cacheA1Size_, cacheB1Size_; + int depthA1_, depthB1_; +#if __CCE_AICORE__ >= 220 + int sMadMStep_ = 0; + int sMadNStep_ = 0; +#endif + uint64_t dataPtr_; + uint64_t tilingPtr_; +}; + +template +struct MatmulParamsNormQuant : public MatmulParamsNorm { + __aicore__ inline MatmulParamsNormQuant() {}; + TQue qidFixPipe_; + uint64_t quantScalar_ = 0; + GlobalTensor quantTensor_; + // 0: no quant, 1: deqf16, 2: vdeqf16, 3: QF322B8_PRE, 4: VQF322B8_PRE, 5: REQ8(s32->u8/s8), 6: VREQ8(s32->u8/s8) + uint8_t quantMode_ = 0; +}; + +template +struct MatmulParamsMDL : public MatmulParamsBase { + using L0cT = typename GetDstType::Type; + 
__aicore__ inline MatmulParamsMDL() {}; + using SrcT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + + TQue qidBias_; + TQue qidFixPipe_; + typename L0cType::BUFFER CO1_; + TQue qidA1_; + TQue qidB1_; +#if __CCE_AICORE__ < 220 + TQue qidA2_; + TQue qidB2_; + TQue qidVecIn_; + TQue qidCO2_; + + typename QidType::QUE qidA12UBCache_; + typename QidType::QUE qidB12UBCache_; +#endif + + LocalTensor cMatrix_; + + LocalTensor cacheA1Ping_; + LocalTensor cacheA1Pong_; + LocalTensor cacheB1Ping_; + LocalTensor cacheB1Pong_; + bool cacheA1IsCachingPing_; + bool cacheA1IsCachingPong_; + bool cacheB1IsCachingPing_; + bool cacheB1IsCachingPong_; + + DEBUG_CODE(int calCount_ = 0); + + TBuffAddr leftMatrix_; + TBuffAddr rightMatrix_; + TBuffAddr inputBias_; + + __gm__ SrcT* aGlobal_; + __gm__ SrcBT* bGlobal_; + __gm__ BiasT* biasGlobal_; + + TPipe* tpipe_; + const TCubeTiling* __restrict tiling_; + __gm__ uint8_t* cacheWorkspaceAddr; + +#if __CCE_AICORE__ < 220 + __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; + LocalTensor localWorkspace; + LocalTensor cacheHeadA12UB_; // Allocate and release using qidA12UBCache_ + LocalTensor cacheHeadB12UB_; // Allocate and release using qidB12UBCache_ + int nd2nz0ffset = 0; + int transOffset = 0; + int co2Offset = 0; + int32_t cacheA12UBProcA_ = 0; + int32_t cacheB12UBProcB_ = 0; +#endif + + int singleCoreM_; + int singleCoreN_; + int singleCoreK_; + // iterate nums in mnk axis + int mIter_; + int nIter_; + int kIter_; + // iterate nums in mn step axis + int mStepIter_; + int nStepIter_; + int kaStepIter_; + int kbStepIter_; + int kStepIter_; + int minStepK_; + int kaStepFactor_; + int kbStepFactor_; + + // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases + // in unit of element + int baseUseM_; + int baseUseK_; + int baseUseN_; + // in unit of cube block + int blockUseM_; + int blockUseK_; + 
int blockUseN_; + + // in unit of element + int baseUseStepM_; + int baseUseStepN_; + int baseUseStepKa_; + int baseUseStepKb_; + // in unit of cube block + int blockUseStepM_; + int blockUseStepN_; + int blockUseStepKa_; + int blockUseStepKb_; + + bool isFirstIter_; + bool isTransposeA_; // whether A matrix need to transpose + bool isTransposeB_; // whether B matrix need to transpose + // whether enbale bias, default value is false + bool enableBias_; + + // in unit of element + int tailM_, tailK_, tailN_; + // in unit of element + int tailStepM_, tailStepN_, tailStepKa_, tailStepKb_; + // current c matrix coordinate, in unit of baseMN + int curM_, curN_; + // current c matrix step size, in unit of baseMNK , there could be tail steps + int curStepM_, curStepN_; + // current c matrix step block coordinate, in unit of stepMNK + int stepMIdx_, stepNIdx_, stepKaIdx_, stepKbIdx_; + + // stepKa == kIter + bool isA1KFullLoad_, isB1KFullLoad_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + uint8_t subBlockIdx_; + + int baseMK_; + int baseKN_; + int baseMN_; + int cacheA1Factor_, cacheB1Factor_; + uint64_t quantScalar_ = 0; + uint64_t dataPtr_; + uint64_t tilingPtr_; + GlobalTensor quantTensor_; + // 0: no quant, 1: deqf16, 2: vdeqf16; + uint8_t quantMode_ = 0; + // anti quant param. 
+ SrcT antiQuantOffsetScalar_; + SrcT antiQuantScaleScalar_; + LocalTensor antiQuantOffsetTensor_; + LocalTensor antiQuantScaleTensor_; +}; + +template +struct MatmulParamsBasicBlock : public MatmulParamsNorm { + __aicore__ inline MatmulParamsBasicBlock() {}; +}; + +template +struct MatmulParamsIBShareNorm : public MatmulParamsBase { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulParamsIBShareNorm() {}; + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + TQue qidBias_; + typename L0cType::BUFFER CO1_; + + TQue qidA2_; + TQue qidB2_; + TQue qidVecIn_; + TQue qidCO2_; + + typename QidType::QUE qidA1_; + typename QidType::QUE qidA1Cache_; + typename QidType::QUE qidB1_; + typename QidType::QUE qidB1Cache_; + + LocalTensor cMatrix_; + + LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ + LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ + LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ + + SrcT aScalar_; + SrcT bScalar_; + DEBUG_CODE(int calCount_ = 0); + + TBuffAddr leftMatrix_; + TBuffAddr rightMatrix_; + TBuffAddr inputBias_; + + __gm__ SrcT* aGlobal_; + __gm__ SrcT* bGlobal_; + __gm__ BiasT* biasGlobal_; + + TPipe* tpipe_; + const TCubeTiling* __restrict tiling_; + __gm__ uint8_t* cacheWorkspaceAddr; + + int singleCoreM_; + int singleCoreN_; + int singleCoreK_; + // iterate nums in mnk axis + int mIter_; + int nIter_; + int kIter_; + + // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases + // measured in element + int baseUseM_; + int baseUseK_; + int baseUseN_; + // measured in cube block + int blockUseM_; + int blockUseK_; + int blockUseN_; + + int32_t cacheProcA_, cacheProcB_; + bool isFirstIter_; + bool isTransposeA_; // whether A matrix need to transpose + bool isTransposeB_; // whether B matrix need to transpose + // whether enbale bias, default value is false + bool 
enableBias_; + + int tailM_, tailK_, tailN_; + // current c matrix coordinate + int curM_, curN_; + // current c matrix step size, there could be tail steps + int curStepM_, curStepN_; + // current c matrix step block coordinate + int stepMIdx_, stepNIdx_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + uint8_t subBlockIdx_; + + int baseMK_; + int baseKN_; + int baseMN_; + + int cacheA1Size_, cacheB1Size_; + int depthA1_, depthB1_; + uint64_t dataPtr_; + uint64_t tilingPtr_; + + int curCacheIdx_; + GlobalCache gL1GroupCache0_; + GlobalCache gL1GroupCache1_; +}; + +/* ************************************************************************************************** + * MatmulParams * + * ************************************************************************************************* */ +template +struct MatmulParams { + __aicore__ inline MatmulParams(){}; +}; + +// CFG_NORM +#if __CCE_AICORE__ >= 220 +template +struct MatmulParams::value || + IsSameType::value) && + IsSameType::value) || + (IsSameType::value && + (IsSameType::value || + IsSameType::value)))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNorm; +}; +#else +template +struct MatmulParams::value && IsSameType::value) || + (IsSameType::value && IsSameType::value))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNorm; +}; +#endif + +#if __CCE_AICORE__ >= 220 +template +struct MatmulParams::value || + IsSameType::value) && + IsSameType::value) || + (IsSameType::value && + (IsSameType::value || + IsSameType::value)))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNormQuant; +}; +#else +template +struct MatmulParams::value && IsSameType::value) || + (IsSameType::value && IsSameType::value))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNormQuant; +}; +#endif + +// CFG_MDL +template +struct MatmulParams { + __aicore__ inline MatmulParams() {}; + using PARAMS = MatmulParamsMDL; +}; + +// 
MM_CFG_BB +template +struct MatmulParams { + __aicore__ inline MatmulParams() {}; + using PARAMS = MatmulParamsBasicBlock; +}; + +// CFG_IBSHARE_NORM +template +struct MatmulParams { + __aicore__ inline MatmulParams() {}; + using PARAMS = MatmulParamsIBShareNorm; +}; + +} + +#endif \ No newline at end of file diff --git a/impl/matmul/modules/matmul_type_def.h b/impl/matmul/modules/matmul_type_def.h new file mode 100644 index 00000000..88992530 --- /dev/null +++ b/impl/matmul/modules/matmul_type_def.h @@ -0,0 +1,33 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_type_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H +#define IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H + +#include "lib/matmul/tiling.h" + +namespace matmul { +template +struct MatmulType { + constexpr static TPosition pos = POSITION; + constexpr static CubeFormat format = FORMAT; + using T = TYPE; + constexpr static bool isTrans = ISTRANS; + constexpr static LayoutMode layout = LAYOUT; + constexpr static bool ibShare = IBSHARE; +}; + +} +#endif // _MATMUL_TYPE_DEF_H_ \ No newline at end of file -- Gitee From 0a84b319e90f0cc78162bb2b78149980c6c2c999 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 06:39:38 +0000 Subject: [PATCH 4/8] rename impl/matmul/modules/matmul_params.h to impl/matmul/modules/matmul_param.h. 
Signed-off-by: jiangchengcheng-on --- impl/matmul/modules/{matmul_params.h => matmul_param.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename impl/matmul/modules/{matmul_params.h => matmul_param.h} (100%) diff --git a/impl/matmul/modules/matmul_params.h b/impl/matmul/modules/matmul_param.h similarity index 100% rename from impl/matmul/modules/matmul_params.h rename to impl/matmul/modules/matmul_param.h -- Gitee From e14d1c3fce56310bc60d63ea47c569a1975dbf64 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 06:55:32 +0000 Subject: [PATCH 5/8] del fmod not support Signed-off-by: jiangchengcheng-on --- tests/tiling/test_tiling.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index d9cfa951..f2f78fe3 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -1194,28 +1194,6 @@ TEST_F(TestTiling, TestSwiGLUFactorHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestFmodTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto fmodShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFmodMaxMinTmpSize(fmodShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); -} - -TEST_F(TestTiling, TestFmodTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto fmodShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFmodMaxMinTmpSize(fmodShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 128 * 3 * 4); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); -} - TEST_F(TestTiling, TestTruncTilingFloat) { std::vector shapeDims = { 128, 128 }; -- Gitee From c9e41e9946875ae6c0ff11157dee69841fdea7a1 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 07:04:11 +0000 Subject: [PATCH 6/8] add llt macro def Signed-off-by: jiangchengcheng-on --- tests/matmul/matmul_module_test_def.h | 25 
+++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 tests/matmul/matmul_module_test_def.h diff --git a/tests/matmul/matmul_module_test_def.h b/tests/matmul/matmul_module_test_def.h new file mode 100644 index 00000000..8448a21d --- /dev/null +++ b/tests/matmul/matmul_module_test_def.h @@ -0,0 +1,25 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_module_test_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_MODULE_TEST_DEF_H +#define IMPL_MATMUL_MODULES_MATMUL_MODULE_TEST_DEF_H + + +#ifdef MATMUL_IMPORT_MODULE +#undef MATMUL_IMPORT_MODULE +#define MATMUL_IMPORT_MODULE(NAME, ...) 
public NAME +#endif + + +#endif // IMPL_MATMUL_MODULES_MATMUL_MODULE_TEST_DEF_H \ No newline at end of file -- Gitee From f0733f1e511d3d65f5524262bbaee2d0a76c0ef6 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 07:27:49 +0000 Subject: [PATCH 7/8] restore for tiling api Signed-off-by: jiangchengcheng-on --- tests/tiling/test_tiling.cpp | 1924 +--------------------------------- 1 file changed, 14 insertions(+), 1910 deletions(-) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index f2f78fe3..219b8605 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -77,25 +77,6 @@ TEST_F(TestTiling, PlatformConstructor) EXPECT_EQ(ret, 0); } -TEST_F(TestTiling, TestInt4BaseK) -{ - matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, - .l0CSize = 131072, .ubSize = 196608, .l0ASize = 65536, .l0BSize = 65536}; - MatmulApiTiling tiling(plat); - tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); - tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); - tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); - tiling.SetBias(false); - tiling.SetShape(144, 256, 32); - tiling.SetOrgShape(144, 256, 32); - tiling.SetBufferSpace(256 * 1024, 128 * 1024, -1); - optiling::TCubeTiling tilingData; - int ret = tiling.GetTiling(tilingData); - tiling.PrintTilingData(); - EXPECT_EQ(tilingData.get_baseK() % 64, 0); - EXPECT_EQ(ret, 0); -} - TEST_F(TestTiling, Tiling_310p_NotAligned) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND310P, .l1Size = 1048576, @@ -742,105 +723,6 @@ TEST_F(TestTiling, TestSetBufferSpace) EXPECT_EQ(tiling.bufferPool_.l1Size, 1024); } -TEST_F(TestTiling, TestCosTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - 
AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 3); - AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetCosTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestCosTilingFloat512) -{ - std::vector shapeDims = { 512 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestCosTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetCosMaxMinTmpSize(cosShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); - EXPECT_EQ(minValue, 256 * 8); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetCosTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAtanTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAtanMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 5); - EXPECT_EQ(minValue, 256 * 5); -} - -TEST_F(TestTiling, TestAtanTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAtanMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 12); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 12); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - 
GetAtanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 12); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestClampTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetClampMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 1); - EXPECT_EQ(minValue, 64 * 1); -} - -TEST_F(TestTiling, TestClampTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetClampMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetClampTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TestSoftMaxTiling) { std::vector shapeDims = { 128, 128 }; @@ -954,6 +836,7 @@ TEST_F(TestTiling, TestSoftMaxFlashV2Tiling) SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); EXPECT_EQ(tilingData.get_reduceM(), 64); } + TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) { std::vector shapeDims = { 8, 1024 }; @@ -982,163 +865,6 @@ TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) EXPECT_EQ(tilingData.get_reduceM(), 8); } -TEST_F(TestTiling, TestAsinTmpBufferFacotrHalfWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAsinTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 6); - EXPECT_EQ(extraBuffer, 0); -} - -TEST_F(TestTiling, TestAsinTmpBufferFacotrFloatWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAsinTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 2); - EXPECT_EQ(extraBuffer, 0); -} - -TEST_F(TestTiling, 
TestAsinTilingHalf128) -{ - std::vector shapeDims = { 128 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 256 * 6); - EXPECT_EQ(minValue, 256 * 6); -} - -TEST_F(TestTiling, TestAsinTilingFloat) -{ - std::vector shapeDims = { 32 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 256 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestAsinTilingHalf16K) -{ - std::vector shapeDims = { 128, 128 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); - EXPECT_EQ(minValue, 256 * 6); -} - -TEST_F(TestTiling, TestAsinTilingFloat16K) -{ - std::vector shapeDims = { 128, 128 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestSinhTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto sinhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinhMaxMinTmpSize(sinhShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSinhTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto sinhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinhMaxMinTmpSize(sinhShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 4); - 
EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestRoundTiling) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - std::vector shapeDims = { 128, 128 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - platform_ascendc::SocVersion socVersion = plat.GetSocVersion(); - GetRoundMaxMinTmpSize(plat, tanShape, 4, false, maxValue, minValue); - GetRoundTmpBufferFactorSize(plat, 4, maxLiveNodeCnt, extraBuf); - GetRoundMaxMinTmpSize(plat, tanShape, 2, false, maxValue, minValue); - GetRoundTmpBufferFactorSize(plat, 2, maxLiveNodeCnt, extraBuf); -} - -TEST_F(TestTiling, TestTanTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 4); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTanTilingFloat512) -{ - std::vector shapeDims = { 512 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 4); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTanTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTanMaxMinTmpSize(tanShape, 2, false, maxValue, minValue); 
- EXPECT_EQ(maxValue, 128 * 128 * 10 * 2); - EXPECT_EQ(minValue, 256 * 10); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 10); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TEstSwiGLUTilingHalf) { std::vector shapeDims = {10, 512}; @@ -1194,110 +920,6 @@ TEST_F(TestTiling, TestSwiGLUFactorHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestTruncTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto truncShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTruncMaxMinTmpSize(truncShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTruncTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTruncTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto truncShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTruncTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTruncTilingHalf512) -{ - std::vector shapeDims = { 512 }; - auto truncShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 2 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestAcosTmpBufferFacotrHalfWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAcosTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 6); - EXPECT_EQ(extraBuffer, 0); -} - - -TEST_F(TestTiling, 
TestAcosTmpBufferFacotrFloatWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAcosTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 2); - EXPECT_EQ(extraBuffer, 0); -} - -TEST_F(TestTiling, TestAcosTilingHalf128) -{ - std::vector shapeDims = { 128 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 6); - EXPECT_EQ(maxValue, 256 * 6); -} - -TEST_F(TestTiling, TestAcosTilingFloat) -{ - std::vector shapeDims = { 32 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 256 * 2); -} - -TEST_F(TestTiling, TestTanhTiling) -{ - uint32_t maxVal = 0; - uint32_t minVal = 0; - GetTanhMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 128 * 4 * 1); - EXPECT_EQ(minVal, 256 * 1); - GetTanhMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4); - EXPECT_EQ(minVal, 256 * 4); - uint32_t extraBuf = 123; - uint32_t maxLivedNodesCnt = 123; - GetTanhTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 1); - GetTanhTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 4); -} - TEST_F(TestTiling, TestSigmoidTiling) { std::vector shapeDims = { 128 }; @@ -1309,329 +931,28 @@ TEST_F(TestTiling, TestSigmoidTiling) EXPECT_EQ(minVal, 256); } -TEST_F(TestTiling, TestLogTilingMaxMin) -{ - std::vector shapeDims = { 128 }; - auto logShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - GetLogMaxMinTmpSize(logShape, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetLog2MaxMinTmpSize(logShape, 4, false, maxVal, minVal); - 
EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetLog2MaxMinTmpSize(logShape, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 4 * 128); - EXPECT_EQ(minVal, 256); - GetLog10MaxMinTmpSize(logShape, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); -} - -TEST_F(TestTiling, TestLogTilingFactor) -{ - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetLogTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetLog10TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetLog2TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); - GetLog2TmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestPowerTiling) -{ - std::vector shapeDims = { 512 }; - auto powerShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - GetPowerMaxMinTmpSize(powerShape, powerShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 4 + 256); - EXPECT_EQ(minVal, 256 * 4 + 256); - GetPowerMaxMinTmpSize(powerShape, powerShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 6); - EXPECT_EQ(minVal, 256 * 6); - GetPowerMaxMinTmpSize(powerShape, powerShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - std::vector scalar_shape = { 1 }; - auto scalarShape = ge::Shape(scalar_shape); - GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - GetPowerMaxMinTmpSize(powerShape, scalarShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 7); - EXPECT_EQ(minVal, 256 * 7); - GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 5 + 256); - EXPECT_EQ(minVal, 256 * 5 + 
256); - - std::vector shape1 = { 16 }; - auto powerShape1 = ge::Shape( shape1 ); - GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 5 + 256); - EXPECT_EQ(minVal, 256 * 5 + 256); - GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 7 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - GetPowerMaxMinTmpSize(powerShape1, scalarShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 7); - EXPECT_EQ(minVal, 256 * 7); - GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4 + 256); - EXPECT_EQ(minVal, 256 * 4 + 256); - GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 7 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - GetPowerMaxMinTmpSize(powerShape1, powerShape1, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 6); - EXPECT_EQ(minVal, 256 * 6); -} - -TEST_F(TestTiling, TestPowerTilingFactorSize) +TEST_F(TestTiling, TestLayernormTiling) { - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetPowerTmpBufferFactorSize(false, true, false, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 5); - EXPECT_EQ(extraBuf, 256); - GetPowerTmpBufferFactorSize(false, true, true, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 7); - EXPECT_EQ(extraBuf, 0); - GetPowerTmpBufferFactorSize(false, true, false, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 14); - EXPECT_EQ(extraBuf, 256); - GetPowerTmpBufferFactorSize(true, true, false, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 256); - GetPowerTmpBufferFactorSize(true, true, true, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 6); - EXPECT_EQ(extraBuf, 0); - GetPowerTmpBufferFactorSize(true, true, false, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 14); - EXPECT_EQ(extraBuf, 256); -} + const 
uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; -TEST_F(TestTiling, TestAcosTilingHalf16K) -{ - std::vector shapeDims = { 128, 128 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); - EXPECT_EQ(minValue, 256 * 6); -} + std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; + auto layernormShape = ge::Shape(shapeDims); + const bool isReuseSource = false; + optiling::LayerNormTiling tilling; -TEST_F(TestTiling, TestAcosTilingFloat16K) -{ - std::vector shapeDims = { 128, 128 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestAsinhTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto asinhShape = ge::Shape(shapeDims); uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAsinhMaxMinTmpSize(asinhShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); - AscendC::GetAsinhMaxMinTmpSize(ge::Shape({32}), 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - EXPECT_EQ(maxValue, 256 * 3); + AscendC::GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, maxValue, minValue); + EXPECT_EQ(maxValue, 3 * (128 * 128 * 128) * typeSize + 2 * (128 * 128) * typeSize); + EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAsinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); + AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); + EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); } -TEST_F(TestTiling, 
TestAsinhTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto asinhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAsinhMaxMinTmpSize(asinhShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAsinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAcoshTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto acoshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAcoshMaxMinTmpSize(acoshShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetAcoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAcoshTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto acoshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAcoshMaxMinTmpSize(acoshShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAcoshTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxis) -{ - const auto shape = ge::Shape({ 8, 128 }); - const auto scalarShape = ge::Shape({1}); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 
+ 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxisLargeShape) -{ - const auto shape = ge::Shape({ 128, 128 }); - const auto scalarShape = ge::Shape({1}); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxisSmallShape) -{ - const auto shape = ge::Shape({ 1, 16 }); - const auto scalarShape = ge::Shape({1}); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxis) -{ - const auto srcShape = ge::Shape({ 8, 128 }); - const auto scalarShape = ge::Shape({1}); - const auto maskShape = ge::Shape({ 8, 160 }); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512 + 8 * 128); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512 + 8 * 128); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); -} - -TEST_F(TestTiling, 
TestSelectWithBytesMaskTilingDiffAxisLargeShape) -{ - const auto srcShape = ge::Shape({ 128, 128 }); - const auto scalarShape = ge::Shape({1}); - const auto maskShape = ge::Shape({ 128, 160 }); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512 + 128 * 128); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512 + 128 * 128); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxisSmallShape) -{ - const auto srcShape = ge::Shape({ 1, 16 }); - const auto scalarShape = ge::Shape({1}); - const auto maskShape = ge::Shape({ 1, 32 }); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 1024 + 32); - EXPECT_EQ(maxValue, 1024 + 32); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); -} - -TEST_F(TestTiling, TestLayernormTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; - auto layernormShape = ge::Shape(shapeDims); - const bool isReuseSource = false; - optiling::LayerNormTiling tilling; - - uint32_t minValue = 0; - uint32_t maxValue = 0; - - AscendC::GetLayerNormMaxMinTmpSize(layernormShape, 
typeSize, isReuseSource, maxValue, minValue); - EXPECT_EQ(maxValue, 3 * (128 * 128 * 128) * typeSize + 2 * (128 * 128) * typeSize); - EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); - - AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); - EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); -} - -TEST_F(TestTiling, TestRmsnormTiling) +TEST_F(TestTiling, TestRmsnormTiling) { constexpr uint32_t bLength = 4; constexpr uint32_t sLength = 32; @@ -1911,39 +1232,6 @@ TEST_F(TestTiling, TestDeepnormTiling) EXPECT_EQ(tiling.get_oneTmpSize(), 512); } -TEST_F(TestTiling, TestExpTiling) -{ - std::vector shapeDims = {128, 128}; - auto expShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - - // float isReuseSrc = false 3 tmpBuffer - AscendC::GetExpMaxMinTmpSize(expShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 3 * 256); - EXPECT_EQ(maxValue, 3 * 128 * 128 * 4); - // float isReuseSrc = true 2 tmpBuffer - AscendC::GetExpMaxMinTmpSize(expShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 2 * 256); - EXPECT_EQ(maxValue, 2 * 128 * 128 * 4); - // half 4 tmpBuffer - AscendC::GetExpMaxMinTmpSize(expShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 4 * 256); - EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); - AscendC::GetExpMaxMinTmpSize(expShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 4 * 256); - EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetExpTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); - AscendC::GetExpTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TestMatmulApiTilngFactorSplit1) { MatmulApiTiling tiling; @@ -2285,7 +1573,6 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM4) EXPECT_EQ(res, 0); } - TEST_F(TestTiling, 
TestMatmulApiTilngSingleCoreFullLoadCase) { optiling::TCubeTiling tilingData; @@ -2327,59 +1614,6 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM5) EXPECT_EQ(res, 0); } -TEST_F(TestTiling, TestConcatTiling) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - const uint32_t elemCount = 128; - - AscendC::GetConcatTmpSize(plat, elemCount, 2); -} - -TEST_F(TestTiling, TestSortTiling) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - const uint32_t elemCount = 128; - - AscendC::GetSortTmpSize(plat, elemCount, 4); -} - -TEST_F(TestTiling, TestUnPadTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - optiling::UnPadTiling tiling; - - AscendC::UnPadTilingFunc(srcShape, 0, typeSize, tiling); - AscendC::UnPadTilingFunc(srcShape, stackBufferSize, typeSize, tiling); - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetUnPadMaxMinTmpSize(plat, srcShape, typeSize, maxValue, minValue); -} - -TEST_F(TestTiling, TestPadTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 32, 32}; - std::vector ori_shape_dims = { 32, 31 }; - auto srcShape = ge::Shape(shapeDims); - auto oriSrcShape = ge::Shape(ori_shape_dims); - optiling::PadTiling tiling; - - AscendC::PadTilingFunc(srcShape, oriSrcShape, stackBufferSize, typeSize, tiling); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetPadMaxMinTmpSize(srcShape, typeSize, maxValue, minValue); -} - TEST_F(TestTiling, TestLayernormGradTiling) { const uint32_t stackBufferSize = 100 * 1024; @@ -2431,23 +1665,6 @@ TEST_F(TestTiling, TestLayernormGradBetaTiling) EXPECT_EQ(tiling.get_stackBufferSize(), stackBufferSize / sizeof(float)); } 
-TEST_F(TestTiling, TestConfusionTransposeTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 2; - - std::vector shapeDims = { 1, 2, 64, 32 }; - auto srcShape = ge::Shape(shapeDims); - optiling::ConfusionTransposeTiling tiling; - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 1, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 2, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 3, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 4, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 5, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 6, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 7, tiling); -} - TEST_F(TestTiling, TestMatmulApiTilngL0BNoDB) { MatmulApiTiling tiling; @@ -2784,129 +2001,6 @@ TEST_F(TestTiling, TestMatmulApiTilngSetShapeZero) EXPECT_EQ(ret, -1); } -// #if __CCE_AICORE__ == 200 -// TEST_F(TestTiling, TestPlatformAscendC) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); -// EXPECT_EQ(plat.GetCoreNumVector(), 8); -// EXPECT_EQ(plat.GetCoreNumVector() + plat.GetCoreNumAic() , 18); -// } -// #endif - -// #if __CCE_AICORE__ == 220 -// extern void platfrom_stub_set_num_aic(const char *num); -// extern void platfrom_stub_set_num_aiv(const char *num); -// extern void platfrom_stub_set_num_cub(const char *num); -// extern void platfrom_stub_set_ctl(const char *num); -// extern void platfrom_stub_set_chip_version(const char *num); -// extern void platfrom_stub_set_num(uint32_t num); -// TEST_F(TestTiling, TestPlatformAscendC) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); -// uint64_t ub_size, l1_size, l0; -// uint64_t l2_bw, hbm_bw, bw; 
-// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); -// EXPECT_EQ(ub_size, 196352); -// EXPECT_EQ(l1_size, 524032); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); -// EXPECT_EQ(l0, 65536 * 2); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); -// EXPECT_EQ(l2_bw, 110); -// EXPECT_EQ(hbm_bw, 32); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, bw); -// EXPECT_EQ(plat.GetCoreNum(), 48); -// EXPECT_EQ(plat.GetCoreNumAic(), 24); -// EXPECT_EQ(plat.GetCoreNumAiv(), 48); -// platfrom_stub_set_num_cub("20"); -// EXPECT_EQ(plat.GetCoreNumAic(), 20); -// platfrom_stub_set_num_aiv("40"); -// EXPECT_EQ(plat.GetCoreNumAiv(), 40); -// platfrom_stub_set_ctl("AICore"); -// EXPECT_EQ(plat.GetCoreNumAic(), 24); -// EXPECT_EQ(plat.GetCoreNumAiv(), 24); -// platfrom_stub_set_num_aic("20"); -// EXPECT_EQ(plat.GetCoreNumAic(), 20); -// EXPECT_EQ(plat.GetCoreNumAiv(), 20); -// EXPECT_EQ(bw, 0); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); - -// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 16 * 1024 * 1024); -// platfrom_stub_set_chip_version("Ascend910"); -// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2 * 1024 * 1024); -// EXPECT_EQ(plat.GetSocVersion(), platform_ascendc::SocVersion::ASCEND910); -// EXPECT_EQ(plat.GetCoreNumVector(), 0); -// } -// #endif - -// #if __CCE_AICORE__ == 300 -// extern void 
platfrom_stub_set_num_aic(const char *num); -// extern void platfrom_stub_set_num_aiv(const char *num); -// extern void platfrom_stub_set_num_cub(const char *num); -// extern void platfrom_stub_set_ctl(const char *num); -// extern void platfrom_stub_set_chip_version(const char *num); -// extern void platfrom_stub_set_num(uint32_t num); -// TEST_F(TestTiling, TestPlatformAscendC) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); -// uint64_t ub_size, l1_size, l0; -// uint64_t l2_bw, hbm_bw, bw; -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); -// EXPECT_EQ(ub_size, 248 * 1024); -// EXPECT_EQ(l1_size, 1024 * 1024); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); -// EXPECT_EQ(l0, 65536 * 2); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); -// EXPECT_EQ(l2_bw, 256); -// EXPECT_EQ(hbm_bw, 17); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, bw); -// EXPECT_EQ(plat.GetCoreNum(), 1); -// EXPECT_EQ(plat.GetCoreNumAic(), 1); -// EXPECT_EQ(plat.GetCoreNumAiv(), 1); -// platfrom_stub_set_num_cub("1"); -// EXPECT_EQ(plat.GetCoreNumAic(), 1); -// platfrom_stub_set_num_aiv("1"); -// EXPECT_EQ(plat.GetCoreNumAiv(), 1); -// platfrom_stub_set_ctl("AICore"); -// EXPECT_EQ(plat.GetCoreNumAic(), 1); -// EXPECT_EQ(plat.GetCoreNumAiv(), 1); -// platfrom_stub_set_num_aic("2"); -// EXPECT_EQ(plat.GetCoreNumAic(), 2); -// EXPECT_EQ(plat.GetCoreNumAiv(), 2); -// EXPECT_EQ(bw, 0); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); -// 
EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); - -// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2097152); -// EXPECT_EQ(plat.GetCoreNumVector(), 0); -// } -// #endif - TEST_F(TestTiling, TestMatmulApiTilngInt8Case1) { MatmulApiTiling tiling; @@ -3185,60 +2279,6 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case9) EXPECT_EQ(ret, 0); } -TEST_F(TestTiling, TestErfTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto erfShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfMaxMinTmpSize(erfShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); - EXPECT_EQ(minValue, 256 * 3); -} - -TEST_F(TestTiling, TestErfTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto erfShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfMaxMinTmpSize(erfShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 8); - EXPECT_EQ(minValue, 256 * 8); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetErfTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestErfcTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto erfcShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfcMaxMinTmpSize(erfcShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 7 * 4); - EXPECT_EQ(minValue, 256 * 7); -} - -TEST_F(TestTiling, TestErfcTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto erfcShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfcMaxMinTmpSize(erfcShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 16); - EXPECT_EQ(minValue, 256 * 16); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetErfcTmpBufferFactorSize(2, 
maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 16); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TestMatmulApiTilngInt8Case10) { MultiCoreMatmulTiling tiling; @@ -3317,95 +2357,6 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case13) EXPECT_EQ(ret, 0); } -TEST_F(TestTiling, TestCoshTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto coshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestCoshTilingFloat512) -{ - std::vector shapeDims = { 512 }; - auto coshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 4 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestCoshTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto coshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCoshMaxMinTmpSize(coshShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 6); - EXPECT_EQ(minValue, 256 * 6); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetCoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 6); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSinTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto sinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinMaxMinTmpSize(sinShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 2 * 256); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - GetSinMaxMinTmpSize(sinShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 3 * 256); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - 
-TEST_F(TestTiling, TestSinTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto sinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinMaxMinTmpSize(sinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); - EXPECT_EQ(minValue, 8 * 256); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAscendSumTiling) -{ - uint32_t n = 8; - uint32_t maxValue; - uint32_t minValue; - GetSumMaxMinTmpSize(n, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); - - maxValue = 0; - minValue = 0; - GetSumMaxMinTmpSize(n, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); -} - TEST_F(TestTiling, TestAscendSiluTiling) { std::vector shapeDims = { 512 }; @@ -3428,54 +2379,6 @@ TEST_F(TestTiling, TestAscendSwishTiling) EXPECT_EQ(maxValue, 0); } -TEST_F(TestTiling, TestAscendXorTiling) -{ - std::vector shapeDims = { 128, 128 }; - auto xorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetXorMaxMinTmpSize(xorShape, 2, true, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 2); - EXPECT_EQ(minValue, 1 * 256); - GetXorTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFracTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto fracShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFracMaxMinTmpSize(fracShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFracTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFracTilingHalf) -{ - 
std::vector shapeDims = { 128, 128 }; - auto fracShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFracMaxMinTmpSize(fracShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 131072); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFracTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - #if __CCE_AICORE__ == 220 extern void platfrom_stub_set_chip_version(const char *num); TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Float_Inner64) @@ -3958,15 +2861,6 @@ TEST_F(TestTiling, TestTopkTiling_TopKModeSmall310P_HALF) } #endif -TEST_F(TestTiling, TestArithProgression) -{ - uint32_t maxValue; - uint32_t minValue; - GetArithProgressionMaxMinTmpSize(maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); -} - TEST_F(TestTiling, TestGeGLUTilingFloat) { std::vector shapeDims = { 128, 128 }; @@ -4001,566 +2895,6 @@ TEST_F(TestTiling, TestGeGLUTilingHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestLgammaTilingFp32) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - GetLgammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); - EXPECT_EQ(maxSize, 458752); - EXPECT_EQ(minSize, 1792); - - GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 524288); - EXPECT_EQ(minSize, 2048); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 2048); - EXPECT_EQ(minSize, 2048); - - GetLgammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); - EXPECT_EQ(maxSize, 1792); - EXPECT_EQ(minSize, 1792); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetLgammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestLgammaTilingHalf) -{ - std::vector 
shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - - GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 128 * 128 * 2 * 13 * 2); - EXPECT_EQ(minSize, 13 * 2 * 256); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 256 * 13 * 2); - EXPECT_EQ(minSize, 256 * 13 * 2); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetLgammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 13); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestDigammaTilingFp32) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - GetDigammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); - EXPECT_EQ(maxSize, 393216); - EXPECT_EQ(minSize, 1536); - - GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 458752); - EXPECT_EQ(minSize, 1792); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 1792); - EXPECT_EQ(minSize, 1792); - - GetDigammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); - EXPECT_EQ(maxSize, 1536); - EXPECT_EQ(minSize, 1536); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetDigammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 7); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestDigammaTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - - GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 128 * 128 * 2 * 8 * 2); - EXPECT_EQ(minSize, 8 * 2 * 256); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 256 * 8 * 2); - EXPECT_EQ(minSize, 
256 * 8 * 2); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetDigammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 16); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAtanhTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto aTanhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 1); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAtanhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAtanhTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto aTanhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 4); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAtanhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSignTiling) -{ - std::vector shapeDims = { 128, 128 }; - auto signShape = ge::Shape(shapeDims); - uint32_t signNeedMaxSize; - uint32_t signNeedMinSize; - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSignMaxMinTmpSize(signShape, 2, false, signNeedMaxSize, signNeedMinSize); - EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 2); - EXPECT_EQ(signNeedMinSize, 3 * 256); - - GetSignMaxMinTmpSize(signShape, 4, false, signNeedMaxSize, signNeedMinSize); - EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 4); - EXPECT_EQ(signNeedMinSize, 3 * 256); - - GetSignTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAscendMeanTiling) -{ - uint32_t n = 8; - uint32_t maxValue; - 
uint32_t minValue; - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - - GetMeanMaxMinTmpSize(n, 2, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); - - maxValue = 0; - minValue = 0; - GetMeanMaxMinTmpSize(n, 4, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); - - GetMeanMaxMinTmpSize(n, 2, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 96); - EXPECT_EQ(maxValue, 96); - - GetMeanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -// TEST_F(TestTiling, TestKernelContextBuildBase) -// { -// auto builder = context_ascendc::BuildKernelRunContext(2, 2); -// EXPECT_EQ(builder.kernelInputNum, 2); -// } - - -// TEST_F(TestTiling, TestKernelContextBuild) -// { -// gert::Shape input1_shape = {2, 1, 1, 1, 1, 1, 1, 2, 2}; -// int32_t input1_tensor_buffer[] = {0, 2, 3, 3, 1, 0, 0, 1}; -// gert::TensorData input1_tensor_data{(void*)input1_tensor_buffer, nullptr}; -// gert::Shape output_shape = {5, 3}; -// int64_t output_tensor_buffer[15]; -// gert::TensorData output_tensor_data{(void*)output_tensor_buffer, nullptr}; -// auto kernelHolder = -// context_ascendc::KernelRunContextBuilder() -// .KernelIONum(2, 2) -// .Inputs({reinterpret_cast(&input1_shape), -// reinterpret_cast(&input1_tensor_data)}) -// .Outputs({reinterpret_cast(&output_shape), reinterpret_cast(&output_tensor_data)}) -// .NodeIoNum(1, 1) -// .IrInputNum(1) -// .NodeInputTd(0, ge::DT_INT32, ge::FORMAT_ND, ge::FORMAT_ND) -// .NodeOutputTd(0, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND) -// .Build(); -// auto context = kernelHolder.GetContext(); -// EXPECT_NE(context, nullptr); -// } - -// TEST_F(TestTiling, TestTilingContextBuildWithConstValue) -// { -// string active_type = "gelu"; -// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; -// gert::StorageShape expert_tokens_shape = {{16}, {16}}; -// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; -// 
gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; -// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; -// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; - -// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; - -// std::vector expert_tokens_const_value (16, 1); -// std::vector x_const_value (1024 * 5120, 2.f); -// std::vector bias2_value (16 * 5120, 3.f); -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .SetOpNameType("name", "tpye") -// .NodeIoNum(6, 1) -// .IrInstanceNum({1, 1, 1, 1, 1, 1}) -// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) -// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) -// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) -// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, reinterpret_cast(expert_tokens_const_value.data())) -// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) -// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) -// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) -// .AddAttrs({ -// {"activation", ge::AnyValue::CreateFrom(active_type)}, -// {"inner_precise", ge::AnyValue::CreateFrom(1)} -// }) -// .TilingData(param.get()) -// .Workspace(ws_size) -// .Build(); - -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_NE(tiling_context, nullptr); - -// } - -// TEST_F(TestTiling, TestTilingContextBuildAddInputs) -// { -// string active_type = "gelu"; -// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; -// std::vector inputs; -// std::vector outputs; -// context_ascendc::TensorInfo input; -// 
input.shape = x_shape; -// input.dType = ge::DT_FLOAT16; -// input.oriFormat = ge::FORMAT_ND; -// input.format = ge::FORMAT_ND; -// input.dataPath = "1111"; -// inputs.push_back(input); -// context_ascendc::TensorInfo output; -// output.shape = x_shape; -// output.dType = ge::DT_FLOAT16; -// output.oriFormat = ge::FORMAT_ND; -// output.format = ge::FORMAT_ND; -// output.dataPath = "222"; -// outputs.push_back(output); - -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .SetOpNameType("name", "tpye") -// .NodeIoNum(1, 1) -// .IrInstanceNum({1}) -// .AddInputs(inputs) -// .AddOutputs(outputs) -// .AddAttrs({ -// {"activation", ge::AnyValue::CreateFrom(active_type)}, -// {"inner_precise", ge::AnyValue::CreateFrom(1)} -// }) -// .TilingData(param.get()) -// .Workspace(ws_size) -// .Build(); - -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_NE(tiling_context, nullptr); -// } - -// TEST_F(TestTiling, TestTilingContextBuildFailed) -// { -// string active_type = "gelu"; -// gert::StorageShape x_shape = {{-1, 5120}, {-1, 5120}}; -// std::vector x_const_value (1024 * 5120, 2.f); -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .NodeIoNum(1, 1) -// .IrInstanceNum({1, 1}) -// .CompileInfo(nullptr) -// .PlatformInfo(nullptr) -// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) -// .Workspace(ws_size) -// .Build(); - -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_EQ(tiling_context, nullptr); -// } - -// TEST_F(TestTiling, TestTilingContextBuildWithBinFile) -// { -// string 
active_type = "gelu"; -// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; -// gert::StorageShape expert_tokens_shape = {{16}, {16}}; -// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; -// gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; -// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; -// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; -// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; - -// std::vector expert_tokens_const_value (16, 1); - -// std::vector x_const_value (1024 * 5120, 2.f); -// std::vector bias2_value (16 * 5120, 3.f); -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .SetOpNameType("name", "tpye") -// .NodeIoNum(6, 1) -// .IrInstanceNum({1, 1, 1, 1, 1, 1}) -// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) -// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) -// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) -// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, "./expert_tokens_data.bin") -// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) -// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) -// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) -// .AddAttrs({ -// {"activation", ge::AnyValue::CreateFrom(active_type)}, -// {"inner_precise", ge::AnyValue::CreateFrom(1)} -// }) -// .TilingData(param.get()) -// .Workspace(ws_size) -// .Build(); -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_EQ(tiling_context, nullptr); -// } - -TEST_F(TestTiling, TestAxpyTiling) -{ - uint32_t maxVal = 
0; - uint32_t minVal = 0; - GetAxpyMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetAxpyMaxMinTmpSize(ge::Shape({256}), 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4 * 2); - EXPECT_EQ(minVal, 256 * 4); - GetAxpyMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4); - EXPECT_EQ(minVal, 256 * 4); - uint32_t extraBuf = 123; - uint32_t maxLivedNodesCnt = 123; - GetAxpyTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 1); - GetAxpyTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 4); -} - -TEST_F(TestTiling, TestCeilTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto ceilShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCeilMaxMinTmpSize(ceilShape, sizeof(float), false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetCeilTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestCeilTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto ceilShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetCeilTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestCeilTilingHalf512) -{ - std::vector shapeDims = { 512 }; - auto ceilShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 2 * 2); - EXPECT_EQ(minValue, 
256 * 2); -} - -TEST_F(TestTiling, TestFloorTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto floorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFloorMaxMinTmpSize(floorShape, sizeof(float), false, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFloorTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFloorTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto floorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFloorTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFloorTilingHalf512) -{ - std::vector shapeDims = { 512 }; - auto floorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 2 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -// TEST_F(TestTiling, TestGetSocVersion) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// platform_ascendc::SocVersion ret = plat.GetSocVersion(); -// EXPECT_EQ(ret, platform_ascendc::SocVersion::RESERVED_VERSION); -// } - -// TEST_F(TestTiling, TestCoreNum) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// 
MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// uint32_t ret1 = plat.GetCoreNumAic(); -// uint32_t ret2 = plat.GetCoreNumAiv(); -// EXPECT_EQ(ret1, 0); -// EXPECT_EQ(ret2, 0); -// } - -// TEST_F(TestTiling, TestGetLibApiWorkSpaceSize) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// uint32_t ret1 = plat.GetLibApiWorkSpaceSize(); -// EXPECT_EQ(ret1, static_cast(-1)); -// } -// TEST_F(TestTiling, TestPlatformAscendCManager) -// { -// void *handle; -// int a = 7; -// handle = &a; - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// auto ret2 = platform_ascendc::PlatformAscendCManager::GetInstance(); -// } - -// TEST_F(TestTiling, TestGetVectorCoreNum) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); -// MOCKER_CPP(&platform_ascendc::PlatformAscendC::GetSocVersion, -// platform_ascendc::SocVersion(platform_ascendc::PlatformAscendC::*)(void) const) -// .stubs() -// .will(returnValue(platform_ascendc::SocVersion::ASCEND310P)); - -// uint32_t ret1 = plat.GetCoreNumVector(); -// EXPECT_EQ(ret1, static_cast(0)); -// MOCKER_CPP(&platform_ascendc::PlatformAscendCManager::PlatformAscendCInit) -// .stubs() -// .will(returnValue(platform_info)); -// auto ret2 = 
platform_ascendc::PlatformAscendCManager::GetInstance(); - -// } - TEST_F(TestTiling, TestReGluFloat16OrBf16) { const std::vector srcShapeDims = { 8, 128 }; @@ -4581,234 +2915,4 @@ TEST_F(TestTiling, TestReGluFloat32) GetReGluMaxMinTmpSize(srcShape, 4, false, maxValue, minValue); EXPECT_EQ(minValue, 256); EXPECT_EQ(maxValue, 256); -} - -#if __CCE_AICORE__ == 220 -extern void platfrom_stub_set_chip_version(const char *num); -TEST_F(TestTiling, TestBroadCast220) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend910B"); - uint32_t firstDim = 32; - uint32_t lastDim = 32; - std::vector srcShapeDims = {firstDim, 1}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {firstDim, lastDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t halfOneBlockElementNum = 16; - constexpr uint32_t minHalfAlignSize = halfOneBlockElementNum * halfOneBlockElementNum * halfSize; - constexpr uint32_t BRCB_ONE_SIZE = 8; - uint32_t firstDimAlignNum = (firstDim + BRCB_ONE_SIZE - 1) / BRCB_ONE_SIZE * BRCB_ONE_SIZE; - uint32_t maxHalfAlignSize = firstDimAlignNum * halfOneBlockElementNum * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize); - EXPECT_EQ(maxValue, maxHalfAlignSize); - - srcShapeDims = {firstDim, 1}; - srcShape = ge::Shape(srcShapeDims); - uint32_t lastDimNotAlign = 31; - dstShapeDims = {firstDim, lastDimNotAlign}; - dstShape = ge::Shape(dstShapeDims); - - uint32_t blockDimAlignBlockNum = (lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum; - uint32_t blockDimAlign = blockDimAlignBlockNum * halfOneBlockElementNum; - uint32_t minCopyTempBufferSize = halfOneBlockElementNum * blockDimAlign * halfSize; - auto minHalfNotAlignSize = minHalfAlignSize + 
minCopyTempBufferSize; - - uint32_t maxCopyTempBufferSize = firstDim * blockDimAlign * halfSize; - uint32_t maxHalfNotAlignValue = maxHalfAlignSize + maxCopyTempBufferSize; - - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfNotAlignSize); - EXPECT_EQ(maxValue, maxHalfNotAlignValue); - - constexpr uint32_t int8Size = 1; - srcShapeDims = {firstDim, 1}; - srcShape = ge::Shape(srcShapeDims); - dstShapeDims = {firstDim, lastDim}; - dstShape = ge::Shape(dstShapeDims); - const uint32_t alignSrcSize = - ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - uint32_t alignDstSize = - ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); - EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); - - srcShapeDims = {firstDim, 1}; - srcShape = ge::Shape(srcShapeDims); - dstShapeDims = {firstDim, lastDimNotAlign}; - dstShape = ge::Shape(dstShapeDims); - alignDstSize = - ((firstDim * lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfNotAlignSize + castTempBufferSize); - EXPECT_EQ(maxValue, maxHalfNotAlignValue + castTempBufferSize); -} -#endif - -#if __CCE_AICORE__ == 200 -extern void platfrom_stub_set_chip_version(const char *num); -TEST_F(TestTiling, TestLastBroadCast200) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend310P"); - uint32_t firstDim = 32; - uint32_t 
lastDim = 32; - std::vector srcShapeDims = {firstDim, 1}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {firstDim, lastDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t halfOneBlockElementNum = 16; - constexpr uint32_t MAX_BLOCK_NUM = 8; - constexpr uint32_t ONE_BLOCK_SIZE = 32; - uint32_t minTmpBufferSize = - halfOneBlockElementNum * ((lastDim + MAX_BLOCK_NUM - 1) / MAX_BLOCK_NUM) * halfSize; - uint32_t minHalfAlignSize = ONE_BLOCK_SIZE + + minTmpBufferSize; - uint32_t maxHalfAlignSize = ONE_BLOCK_SIZE + firstDim * lastDim * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize); - EXPECT_EQ(maxValue, maxHalfAlignSize); - - constexpr uint32_t int8Size = 1; - const uint32_t alignSrcSize = - ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - const uint32_t alignDstSize = - ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); - EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); -} - -TEST_F(TestTiling, TestFirstBroadCast200) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend310P"); - uint32_t firstDim = 32; - uint32_t lastDim = 32; - std::vector srcShapeDims = {1, lastDim}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {firstDim, lastDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t ONE_BLOCK_SIZE = 32; - 
GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, ONE_BLOCK_SIZE); - EXPECT_EQ(maxValue, ONE_BLOCK_SIZE); - - constexpr uint32_t int8Size = 1; - constexpr uint32_t HALF_ONE_BLK_SIZE = 16; - const uint32_t alignSrcSize = ((lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t alignDstSize = - ((firstDim * lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, ONE_BLOCK_SIZE + castTempBufferSize); - EXPECT_EQ(maxValue, ONE_BLOCK_SIZE + castTempBufferSize); -} - -TEST_F(TestTiling, TestOneElementBroadCast200) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend310P"); - uint32_t srcDim = 1; - uint32_t dstDim = 32; - std::vector srcShapeDims = {srcDim}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {dstDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); - - constexpr uint32_t int8Size = 1; - constexpr uint32_t HALF_ONE_BLK_SIZE = 16; - constexpr uint32_t ONE_BLOCK_SIZE = 32; - const uint32_t alignSrcSize = ((srcDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t alignDstSize = ((dstDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, castTempBufferSize + ONE_BLOCK_SIZE); - 
EXPECT_EQ(maxValue, castTempBufferSize + ONE_BLOCK_SIZE); -} -#endif - -TEST_F(TestTiling, TestReduceXorSumTilingInt16) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - GetReduceXorSumMaxMinTmpSize(shape, 2, true, maxSize, minSize); - EXPECT_EQ(maxSize, 65536); - EXPECT_EQ(minSize, 65536); - - GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 98304); - EXPECT_EQ(minSize, 98304); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 768); - EXPECT_EQ(minSize, 768); - - GetReduceXorSumMaxMinTmpSize(shape, 2, true,maxSize, minSize); - EXPECT_EQ(maxSize, 512); - EXPECT_EQ(minSize, 512); -} - -TEST_F(TestTiling, TestCumSum) -{ - uint32_t firstDim = 32; - uint32_t lastDim = 16; - std::vector srcShapeDims = {firstDim, lastDim}; - auto srcShape = ge::Shape(srcShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t transDataTo5HDAddrListSize = 16; - uint32_t minHalfSize = transDataTo5HDAddrListSize * lastDim * 3 * sizeof(uint16_t); - uint32_t alignOutter = (firstDim + transDataTo5HDAddrListSize - 1) / transDataTo5HDAddrListSize * transDataTo5HDAddrListSize; - uint32_t maxHalfSize = alignOutter * lastDim * 3 * sizeof(uint16_t); - - GetCumSumMaxMinTmpSize(srcShape, halfSize, true, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfSize); - EXPECT_EQ(maxValue, maxHalfSize); - - constexpr uint32_t floatSize = 4; - uint32_t minFloatSize = transDataTo5HDAddrListSize * lastDim * 2 * sizeof(float); - uint32_t maxFloatSize = alignOutter * lastDim * 2 * sizeof(float); - - GetCumSumMaxMinTmpSize(srcShape, floatSize, true, false, maxValue, minValue); - EXPECT_EQ(minValue, minFloatSize); - EXPECT_EQ(maxValue, maxFloatSize); - - maxHalfSize = minHalfSize = firstDim * lastDim * sizeof(float); - GetCumSumMaxMinTmpSize(srcShape, 
halfSize, false, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfSize); - EXPECT_EQ(maxValue, maxHalfSize); - - - GetCumSumMaxMinTmpSize(srcShape, floatSize, false, false, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); } \ No newline at end of file -- Gitee From aa7700dbbadea2170bc36b681041f294df62ca95 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 07:49:14 +0000 Subject: [PATCH 8/8] add llt case Signed-off-by: jiangchengcheng-on --- tests/tiling/test_tiling.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 219b8605..966bb8ad 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -77,6 +77,25 @@ TEST_F(TestTiling, PlatformConstructor) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestInt4BaseK) +{ + matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, + .l0CSize = 131072, .ubSize = 196608, .l0ASize = 65536, .l0BSize = 65536}; + MatmulApiTiling tiling(plat); + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); + tiling.SetBias(false); + tiling.SetShape(144, 256, 32); + tiling.SetOrgShape(144, 256, 32); + tiling.SetBufferSpace(256 * 1024, 128 * 1024, -1); + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(tilingData.get_baseK() % 64, 0); + EXPECT_EQ(ret, 0); +} + TEST_F(TestTiling, Tiling_310p_NotAligned) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND310P, .l1Size = 1048576, -- Gitee