From fde91fde4e8bbd8b141b07e19e42f1f690d316d9 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 03:46:22 +0000 Subject: [PATCH 1/8] matmul api iterateNorm refactory && fix int4 size Signed-off-by: jiangchengcheng-on --- impl/matmul/matmul_impl.h | 293 +++++++------ impl/matmul/matmul_macro_def.h | 89 ++++ impl/matmul/matmul_server.h | 6 +- impl/matmul/matmul_tiling_algorithm.cpp | 10 +- lib/matmul/matmul.h | 550 ++---------------------- lib/matmul/tiling.h | 94 ++++ 6 files changed, 372 insertions(+), 670 deletions(-) create mode 100644 impl/matmul/matmul_macro_def.h diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index e61f9a91..2642d7df 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -262,8 +262,10 @@ __aicore__ inline void MatmulImpl= 220 - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && IsSameType::value)) { + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value)) { var.quantScalar_ = quantScalar; if constexpr (IsSameType::value) { var.quantMode_ = 1; @@ -303,8 +305,10 @@ __aicore__ inline void MatmulImpl& quantTensor) { #if __CCE_AICORE__ >= 220 - if constexpr ((IsSameType::value && IsSameType::value) || - ((IsSameType::value || IsSameType::value) && IsSameType::value)) { + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value)) { var.quantTensor_ = quantTensor; if constexpr (IsSameType::value) { var.quantMode_ = 2; @@ -1022,6 +1026,11 @@ __aicore__ inline void MatmulImpl::value) { + aMatrixByteSize = aMatrixByteSize / 2; + bMatrixByteSize = bMatrixByteSize / 2; + } if constexpr (!PhyPosIsL1(A_TYPE::pos)) { if (var.tiling_->depthA1 > DB_FACTOR) { if (var.tiling_->depthA1 < var.kIter_ * var.tiling_->stepM) { @@ -1124,7 +1133,8 @@ __aicore__ inline 
void MatmulImplInitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } #endif - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); @@ -1248,9 +1258,10 @@ __aicore__ inline void MatmulImpl(var.tiling_->shareUbSize); #if __CCE_AICORE__ == 200 - shareUbSize = 0; + shareUbSize = 0; #endif uint32_t shareLens[3] = {static_cast(var.tiling_->shareL1Size), static_cast(var.tiling_->shareL0CSize), shareUbSize}; @@ -1285,6 +1296,11 @@ __aicore__ inline void MatmulImpl::value) { + aMatrixByteSize = aMatrixByteSize / 2; + bMatrixByteSize = bMatrixByteSize / 2; + } if constexpr (!PhyPosIsL1(A_TYPE::pos)) { uint32_t cacheA1Size = var.tiling_->stepM * var.tiling_->stepKa; @@ -1345,7 +1361,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { @@ -1449,6 +1466,11 @@ __aicore__ inline void MatmulImpl::value) { + aMatrixByteSize = aMatrixByteSize / 2; + bMatrixByteSize = bMatrixByteSize / 2; + } if constexpr (A_TYPE::ibShare) { ASCENDC_ASSERT((B_TYPE::ibShare == false), { @@ -1572,7 +1594,8 @@ __aicore__ inline void MatmulImplisBias) { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_->baseN * sizeof(BiasT)); } - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.tpipe_->InitBuffer(var.qidFixPipe_, 1, var.tiling_->baseN * sizeof(int64_t)); @@ -1714,7 +1737,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr 
(((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.qidFixPipe_.FreeAllEvent(); @@ -1749,7 +1773,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.qidFixPipe_.FreeAllEvent(); @@ -1796,7 +1821,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value))) { var.qidFixPipe_.FreeAllEvent(); @@ -1925,7 +1951,11 @@ __aicore__ inline void MatmulImpl @@ -1944,7 +1974,11 @@ __aicore__ inline void MatmulImpl= 220 @@ -2055,7 +2089,11 @@ __aicore__ inline void MatmulImpl @@ -2074,7 +2112,11 @@ __aicore__ inline void MatmulImpl @@ -2087,7 +2129,11 @@ __aicore__ inline void MatmulImpl @@ -2100,7 +2146,11 @@ __aicore__ inline void MatmulImpl @@ -3672,7 +3722,8 @@ __aicore__ inline void MatmulImpl l1TmpForQuant; - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value)) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ % 2 == 0) { @@ -3739,7 +3790,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -3777,7 +3829,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = { QuantMode_t::DEQF16, var.quantScalar_ }; 
Fixpipe(gm, co1Local, fixpipeParams); @@ -3841,7 +3894,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -3888,7 +3942,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -3962,7 +4017,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -4010,7 +4066,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ == 1) { @@ -4083,7 +4140,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -4122,7 +4180,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = { QuantMode_t::DEQF16, var.quantScalar_ }; Fixpipe(gm[dstOffset], co1Local, fixpipeParams); @@ -4160,7 +4219,8 @@ __aicore__ inline void MatmulImpl::value && (IsSameType::value || IsSameType::value || IsSameType::value)) || ((IsSameType::value || IsSameType::value) - && IsSameType::value)) { + && IsSameType::value) || + (IsSameType::value && IsSameType::value)) { if 
(var.quantMode_ == 2 || var.quantMode_ == 4 || var.quantMode_ == 6) { var.qidFixPipe_.FreeTensor(l1TmpForQuant); } @@ -4608,49 +4668,51 @@ __aicore__ inline void MatmulImpl -__aicore__ inline TBufHandle MatmulImpl::GetCacheA1Buf(bool isPong) +__aicore__ inline auto& MatmulImpl::GetCacheA1(bool isPong) { - return isPong ? var.cacheA1BufPong_ : var.cacheA1BufPing_; + return isPong ? var.cacheA1Pong_ : var.cacheA1Ping_; }; template -__aicore__ inline TBufHandle MatmulImpl::GetCacheB1Buf(bool isPong) +__aicore__ inline auto& MatmulImpl::GetCacheB1(bool isPong) { - return isPong ? var.cacheB1BufPong_ : var.cacheB1BufPing_; + return isPong ? var.cacheB1Pong_ : var.cacheB1Ping_; }; template __aicore__ inline bool MatmulImpl::GetCacheA1IsCaching(bool isPong) + const { return isPong ? var.cacheA1IsCachingPong_ : var.cacheA1IsCachingPing_; }; template __aicore__ inline bool MatmulImpl::GetCacheB1IsCaching(bool isPong) + const { return isPong ? var.cacheB1IsCachingPong_ : var.cacheB1IsCachingPing_; }; template -__aicore__ inline void MatmulImpl::SetCacheA1Buf( - bool isPong, TBufHandle buf) +__aicore__ inline void MatmulImpl::SetCacheA1( + bool isPong, const LocalTensor& a1) { if (isPong) { - var.cacheA1BufPong_ = buf; + var.cacheA1Pong_ = a1; } else { - var.cacheA1BufPing_ = buf; + var.cacheA1Ping_ = a1; } return; }; template -__aicore__ inline void MatmulImpl::SetCacheB1Buf( - bool isPong, TBufHandle buf) +__aicore__ inline void MatmulImpl::SetCacheB1( + bool isPong, const LocalTensor& b1) { if (isPong) { - var.cacheB1BufPong_ = buf; + var.cacheB1Pong_ = b1; } else { - var.cacheB1BufPing_ = buf; + var.cacheB1Ping_ = b1; } return; }; @@ -5953,7 +6015,7 @@ __aicore__ inline void MatmulImpl var.stepKaIdx_) { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; if (!var.isA1KFullLoad_ && GetCacheA1IsCaching(cachePosKa)) { - var.qidA1_.FreeBuffer(GetCacheA1Buf(cachePosKa)); + var.qidA1_.FreeTensor(GetCacheA1(cachePosKa)); SetCacheA1IsCaching(cachePosKa, false); } } @@ -5962,7 
+6024,7 @@ __aicore__ inline void MatmulImpl var.stepKbIdx_) { int cachePosKb = var.stepKbIdx_ & var.cacheB1Factor_; if (!var.isB1KFullLoad_ && GetCacheB1IsCaching(cachePosKb)) { - var.qidB1_.FreeBuffer(GetCacheB1Buf(cachePosKb)); + var.qidB1_.FreeTensor(GetCacheB1(cachePosKb)); SetCacheB1IsCaching(cachePosKb, false); } } @@ -6165,21 +6227,21 @@ __aicore__ inline void MatmulImpl var.stepKaIdx_) { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; if (!var.isA1KFullLoad_ && GetCacheA1IsCaching(cachePosKa)) { - var.qidA1_.FreeBuffer(GetCacheA1Buf(cachePosKa)); + var.qidA1_.FreeTensor(GetCacheA1(cachePosKa)); SetCacheA1IsCaching(cachePosKa, false); } } @@ -6315,7 +6377,7 @@ __aicore__ inline void MatmulImpl var.stepKbIdx_) { int cachePosKb = var.stepKbIdx_ & var.cacheB1Factor_; if (!var.isB1KFullLoad_ && GetCacheB1IsCaching(cachePosKb)) { - var.qidB1_.FreeBuffer(GetCacheB1Buf(cachePosKb)); + var.qidB1_.FreeTensor(GetCacheB1(cachePosKb)); SetCacheB1IsCaching(cachePosKb, false); } } @@ -6479,21 +6541,21 @@ __aicore__ inline void MatmulImpl var.tiling_->stepM ? var.tiling_->stepM : (var.mIter_ - var.curM_); - var.curStepN_ = (var.nIter_ - var.curN_) > var.tiling_->stepN ? var.tiling_->stepN : (var.nIter_ - var.curN_); - } else if (likely(var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_M))) { // Output along M axis - if (++var.curN_ >= var.stepNIdx_ + var.curStepN_) { - if constexpr (!PhyPosIsL1(A_TYPE::pos)) { - if (var.cacheProcA_ > 0) { - var.qidA1Cache_.FreeTensor(var.cacheHeadA1_); - var.cacheProcA_ = 0; - } - } - var.curN_ = var.stepNIdx_; - if (++var.curM_ >= var.mIter_) { - if constexpr (!PhyPosIsL1(B_TYPE::pos)) { - if (var.cacheProcB_ > 0) { - var.qidB1Cache_.FreeTensor(var.cacheHeadB1_); - var.cacheProcB_ = 0; - } - } - var.curM_ = 0; - var.stepNIdx_ += var.curStepN_; - if (var.stepNIdx_ >= var.nIter_) { - return false; - } - var.curN_ = var.stepNIdx_; - var.curStepN_ = - (var.nIter_ - var.curN_) > var.tiling_->stepN ? 
var.tiling_->stepN : (var.nIter_ - var.curN_); - } - } - } else { - ASCENDC_ASSERT((var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_N)), { - KERNEL_LOG(KERNEL_ERROR, "iterateOrder is %d , which should be ORDER_N", var.tiling_->iterateOrder); - }); - if (++var.curM_ >= var.stepMIdx_ + var.curStepM_) { - if constexpr (!PhyPosIsL1(B_TYPE::pos)) { - if (var.cacheProcB_ > 0) { - var.qidB1Cache_.FreeTensor(var.cacheHeadB1_); - var.cacheProcB_ = 0; - } - } - var.curM_ = var.stepMIdx_; - if (++var.curN_ >= var.nIter_) { - if constexpr (!PhyPosIsL1(A_TYPE::pos)) { - if (var.cacheProcA_ > 0) { - var.qidA1Cache_.FreeTensor(var.cacheHeadA1_); - var.cacheProcA_ = 0; - } - } - var.curN_ = 0; - var.stepMIdx_ += var.curStepM_; - if (var.stepMIdx_ >= var.mIter_) { - return false; - } - var.curM_ = var.stepMIdx_; - var.curStepM_ = - (var.mIter_ - var.curM_) > var.tiling_->stepM ? var.tiling_->stepM : (var.mIter_ - var.curM_); - } - } + + if (!MATMUL_MODULE(MatmulIterateController).MoveNext()) { + return false; } // Initializing variables var.baseUseM_ = (var.curM_ + 1 == var.mIter_) ? 
var.tailM_ : var.tiling_->baseM; @@ -7015,14 +7017,14 @@ __aicore__ inline bool MatmulImpl::LoadToAL1MDL(int r if (insertDeQue) { var.qidA1_.DeQue(); } - SetCacheA1Buf(cachePosA, a1.GetBufferHandle()); + SetCacheA1(cachePosA, a1); SetCacheA1IsCaching(cachePosA, true); } else { DEBUG_CODE(++a1LoadCacheCount_); - a1.SetAddr(var.qidA1_.GetBufferAddr(GetCacheA1Buf(cachePosA))); + a1 = GetCacheA1(cachePosA); } } else { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; @@ -7652,11 +7654,11 @@ MatmulImpl::LoadToAL1MDL(int r OnCopyInA1(a1, row, col, var.baseUseStepM_, var.baseUseStepKa_); var.qidA1_.EnQue(a1); var.qidA1_.DeQue(); - SetCacheA1Buf(cachePosKa, a1.GetBufferHandle()); + SetCacheA1(cachePosKa, a1); SetCacheA1IsCaching(cachePosKa, true); } else { DEBUG_CODE(++a1LoadCacheCount_); - a1.SetAddr(var.qidA1_.GetBufferAddr(GetCacheA1Buf(cachePosKa))); + a1 = GetCacheA1(cachePosKa); } } return a1; @@ -7871,14 +7873,14 @@ MatmulImpl::LoadToBL1MDL(int r if (insertDeQue) { var.qidB1_.DeQue(); } - SetCacheB1Buf(cachePosB, b1.GetBufferHandle()); + SetCacheB1(cachePosB, b1); SetCacheB1IsCaching(cachePosB, true); } else { #if __CCE_AICORE__ == 200 SetTransposeB(true); #endif DEBUG_CODE(++b1LoadCacheCount_); - b1.SetAddr(var.qidB1_.GetBufferAddr(GetCacheB1Buf(cachePosB))); + b1 = GetCacheB1(cachePosB); } } else { int cachePosKa = var.stepKaIdx_ & var.cacheA1Factor_; @@ -7891,14 +7893,14 @@ MatmulImpl::LoadToBL1MDL(int r if (insertDeQue) { var.qidB1_.DeQue(); } - SetCacheB1Buf(cachePosKb, b1.GetBufferHandle()); + SetCacheB1(cachePosKb, b1); SetCacheB1IsCaching(cachePosKb, true); } else { #if __CCE_AICORE__ == 200 SetTransposeB(true); #endif DEBUG_CODE(++b1LoadCacheCount_); - b1.SetAddr(var.qidB1_.GetBufferAddr(GetCacheB1Buf(cachePosKb))); + b1 = GetCacheB1(cachePosKb); } } return b1; @@ -8158,11 +8160,11 @@ __aicore__ inline void MatmulImpl l1TmpForQuant; - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + 
IsSameType::value) || (IsSameType::value && (IsSameType::value || IsSameType::value)) || ((IsSameType::value || IsSameType::value) && IsSameType::value)) { if (var.quantMode_ % 2 == 0) { @@ -10005,7 +10008,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -10045,7 +10049,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = {QuantMode_t::DEQF16, var.quantScalar_}; Fixpipe(gm[dstOffset], co1Local, fixpipeParams); @@ -10090,7 +10095,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantPre = QuantMode_t::DEQF16; fixpipeParams.deqScalar = var.quantScalar_; @@ -10131,7 +10137,8 @@ __aicore__ inline void MatmulImpl::value && IsSameType::value) { + if constexpr ((IsSameType::value || IsSameType::value) && + IsSameType::value) { if (var.quantMode_ == 1) { fixpipeParams.quantParams = {QuantMode_t::DEQF16, var.quantScalar_}; Fixpipe(gm[dstOffset], co1Local, fixpipeParams); diff --git a/impl/matmul/matmul_macro_def.h b/impl/matmul/matmul_macro_def.h new file mode 100644 index 00000000..bf86e3c4 --- /dev/null +++ b/impl/matmul/matmul_macro_def.h @@ -0,0 +1,89 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. 
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_macro_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MATMUL_MACRO_DEF_H +#define IMPL_MATMUL_MATMUL_MACRO_DEF_H +#include "matmul_utils.h" +#include "matmul_macro_v220_impl.h" +#include "matmul_macro_v220_basic_impl.h" +#include "matmul_macro_v200_impl.h" +#include "modules/matmul_param.h" + +namespace matmul { + +/* ************************************************************************************************** + * MatmulMacroImpl * + * ************************************************************************************************* */ +template +struct MatmulMacroImpl { + __aicore__ inline MatmulMacroImpl() {}; +}; + +#if __CCE_AICORE__ >= 220 +// CFG_NORM +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); + using PARAMS = MacroMatmul; +}; +// CFG_MDL +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); + using PARAMS = MacroMatmul; +}; +// CFG_IBSHARE_NORM +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : + ((A_TYPE::format == CubeFormat::SCALAR) ? 
2 : 0); + using PARAMS = MacroMatmul; +}; +#elif __CCE_AICORE__ == 200 +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulV200; +}; +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulV200; +}; +#endif + +// MM_CFG_BB +template +struct MatmulMacroImpl { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulMacroImpl() {}; + using PARAMS = MacroMatmulBasic; +}; + +} +#endif // _MATMUL_MACRO_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/matmul_server.h b/impl/matmul/matmul_server.h index caf8a3de..327cc390 100644 --- a/impl/matmul/matmul_server.h +++ b/impl/matmul/matmul_server.h @@ -601,7 +601,8 @@ public: } else { ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); } - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value) || (IsSameType::value && (IsSameType::value || @@ -706,7 +707,8 @@ public: if constexpr (A_TYPE::layout != LayoutMode::NONE) { return true; } - if constexpr ((IsSameType::value && IsSameType::value) || + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || ((IsSameType::value || IsSameType::value) && IsSameType::value) || (IsSameType::value && (IsSameType::value || diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index 61ad6fc3..c9871710 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -1450,6 +1450,7 @@ void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const biasLength = max(quantLength, biasLength); } } + transLength = max(max(a1Length, b1Length), max(c1Length, biasLength)); } @@ -2278,7 +2279,12 @@ int64_t 
MatmulTilingAlgorithm::Process() tilingIns_->tiling_.set_baseM(singleCoreStatus.l0Status.mL0 * C0_SIZE); tilingIns_->tiling_.set_baseN(singleCoreStatus.l0Status.nL0 * C0_SIZE); const int32_t reduceSize = C0_BYTE_SIZE / DTYPE_BYTE_TAB.at(tilingIns_->aType_.dataType); - tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); + // int4 baseK should be 64 align + if ((tilingIns_->aType_.dataType == DataType::DT_INT4) && (singleCoreStatus.l0Status.kL0 % NUM_TWO != 0)) { + tilingIns_->tiling_.set_baseK((singleCoreStatus.l0Status.kL0 + 1) * reduceSize); + } else { + tilingIns_->tiling_.set_baseK(singleCoreStatus.l0Status.kL0 * reduceSize); + } tilingIns_->baseM = tilingIns_->tiling_.get_baseM(); tilingIns_->baseN = tilingIns_->tiling_.get_baseN(); tilingIns_->baseK = tilingIns_->tiling_.get_baseK(); @@ -2343,4 +2349,4 @@ int64_t MatmulTilingAlgorithm::Process() const bool ans = CheckFinaleParams(coreStatus); return ans ? 0 : -1; } -} // namespace matmul_tiling +} // namespace matmul_tiling \ No newline at end of file diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index 9377f7b7..20703311 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -17,26 +17,17 @@ #include #include "lib/matmul/tiling.h" -#include "../../impl/matmul/matmul_macro_v220_impl.h" -#include "../../impl//matmul/matmul_macro_v220_basic_impl.h" -#include "../../impl//matmul/matmul_macro_v200_impl.h" #include "../../impl/matmul/matmul_utils.h" #include "../../impl/matmul/matmul_call_back.h" +#include "../../impl/matmul/modules/matmul_module.h" +#include "../../impl/matmul/modules/matmul_param.h" +#include "../../impl/matmul/modules/iterator/matmul_iterate_controller.h" +#include "../../impl/matmul/modules/feature_trait/matmul_feature_trait.h" +#include "../../impl/matmul/matmul_macro_def.h" namespace matmul { using namespace AscendC; -template -struct MatmulType { - constexpr static TPosition pos = POSITION; - constexpr static CubeFormat format = FORMAT; - using T = TYPE; 
- constexpr static bool isTrans = ISTRANS; - constexpr static LayoutMode layout = LAYOUT; - constexpr static bool ibShare = IBSHARE; -}; - template struct MatmulApiConfig { using AType = A_TYPE; @@ -46,511 +37,18 @@ struct MatmulApiConfig { constexpr static MatmulConfig Config = MM_CFG; }; -/* ************************************************************************************************** - * MatmulParamsBase * - * ************************************************************************************************* */ -template -struct MatmulParamsBase { - __aicore__ inline MatmulParamsBase() {}; -}; - -template -struct MatmulParamsNorm : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulParamsNorm() {}; - using SrcT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - TQue qidBias_; - typename L0cType::BUFFER CO1_; -#if __CCE_AICORE__ < 220 - TQue qidA2_; - TQue qidB2_; - TQue qidVecIn_; - TQue qidCO2_; - - typename QidType::QUE qidA1_; - typename QidType::QUE qidB1_; - typename QidType::QUE qidA1Cache_; - typename QidType::QUE qidB1Cache_; -#else - TQue qidA1_; - TQue qidB1_; - TQue qidA1Cache_; - TQue qidB1Cache_; -#endif - - LocalTensor cMatrix_; - - LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ - LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ - LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ - - SrcT aScalar_; - SrcT bScalar_; - DEBUG_CODE(int calCount_ = 0); - - TBuffAddr leftMatrix_; - TBuffAddr rightMatrix_; - TBuffAddr inputBias_; - - __gm__ SrcT* aGlobal_; - __gm__ SrcBT* bGlobal_; - __gm__ BiasT* biasGlobal_; - - TPipe* tpipe_; - const TCubeTiling* __restrict tiling_; - __gm__ uint8_t* cacheWorkspaceAddr; - -#if __CCE_AICORE__ < 220 - __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; - LocalTensor localWorkspace; - int nd2nz0ffset = 0; - int transOffset = 0; - int 
co2Offset = 0; -#endif - - int singleCoreM_; - int singleCoreN_; - int singleCoreK_; - // iterate nums in mnk axis - int mIter_; - int nIter_; - int kIter_; - - // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases - // measured in element - int baseUseM_; - int baseUseK_; - int baseUseN_; - // measured in cube block - int blockUseM_; - int blockUseK_; - int blockUseN_; - - int32_t cacheProcA_, cacheProcB_; - bool isFirstIter_; - bool isTransposeA_; // whether A matrix need to transpose - bool isTransposeB_; // whether B matrix need to transpose - // whether enbale bias, default value is false - bool enableBias_; - - int tailM_, tailK_, tailN_; - // current c matrix coordinate - int curM_, curN_; - // current c matrix step size, there could be tail steps - int curStepM_, curStepN_; - // current c matrix step block coordinate - int stepMIdx_, stepNIdx_; - - bool enHF32Mode_; - int32_t hf32TransMode_; - uint8_t subBlockIdx_; - - int baseMK_; - int baseKN_; - int baseMN_; - - int cacheA1Size_, cacheB1Size_; - int depthA1_, depthB1_; -#if __CCE_AICORE__ >= 220 - int sMadMStep_ = 0; - int sMadNStep_ = 0; -#endif - uint64_t dataPtr_; - uint64_t tilingPtr_; -}; - -template -struct MatmulParamsNormQuant : public MatmulParamsNorm { - __aicore__ inline MatmulParamsNormQuant() {}; - TQue qidFixPipe_; - uint64_t quantScalar_ = 0; - GlobalTensor quantTensor_; - // 0: no quant, 1: deqf16, 2: vdeqf16, 3: QF322B8_PRE, 4: VQF322B8_PRE, 5: REQ8(s32->u8/s8), 6: VREQ8(s32->u8/s8) - uint8_t quantMode_ = 0; -}; - -template -struct MatmulParamsMDL : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulParamsMDL() {}; - using SrcT = typename A_TYPE::T; - using SrcBT = typename B_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - - TQue qidBias_; - TQue qidFixPipe_; - typename L0cType::BUFFER CO1_; - TQue qidA1_; - TQue qidB1_; -#if __CCE_AICORE__ < 220 - 
TQue qidA2_; - TQue qidB2_; - TQue qidVecIn_; - TQue qidCO2_; - - typename QidType::QUE qidA12UBCache_; - typename QidType::QUE qidB12UBCache_; -#endif - - LocalTensor cMatrix_; - - TBufHandle cacheA1BufPing_; - TBufHandle cacheA1BufPong_; - TBufHandle cacheB1BufPing_; - TBufHandle cacheB1BufPong_; - bool cacheA1IsCachingPing_; - bool cacheA1IsCachingPong_; - bool cacheB1IsCachingPing_; - bool cacheB1IsCachingPong_; - - DEBUG_CODE(int calCount_ = 0); - - TBuffAddr leftMatrix_; - TBuffAddr rightMatrix_; - TBuffAddr inputBias_; - - __gm__ SrcT* aGlobal_; - __gm__ SrcBT* bGlobal_; - __gm__ BiasT* biasGlobal_; - - TPipe* tpipe_; - const TCubeTiling* __restrict tiling_; - __gm__ uint8_t* cacheWorkspaceAddr; - -#if __CCE_AICORE__ < 220 - __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; - LocalTensor localWorkspace; - LocalTensor cacheHeadA12UB_; // Allocate and release using qidA12UBCache_ - LocalTensor cacheHeadB12UB_; // Allocate and release using qidB12UBCache_ - int nd2nz0ffset = 0; - int transOffset = 0; - int co2Offset = 0; - int32_t cacheA12UBProcA_ = 0; - int32_t cacheB12UBProcB_ = 0; -#endif - - int singleCoreM_; - int singleCoreN_; - int singleCoreK_; - // iterate nums in mnk axis - int mIter_; - int nIter_; - int kIter_; - // iterate nums in mn step axis - int mStepIter_; - int nStepIter_; - int kaStepIter_; - int kbStepIter_; - int kStepIter_; - int minStepK_; - int kaStepFactor_; - int kbStepFactor_; - - // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases - // in unit of element - int baseUseM_; - int baseUseK_; - int baseUseN_; - // in unit of cube block - int blockUseM_; - int blockUseK_; - int blockUseN_; - - // in unit of element - int baseUseStepM_; - int baseUseStepN_; - int baseUseStepKa_; - int baseUseStepKb_; - // in unit of cube block - int blockUseStepM_; - int blockUseStepN_; - int blockUseStepKa_; - int blockUseStepKb_; - - bool isFirstIter_; - bool isTransposeA_; // whether A matrix 
need to transpose - bool isTransposeB_; // whether B matrix need to transpose - // whether enbale bias, default value is false - bool enableBias_; - - // in unit of element - int tailM_, tailK_, tailN_; - // in unit of element - int tailStepM_, tailStepN_, tailStepKa_, tailStepKb_; - // current c matrix coordinate, in unit of baseMN - int curM_, curN_; - // current c matrix step size, in unit of baseMNK , there could be tail steps - int curStepM_, curStepN_; - // current c matrix step block coordinate, in unit of stepMNK - int stepMIdx_, stepNIdx_, stepKaIdx_, stepKbIdx_; - - // stepKa == kIter - bool isA1KFullLoad_, isB1KFullLoad_; - - bool enHF32Mode_; - int32_t hf32TransMode_; - uint8_t subBlockIdx_; - - int baseMK_; - int baseKN_; - int baseMN_; - int cacheA1Factor_, cacheB1Factor_; - uint64_t quantScalar_ = 0; - uint64_t dataPtr_; - uint64_t tilingPtr_; - GlobalTensor quantTensor_; - // 0: no quant, 1: deqf16, 2: vdeqf16; - uint8_t quantMode_ = 0; - // anti quant param. - SrcT antiQuantOffsetScalar_; - SrcT antiQuantScaleScalar_; - LocalTensor antiQuantOffsetTensor_; - LocalTensor antiQuantScaleTensor_; -}; - -template -struct MatmulParamsBasicBlock : public MatmulParamsNorm { - __aicore__ inline MatmulParamsBasicBlock() {}; -}; - -template -struct MatmulParamsIBShareNorm : public MatmulParamsBase { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulParamsIBShareNorm() {}; - using SrcT = typename A_TYPE::T; - using DstT = typename C_TYPE::T; - using BiasT = typename BIAS_TYPE::T; - TQue qidBias_; - typename L0cType::BUFFER CO1_; - - TQue qidA2_; - TQue qidB2_; - TQue qidVecIn_; - TQue qidCO2_; - - typename QidType::QUE qidA1_; - typename QidType::QUE qidA1Cache_; - typename QidType::QUE qidB1_; - typename QidType::QUE qidB1Cache_; - - LocalTensor cMatrix_; - - LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ - LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ - LocalTensor cacheHeadBias_; // Allocate and 
release using qidBias_ - - SrcT aScalar_; - SrcT bScalar_; - DEBUG_CODE(int calCount_ = 0); - - TBuffAddr leftMatrix_; - TBuffAddr rightMatrix_; - TBuffAddr inputBias_; - - __gm__ SrcT* aGlobal_; - __gm__ SrcT* bGlobal_; - __gm__ BiasT* biasGlobal_; - - TPipe* tpipe_; - const TCubeTiling* __restrict tiling_; - __gm__ uint8_t* cacheWorkspaceAddr; - - int singleCoreM_; - int singleCoreN_; - int singleCoreK_; - // iterate nums in mnk axis - int mIter_; - int nIter_; - int kIter_; - - // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases - // measured in element - int baseUseM_; - int baseUseK_; - int baseUseN_; - // measured in cube block - int blockUseM_; - int blockUseK_; - int blockUseN_; - - int32_t cacheProcA_, cacheProcB_; - bool isFirstIter_; - bool isTransposeA_; // whether A matrix need to transpose - bool isTransposeB_; // whether B matrix need to transpose - // whether enbale bias, default value is false - bool enableBias_; - - int tailM_, tailK_, tailN_; - // current c matrix coordinate - int curM_, curN_; - // current c matrix step size, there could be tail steps - int curStepM_, curStepN_; - // current c matrix step block coordinate - int stepMIdx_, stepNIdx_; - - bool enHF32Mode_; - int32_t hf32TransMode_; - uint8_t subBlockIdx_; - - int baseMK_; - int baseKN_; - int baseMN_; - - int cacheA1Size_, cacheB1Size_; - int depthA1_, depthB1_; - uint64_t dataPtr_; - uint64_t tilingPtr_; - - int curCacheIdx_; - GlobalCache gL1GroupCache0_; - GlobalCache gL1GroupCache1_; -}; - -/* ************************************************************************************************** - * MatmulParams * - * ************************************************************************************************* */ -template -struct MatmulParams { - __aicore__ inline MatmulParams(){}; -}; - -// CFG_NORM -#if __CCE_AICORE__ >= 220 -template -struct MatmulParams::value && IsSameType::value) || - (IsSameType::value && - 
(IsSameType::value || - IsSameType::value)))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNorm; -}; -#else -template -struct MatmulParams::value && IsSameType::value) || - (IsSameType::value && IsSameType::value))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNorm; -}; -#endif - -#if __CCE_AICORE__ >= 220 -template -struct MatmulParams::value && - IsSameType::value) || - (IsSameType::value && - (IsSameType::value || - IsSameType::value)))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNormQuant; -}; -#else -template -struct MatmulParams::value && IsSameType::value) || - (IsSameType::value && IsSameType::value))>::type> { - __aicore__ inline MatmulParams(){}; - using PARAMS = MatmulParamsNormQuant; -}; -#endif - -// CFG_MDL -template -struct MatmulParams { - __aicore__ inline MatmulParams() {}; - using PARAMS = MatmulParamsMDL; -}; - -// MM_CFG_BB -template -struct MatmulParams { - __aicore__ inline MatmulParams() {}; - using PARAMS = MatmulParamsBasicBlock; -}; - -// CFG_IBSHARE_NORM -template -struct MatmulParams { - __aicore__ inline MatmulParams() {}; - using PARAMS = MatmulParamsIBShareNorm; -}; - -/* ************************************************************************************************** - * MatmulMacroImpl * - * ************************************************************************************************* */ -template -struct MatmulMacroImpl { - __aicore__ inline MatmulMacroImpl() {}; -}; - -#if __CCE_AICORE__ >= 220 -// CFG_NORM -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 
2 : 0); - using PARAMS = MacroMatmul; -}; -// CFG_MDL -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); - using PARAMS = MacroMatmul; -}; -// CFG_IBSHARE_NORM -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - static constexpr uint16_t GEMV_MODE = (A_TYPE::format == CubeFormat::VECTOR) ? 1 : - ((A_TYPE::format == CubeFormat::SCALAR) ? 2 : 0); - using PARAMS = MacroMatmul; -}; -#elif __CCE_AICORE__ == 200 -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulV200; -}; -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulV200; -}; -#endif - -// MM_CFG_BB -template -struct MatmulMacroImpl { - using L0cT = typename GetDstType::Type; - __aicore__ inline MatmulMacroImpl() {}; - using PARAMS = MacroMatmulBasic; -}; - template > -class MatmulImpl { +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulIterateController, A_TYPE, B_TYPE, MatmulFeatureTrait::iterCtrlCfg) +, MATMUL_IMPORT_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE) +{ +public: + using AType = A_TYPE; + using BType = B_TYPE; + using CType = C_TYPE; + using BiasType = BIAS_TYPE; +private: using L0cT = typename GetDstType::Type; using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; @@ -697,6 +195,12 @@ public: uint32_t b1LoadCacheCount_ = 0; #endif +public: + using ENTITY = MatmulImpl; + + MATMUL_USE_IMPORTED_MODULE(MatmulIterateController, A_TYPE, B_TYPE, MatmulFeatureTrait::iterCtrlCfg); + MATMUL_USE_IMPORTED_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE); + private: template friend __aicore__ inline void SetTPipe( @@ -833,12 +337,12 @@ private: const 
int row, const int col, const int height, const int width, const int gCol); // do ping when isPong = flase, do pong when isPong = true - __aicore__ inline TBufHandle GetCacheA1Buf(bool isPong); - __aicore__ inline TBufHandle GetCacheB1Buf(bool isPong); - __aicore__ inline bool GetCacheA1IsCaching(bool isPong); - __aicore__ inline bool GetCacheB1IsCaching(bool isPong); - __aicore__ inline void SetCacheA1Buf(bool isPong, TBufHandle buf); - __aicore__ inline void SetCacheB1Buf(bool isPong, TBufHandle buf); + __aicore__ inline auto& GetCacheA1(bool isPong); + __aicore__ inline auto& GetCacheB1(bool isPong); + __aicore__ inline bool GetCacheA1IsCaching(bool isPong) const; + __aicore__ inline bool GetCacheB1IsCaching(bool isPong) const; + __aicore__ inline void SetCacheA1(bool isPong, const LocalTensor& a1); + __aicore__ inline void SetCacheB1(bool isPong, const LocalTensor& b1); __aicore__ inline void SetCacheA1IsCaching(bool isPong, bool isCaching); __aicore__ inline void SetCacheB1IsCaching(bool isPong, bool isCaching); diff --git a/lib/matmul/tiling.h b/lib/matmul/tiling.h index 609f1753..7e42d91f 100644 --- a/lib/matmul/tiling.h +++ b/lib/matmul/tiling.h @@ -128,6 +128,48 @@ struct MatmulConfig { bool enableDoubleCache; }; +enum class MatmulConfigMode { + CONFIG_NORM, + CONFIG_MDL, + CONFIG_SPECIALMDL, + CONFIG_IBSHARE +}; + +struct MatmulShapeParams { + uint32_t singleCoreM; + uint32_t singleCoreN; + uint32_t singleCoreK; + uint32_t basicM; + uint32_t basicN; + uint32_t basicK; +}; + +struct MatmulQuantParams { + bool isPerTensor; + bool hasAntiQuantOffset; +}; + +struct MatmulBatchParams { + bool isNBatch; + BatchMode batchMode; +}; + +struct MatmulFuncParams { + bool intrinsicsCheck; + bool enVecND2NZ; + uint32_t doMTE2Preload; + bool enableQuantVector = true; + bool enableSetDefineData = true; + uint8_t iterateMode = IterateMode::ITERATE_MODE_DEFAULT; + bool enableReuse = true; + bool enableUBReuse; + bool enableL1CacheUB; + bool intraBlockPartSum = false; + 
IterateOrder iterateOrder; + ScheduleType scheduleType; + bool enableDoubleCache; +}; + __aicore__ constexpr MatmulConfig GetNormalConfig(const bool intrinsicsLimit = false, const bool batchLoop = false, const bool isVecND2NZ = false, const BatchMode bmmMode = BatchMode::BATCH_LESS_THAN_L1, const bool isMsgReuse = true, const IterateOrder iterateOrder = IterateOrder::UNDEF, @@ -424,6 +466,58 @@ constexpr MatmulConfig CFG_MDL = GetMDLConfig(); constexpr MatmulConfig MM_CFG_BB = GetBasicConfig(128, 128, 128); constexpr MatmulConfig CFG_IBSHARE_NORM = GetIBShareNormConfig(); +template +__aicore__ inline constexpr MatmulConfig GetMMConfig(ArgTypes&&... args) { + MatmulConfig mmConfig = CFG_NORM; + if constexpr (configMode == MatmulConfigMode::CONFIG_MDL) { + mmConfig = CFG_MDL; + } else if constexpr (configMode == MatmulConfigMode::CONFIG_SPECIALMDL) { + mmConfig = GetSpecialMDLConfig(); + } else if constexpr (configMode == MatmulConfigMode::CONFIG_IBSHARE) { + mmConfig = CFG_IBSHARE_NORM; + } + GetMMConfigImpl(mmConfig, args...); + return mmConfig; +} + +template +__aicore__ inline constexpr void GetMMConfigImpl(MatmulConfig& cfg, T arg, ArgTypes&&... 
args) { + GetMMConfigImpl(cfg, arg); + GetMMConfigImpl(cfg, args...); +} + +template +__aicore__ inline constexpr void GetMMConfigImpl(MatmulConfig& cfg, ArgType arg) { + if constexpr (AscendC::IsSameType::value) { + cfg.singleCoreM = arg.singleCoreM; + cfg.singleCoreN = arg.singleCoreN; + cfg.singleCoreK = arg.singleCoreK; + cfg.basicM = arg.basicM; + cfg.basicN = arg.basicN; + cfg.basicK = arg.basicK; + } else if constexpr (AscendC::IsSameType::value) { + cfg.isPerTensor = arg.isPerTensor; + cfg.hasAntiQuantOffset = arg.hasAntiQuantOffset; + } else if constexpr (AscendC::IsSameType::value) { + cfg.isNBatch = arg.isNBatch; + cfg.batchMode = arg.batchMode; + } else if constexpr (AscendC::IsSameType::value) { + cfg.intrinsicsCheck = arg.intrinsicsCheck; + cfg.enVecND2NZ = arg.enVecND2NZ; + cfg.doMTE2Preload = arg.doMTE2Preload; + cfg.enableQuantVector = arg.enableQuantVector; + cfg.enableSetDefineData = arg.enableSetDefineData; + cfg.iterateMode = arg.iterateMode; + cfg.enableReuse = arg.enableReuse; + cfg.enableUBReuse = arg.enableUBReuse; + cfg.enableL1CacheUB = arg.enableL1CacheUB; + cfg.intraBlockPartSum = arg.intraBlockPartSum; + cfg.iterateOrder = arg.iterateOrder; + cfg.scheduleType = arg.scheduleType; + cfg.enableDoubleCache = arg.enableDoubleCache; + } +} + struct MatrixOffset { int32_t offset; int32_t row, col; -- Gitee From eb2fbd0b807b706500c81fe25ee61aaf2f176047 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 04:16:58 +0000 Subject: [PATCH 2/8] add llt case adapt code hub Signed-off-by: jiangchengcheng-on --- tests/CMakeLists.txt | 2 + tests/main_global.cpp | 2 +- tests/matmul/test_matmul_config.cpp | 61 + tests/matmul/test_matmul_input_l1_cache.cpp | 73 + .../matmul/test_matmul_iterate_controller.cpp | 220 ++ tests/matmul/test_matmul_l0c_buffer.cpp | 106 + tests/tiling/test_tiling.cpp | 2086 ++++++++++++++++- 7 files changed, 2465 insertions(+), 85 deletions(-) create mode 100644 tests/matmul/test_matmul_config.cpp create mode 
100644 tests/matmul/test_matmul_input_l1_cache.cpp create mode 100644 tests/matmul/test_matmul_iterate_controller.cpp create mode 100644 tests/matmul/test_matmul_l0c_buffer.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6387a70c..57d343e0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -49,6 +49,7 @@ file(GLOB ASCENDC_TEST_ascend310p_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_quant.cpp ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_quant_per_channel.cpp # ${ASCENDC_TESTS_DIR}/quantization/dequant/test_operator_dequant_v200.cpp + ${ASCENDC_TESTS_DIR}/matmul/test_matmul_config.cpp ) # ascend910B1 aiv test cases @@ -81,6 +82,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES # ascend910B1 aic test cases file(GLOB ASCENDC_TEST_ascend910B1_AIC_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/matmul/test_operator_matmul_v220.cpp + ${ASCENDC_TESTS_DIR}/matmul/test_matmul_config.cpp ) # ascend310B1 test cases diff --git a/tests/main_global.cpp b/tests/main_global.cpp index 167def52..16b1b816 100644 --- a/tests/main_global.cpp +++ b/tests/main_global.cpp @@ -33,6 +33,6 @@ __aicore__ AscendC::TPipe* GetTPipePtr() #else return g_tPipePtr; #endif - ASSERT(false && "Only supported ascend910B1, ascend910, ascend310P"); + ASSERT(false && "Only supported ascend910B1, ascend910, ascend310p"); return nullptr; } \ No newline at end of file diff --git a/tests/matmul/test_matmul_config.cpp b/tests/matmul/test_matmul_config.cpp new file mode 100644 index 00000000..33dce3b3 --- /dev/null +++ b/tests/matmul/test_matmul_config.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
+ * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/input_cache/matmul_input_l1_cache.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +class TestMatmulConfig : public testing::Test { +protected: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + virtual void SetUp() {} + void TearDown() {} +}; + +TEST_F(TestMatmulConfig, TestParamsConfig) +{ + constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; + constexpr static MatmulShapeParams shapeParams{128, 128, 128, 64, 64, 64}; + constexpr static MatmulQuantParams quantParams{1, 1}; + constexpr static MatmulBatchParams batchParams{1, BatchMode::BATCH_LARGE_THAN_L1}; + constexpr static MatmulFuncParams funcParams{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + IterateOrder::ORDER_N, ScheduleType::OUTER_PRODUCT, 1}; + constexpr MatmulConfig mmConfig = GetMMConfig(shapeParams, quantParams, batchParams, funcParams); + + EXPECT_EQ((uint32_t)mmConfig.doNorm, 1); + EXPECT_EQ((uint32_t)mmConfig.singleCoreM, 128); + EXPECT_EQ((uint32_t)mmConfig.singleCoreN, 128); + EXPECT_EQ((uint32_t)mmConfig.singleCoreK, 128); + EXPECT_EQ((uint32_t)mmConfig.basicM, 64); + EXPECT_EQ((uint32_t)mmConfig.basicN, 64); + EXPECT_EQ((uint32_t)mmConfig.basicK, 64); + EXPECT_EQ((uint32_t)mmConfig.isPerTensor, 1); + EXPECT_EQ((uint32_t)mmConfig.hasAntiQuantOffset, 1); + EXPECT_EQ((uint32_t)mmConfig.isNBatch, 1); + EXPECT_EQ((uint32_t)mmConfig.batchMode, 2); + EXPECT_EQ((uint32_t)mmConfig.intrinsicsCheck, 1); + EXPECT_EQ((uint32_t)mmConfig.enVecND2NZ, 1); + EXPECT_EQ((uint32_t)mmConfig.doMTE2Preload, 1); + EXPECT_EQ((uint32_t)mmConfig.enableQuantVector, 1); + EXPECT_EQ((uint32_t)mmConfig.enableSetDefineData, 1); + EXPECT_EQ((uint32_t)mmConfig.iterateMode, 1); + 
EXPECT_EQ((uint32_t)mmConfig.enableReuse, 1); + EXPECT_EQ((uint32_t)mmConfig.enableUBReuse, 1); + EXPECT_EQ((uint32_t)mmConfig.enableL1CacheUB, 1); + EXPECT_EQ((uint32_t)mmConfig.intraBlockPartSum, 1); + EXPECT_EQ((uint32_t)mmConfig.iterateOrder, 1); + EXPECT_EQ((uint32_t)mmConfig.scheduleType, 1); + EXPECT_EQ((uint32_t)mmConfig.enableDoubleCache, 1); +} \ No newline at end of file diff --git a/tests/matmul/test_matmul_input_l1_cache.cpp b/tests/matmul/test_matmul_input_l1_cache.cpp new file mode 100644 index 00000000..5f406f60 --- /dev/null +++ b/tests/matmul/test_matmul_input_l1_cache.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/input_cache/matmul_input_l1_cache.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +namespace { + +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE) { +public: + using VAR_PARAMS = + typename MatmulParams::PARAMS; + + MatmulImpl() { + InitVar(); + } + + VAR_PARAMS& GetVar() { + return var; + } + + void AllocATensor() { + var.cacheHeadA1_ = var.qidA1Cache_.template AllocTensor(); + } + + void AllocBTensor() { + var.cacheHeadB1_ = var.qidB1Cache_.template AllocTensor(); + } + + void InitVar() { + var.tiling_ = &tiling; + var.tpipe_ = &pipe; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +class test_matmul_input_l1_cache : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + using A_TYPE = matmul::MatmulType; + using B_TYPE = matmul::MatmulType; + using C_TYPE = matmul::MatmulType; + using BIAS_TYPE = matmul::MatmulType; + + MatmulImpl mm; +}; + +TEST_F(test_matmul_input_l1_cache, first_iter_order_M) { + 
mm.ClearAL1Cache(); + mm.ClearBL1Cache(); +} \ No newline at end of file diff --git a/tests/matmul/test_matmul_iterate_controller.cpp b/tests/matmul/test_matmul_iterate_controller.cpp new file mode 100644 index 00000000..0abf29d8 --- /dev/null +++ b/tests/matmul/test_matmul_iterate_controller.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/feature_trait/matmul_feature_trait.h" +#include "impl/matmul/modules/iterator/matmul_iterate_controller.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +using A_TYPE = matmul::MatmulType; +using B_TYPE = matmul::MatmulType; +using C_TYPE = matmul::MatmulType; +using BIAS_TYPE = matmul::MatmulType; + +template +class MatmulInputL1Cache { +public: + void ClearAL1Cache() { + clearedACount++; + } + + void ClearBL1Cache() { + clearedBCount++; + } + +public: + uint32_t clearedACount {0}; + uint32_t clearedBCount {0}; +}; + +namespace { +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulIterateController, A_TYPE, B_TYPE, MatmulFeatureTrait::iterCtrlCfg)//MatmulFeatureTrait::iterCtrlCfg) +, MATMUL_IMPORT_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE) { + using VAR_PARAMS = + typename MatmulParams::PARAMS; + +public: + MatmulImpl() { + InitVar(); + } + + void SetTiling(IterateOrder order, int32_t stepM, uint32_t stepN) { + tiling.iterateOrder = static_cast(order); + tiling.stepM = stepM; + tiling.stepN = stepN; + + this->Reset(); + } + + void SetMParams(int32_t curPos, int32_t iter, int32_t stepIdx, int32_t curStep) { + var.curM_ = curPos; + var.mIter_ = iter; + var.stepMIdx_ = stepIdx; + var.curStepM_ = curStep; + } + + void SetNParams(int32_t curPos, int32_t iter, int32_t stepIdx, int32_t curStep) { + var.curN_ = curPos; + var.nIter_ = iter; + 
var.stepNIdx_ = stepIdx; + var.curStepN_ = curStep; + } + + VAR_PARAMS& GetVar() { + return var; + } + + void AllocATensor() { + var.cacheHeadA1_ = var.qidA1Cache_.template AllocTensor(); + } + + void AllocBTensor() { + var.cacheHeadB1_ = var.qidB1Cache_.template AllocTensor(); + } + + void InitVar() { + var.tiling_ = &tiling; + var.tpipe_ = &pipe; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +class test_matmul_iterator_controller : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + MatmulImpl mm; +}; + +TEST_F(test_matmul_iterator_controller, first_iter_order_M) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + + bool isFinished = mm.MoveNext(); + + ASSERT_TRUE(isFinished); + + ASSERT_EQ(mm.GetVar().curStepM_, 4); + ASSERT_EQ(mm.GetVar().curM_, 0); + ASSERT_EQ(mm.GetVar().curN_, 0); +} + +TEST_F(test_matmul_iterator_controller, first_iter_order_N) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + + bool isFinished = mm.MoveNext(); + + ASSERT_TRUE(isFinished); + + ASSERT_EQ(mm.GetVar().curStepN_, 2); + ASSERT_EQ(mm.GetVar().curN_, 0); +} + +TEST_F(test_matmul_iterator_controller, order_M_iter_four_times) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + int32_t cnt = 0; + while(mm.MoveNext()) { + cnt++; + } + + ASSERT_EQ(cnt, 8); +} + +TEST_F(test_matmul_iterator_controller, order_N_iter_four_times) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 4, 0, 0); + mm.SetNParams(0, 2, 0, 0); + int32_t cnt = 0; + while(mm.MoveNext()) { + cnt++; + } + + ASSERT_EQ(cnt, 8); +} + + +TEST_F(test_matmul_iterator_controller, order_M_iter_twice) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 1, 0, 0); + mm.SetNParams(0, 2, 0, 0); + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 0); + 
isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 1); + ASSERT_TRUE(isFinished); + isFinished = mm.MoveNext(); + ASSERT_FALSE(isFinished); + ASSERT_EQ(mm.GetVar().curM_, 0); +} + +TEST_F(test_matmul_iterator_controller, order_N_iter_twice) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 2, 0, 0); + mm.SetNParams(0, 1, 0, 0); + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 0); + isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 1); + ASSERT_TRUE(isFinished); + isFinished = mm.MoveNext(); + ASSERT_FALSE(isFinished); + ASSERT_EQ(mm.GetVar().curN_, 0); +} + +// test when n-dimension is finished in OrderM case +TEST_F(test_matmul_iterator_controller, order_M_n_is_finished) { + mm.SetTiling(IterateOrder::ORDER_M, 4, 2); + mm.SetMParams(0, 2, 0, 0); + mm.SetNParams(0, 2, 0, 0); + // first iter + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 0); + // n-dimension is finished + isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 1); + ASSERT_TRUE(isFinished); + (void)mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 0); + ASSERT_EQ(mm.GetVar().curM_, 1); +} + +// test when m-dimension is finished in OrderN case +TEST_F(test_matmul_iterator_controller, order_N_m_is_finished) { + mm.SetTiling(IterateOrder::ORDER_N, 4, 2); + mm.SetMParams(0, 2, 0, 0); + mm.SetNParams(0, 2, 0, 0); + // first iter + auto isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curN_, 0); + // n-dimension is finished + isFinished = mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 1); + ASSERT_TRUE(isFinished); + (void)mm.MoveNext(); + ASSERT_EQ(mm.GetVar().curM_, 0); + ASSERT_EQ(mm.GetVar().curN_, 1); +} \ No newline at end of file diff --git a/tests/matmul/test_matmul_l0c_buffer.cpp b/tests/matmul/test_matmul_l0c_buffer.cpp new file mode 100644 index 00000000..67e8c242 --- /dev/null +++ b/tests/matmul/test_matmul_l0c_buffer.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
+ * + * @brief load data instruction ut for ascend910B1 + * + */ +#include +#include "kernel_operator.h" +#include "lib/matmul/tiling.h" +#include "impl/matmul/modules/matmul_param.h" +#include "impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h" + +using namespace std; +using namespace AscendC; +using namespace matmul; + +#include "matmul_module_test_def.h" + +namespace { + +template +class MatmulImpl +: MATMUL_IMPORT_MODULE(MatmulL0CBuffer, EnUnitFlag(MM_CFG)) { + using VAR_PARAMS = + typename MatmulParams::PARAMS; + +public: + MatmulImpl() { + InitVar(); + } + + VAR_PARAMS& GetVar() { + return var; + } + + void SetParamKIter(int k) { + var.kIter_ = k; + } + + void InitVar() { + var.tiling_ = &tiling; + var.tpipe_ = &pipe; + var.baseMN_ = 1024; + } + +private: + TCubeTiling tiling; + TPipe pipe; + VAR_PARAMS var; +}; +} + +class test_matmul_l0c_buffer : public testing::Test { +protected: + void SetUp() {} + void TearDown() {} + +private: + using L0cT = float; + + using A_TYPE = matmul::MatmulType; + using B_TYPE = matmul::MatmulType; + using C_TYPE = matmul::MatmulType; + using BIAS_TYPE = matmul::MatmulType; + + MatmulImpl enUnitFlagMM; + MatmulImpl disUnitFlagMM; +}; + +TEST_F(test_matmul_l0c_buffer, enable_unit_flag) { + + enUnitFlagMM.InitL0CBuffer(); + auto co1Local = enUnitFlagMM.LoadL0CTensor(); + auto co1Local1 = enUnitFlagMM.AllocL0CLocalTensor(); + enUnitFlagMM.FreeL0CLocalTensor(co1Local1); + enUnitFlagMM.ResetL0CEventStatus(); + + enUnitFlagMM.SetParamKIter(1); + ASSERT_TRUE(enUnitFlagMM.IsL0CLastIter(0)); + + ASSERT_EQ(enUnitFlagMM.GetMmadUnitFlagCtrl(true), 3); + ASSERT_EQ(enUnitFlagMM.GetMmadUnitFlagCtrl(false), 2); + + FixpipeParams fixpipeParams; + enUnitFlagMM.SetFixpipeUnitFlag(fixpipeParams); + ASSERT_EQ(fixpipeParams.unitFlag, 3); +} + +TEST_F(test_matmul_l0c_buffer, disable_unit_flag) { + + disUnitFlagMM.InitL0CBuffer(); + disUnitFlagMM.GetVar().cMatrix_ = disUnitFlagMM.LoadL0CTensor(); + auto co1Local1 = 
disUnitFlagMM.AllocL0CLocalTensor(); + disUnitFlagMM.FreeL0CLocalTensor(co1Local1); + disUnitFlagMM.ResetL0CEventStatus(); + + disUnitFlagMM.SetParamKIter(1); + ASSERT_FALSE(disUnitFlagMM.IsL0CLastIter(0)); + + ASSERT_EQ(disUnitFlagMM.GetMmadUnitFlagCtrl(true), 0); + ASSERT_EQ(disUnitFlagMM.GetMmadUnitFlagCtrl(false), 0); + + FixpipeParams fixpipeParams; + disUnitFlagMM.SetFixpipeUnitFlag(fixpipeParams); + ASSERT_EQ(fixpipeParams.unitFlag, 0); +} \ No newline at end of file diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 219b8605..d9cfa951 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -77,6 +77,25 @@ TEST_F(TestTiling, PlatformConstructor) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestInt4BaseK) +{ + matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, + .l0CSize = 131072, .ubSize = 196608, .l0ASize = 65536, .l0BSize = 65536}; + MatmulApiTiling tiling(plat); + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); + tiling.SetBias(false); + tiling.SetShape(144, 256, 32); + tiling.SetOrgShape(144, 256, 32); + tiling.SetBufferSpace(256 * 1024, 128 * 1024, -1); + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(tilingData.get_baseK() % 64, 0); + EXPECT_EQ(ret, 0); +} + TEST_F(TestTiling, Tiling_310p_NotAligned) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND310P, .l1Size = 1048576, @@ -723,6 +742,105 @@ TEST_F(TestTiling, TestSetBufferSpace) EXPECT_EQ(tiling.bufferPool_.l1Size, 1024); } +TEST_F(TestTiling, TestCosTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 
0; + AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 3); + AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetCosTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestCosTilingFloat512) +{ + std::vector shapeDims = { 512 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestCosTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto cosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetCosMaxMinTmpSize(cosShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); + EXPECT_EQ(minValue, 256 * 8); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetCosTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAtanTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAtanMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 5); + EXPECT_EQ(minValue, 256 * 5); +} + +TEST_F(TestTiling, TestAtanTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAtanMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 12); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 12); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + 
GetAtanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 12); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestClampTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetClampMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 1); + EXPECT_EQ(minValue, 64 * 1); +} + +TEST_F(TestTiling, TestClampTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto atanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetClampMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1); + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + GetClampTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TestSoftMaxTiling) { std::vector shapeDims = { 128, 128 }; @@ -836,7 +954,6 @@ TEST_F(TestTiling, TestSoftMaxFlashV2Tiling) SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); EXPECT_EQ(tilingData.get_reduceM(), 64); } - TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) { std::vector shapeDims = { 8, 1024 }; @@ -865,6 +982,163 @@ TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) EXPECT_EQ(tilingData.get_reduceM(), 8); } +TEST_F(TestTiling, TestAsinTmpBufferFacotrHalfWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAsinTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 6); + EXPECT_EQ(extraBuffer, 0); +} + +TEST_F(TestTiling, TestAsinTmpBufferFacotrFloatWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAsinTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 2); + EXPECT_EQ(extraBuffer, 0); +} + +TEST_F(TestTiling, 
TestAsinTilingHalf128) +{ + std::vector shapeDims = { 128 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 256 * 6); + EXPECT_EQ(minValue, 256 * 6); +} + +TEST_F(TestTiling, TestAsinTilingFloat) +{ + std::vector shapeDims = { 32 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 256 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestAsinTilingHalf16K) +{ + std::vector shapeDims = { 128, 128 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); + EXPECT_EQ(minValue, 256 * 6); +} + +TEST_F(TestTiling, TestAsinTilingFloat16K) +{ + std::vector shapeDims = { 128, 128 }; + auto asinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestSinhTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto sinhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinhMaxMinTmpSize(sinhShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSinhTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto sinhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinhMaxMinTmpSize(sinhShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 4); + 
EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestRoundTiling) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + std::vector shapeDims = { 128, 128 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + uint32_t maxLiveNodeCnt = 0; + uint32_t extraBuf = 0; + platform_ascendc::SocVersion socVersion = plat.GetSocVersion(); + GetRoundMaxMinTmpSize(plat, tanShape, 4, false, maxValue, minValue); + GetRoundTmpBufferFactorSize(plat, 4, maxLiveNodeCnt, extraBuf); + GetRoundMaxMinTmpSize(plat, tanShape, 2, false, maxValue, minValue); + GetRoundTmpBufferFactorSize(plat, 2, maxLiveNodeCnt, extraBuf); +} + +TEST_F(TestTiling, TestTanTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 4); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestTanTilingFloat512) +{ + std::vector shapeDims = { 512 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 4); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestTanTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto tanShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTanMaxMinTmpSize(tanShape, 2, false, maxValue, minValue); 
+ EXPECT_EQ(maxValue, 128 * 128 * 10 * 2); + EXPECT_EQ(minValue, 256 * 10); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 10); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TEstSwiGLUTilingHalf) { std::vector shapeDims = {10, 512}; @@ -920,103 +1194,530 @@ TEST_F(TestTiling, TestSwiGLUFactorHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestSigmoidTiling) +TEST_F(TestTiling, TestFmodTilingFloat) { - std::vector shapeDims = { 128 }; - auto sigmoidShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - GetSigmoidMaxMinTmpSize(sigmoidShape, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 128 * 4); - EXPECT_EQ(minVal, 256); + std::vector shapeDims = { 128, 128 }; + auto fmodShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFmodMaxMinTmpSize(fmodShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); } -TEST_F(TestTiling, TestLayernormTiling) +TEST_F(TestTiling, TestFmodTilingHalf) { - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; - auto layernormShape = ge::Shape(shapeDims); - const bool isReuseSource = false; - optiling::LayerNormTiling tilling; - - uint32_t minValue = 0; + std::vector shapeDims = { 128, 128 }; + auto fmodShape = ge::Shape(shapeDims); uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFmodMaxMinTmpSize(fmodShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 128 * 3 * 4); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); +} - AscendC::GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, maxValue, minValue); - EXPECT_EQ(maxValue, 3 * (128 * 128 * 128) * typeSize + 2 * (128 * 128) * typeSize); - EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); +TEST_F(TestTiling, TestTruncTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + 
auto truncShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTruncMaxMinTmpSize(truncShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); - EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTruncTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestRmsnormTiling) +TEST_F(TestTiling, TestTruncTilingHalf) { - constexpr uint32_t bLength = 4; - constexpr uint32_t sLength = 32; - constexpr uint32_t hLength = 16; - constexpr uint32_t bsLength = bLength * sLength; - constexpr uint32_t bshLength = bLength * sLength * hLength; - std::vector shapeDims = {bLength, sLength, hLength}; - auto shape = ge::Shape(shapeDims); - constexpr uint32_t typeSize = 4; - constexpr uint32_t ONE_BLK_FLOAT = 8; - + std::vector shapeDims = { 128, 128 }; + auto truncShape = ge::Shape(shapeDims); uint32_t maxValue = 0; uint32_t minValue = 0; - // common scene - bool res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue); - const uint32_t goldenMax = (bshLength + bsLength) * typeSize; - uint32_t goldenMin = (hLength + ONE_BLK_FLOAT) * typeSize; - EXPECT_EQ(res, true); - EXPECT_EQ(maxValue, goldenMax); - EXPECT_EQ(minValue, goldenMin); + GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - // basic block scene 1: input shape is illegal, fail to get minSize - res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue, true); - EXPECT_EQ(res, false); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetTruncTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} - constexpr 
uint32_t BASIC_BLK_HLENGTH = 64; - constexpr uint32_t BASIC_BLK_BSLENGTH = 8; - shapeDims[2] = BASIC_BLK_HLENGTH; - auto shape_basic_blk = ge::Shape(shapeDims);// 4,32,64 - // basic block scene 2: get minSize successfully - res = AscendC::GetRmsNormMaxMinTmpSize(shape_basic_blk, typeSize, maxValue, minValue, true); - goldenMin = (64 + 8) * typeSize; - EXPECT_EQ(res, true); - EXPECT_EQ(minValue, goldenMin); +TEST_F(TestTiling, TestTruncTilingHalf512) +{ + std::vector shapeDims = { 512 }; + auto truncShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 2); + EXPECT_EQ(minValue, 256 * 2); +} - // basic block scene: get basic block using minTmpSize - // goldenMin should be (BASIC_BLK_HLENGTH(64) * BASIC_BLK_BSLENGTH(8) + bsLength) * typeSize - optiling::RmsNormTiling tiling; - uint32_t tmpSize = (64 + 8) * 4; // shape: 4,32,64 - res = AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, minValue, typeSize, tiling, true); - EXPECT_EQ(res, true); - EXPECT_EQ(tiling.get_mainBshLength(), 64); - EXPECT_EQ(tiling.get_mainBsLength(), 1); +TEST_F(TestTiling, TestAcosTmpBufferFacotrHalfWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAcosTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 6); + EXPECT_EQ(extraBuffer, 0); +} - auto shape1 = ge::Shape({1,7,16}); - res = AscendC::GetRmsNormMaxMinTmpSize(shape1, typeSize, maxValue, minValue); - goldenMin = (8 + 16) * typeSize; - EXPECT_EQ(minValue, goldenMin); - uint32_t stackBufferSize = 100 * 1024; - // common scene: get tiling info successfully, shape: 1,7,16 - res = AscendC::GetRmsNormTilingInfo(shape1, shape1, stackBufferSize, typeSize, tiling); - EXPECT_EQ(res, true); - EXPECT_EQ(tiling.get_mainBshLength(), 1*7*16); - EXPECT_EQ(tiling.get_mainBsLength(), 7); +TEST_F(TestTiling, 
TestAcosTmpBufferFacotrFloatWithoutBasicBlock) { + uint32_t maxLivedNodes = 0xffff; + uint32_t extraBuffer = 0xffff; + GetAcosTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); + EXPECT_EQ(maxLivedNodes, 2); + EXPECT_EQ(extraBuffer, 0); +} - stackBufferSize = hLength; - // common scene: fail to get tiling info because of small stack buffer - res = AscendC::GetRmsNormTilingInfo(shape, shape, stackBufferSize, typeSize, tiling); - EXPECT_EQ(res, false); +TEST_F(TestTiling, TestAcosTilingHalf128) +{ + std::vector shapeDims = { 128 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 6); + EXPECT_EQ(maxValue, 256 * 6); +} - // basic block scene: get basic block tiling info successfully - stackBufferSize = 100 * 1024; // shape: 4,32,64 - res = AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, stackBufferSize, typeSize, tiling, true); +TEST_F(TestTiling, TestAcosTilingFloat) +{ + std::vector shapeDims = { 32 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 256 * 2); +} + +TEST_F(TestTiling, TestTanhTiling) +{ + uint32_t maxVal = 0; + uint32_t minVal = 0; + GetTanhMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 128 * 4 * 1); + EXPECT_EQ(minVal, 256 * 1); + GetTanhMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4); + EXPECT_EQ(minVal, 256 * 4); + uint32_t extraBuf = 123; + uint32_t maxLivedNodesCnt = 123; + GetTanhTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 1); + GetTanhTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 4); +} + +TEST_F(TestTiling, TestSigmoidTiling) +{ + 
std::vector shapeDims = { 128 }; + auto sigmoidShape = ge::Shape(shapeDims); + uint32_t maxVal; + uint32_t minVal; + GetSigmoidMaxMinTmpSize(sigmoidShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 128 * 4); + EXPECT_EQ(minVal, 256); +} + +TEST_F(TestTiling, TestLogTilingMaxMin) +{ + std::vector shapeDims = { 128 }; + auto logShape = ge::Shape(shapeDims); + uint32_t maxVal; + uint32_t minVal; + GetLogMaxMinTmpSize(logShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); + GetLog2MaxMinTmpSize(logShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); + GetLog2MaxMinTmpSize(logShape, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 4 * 128); + EXPECT_EQ(minVal, 256); + GetLog10MaxMinTmpSize(logShape, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); +} + +TEST_F(TestTiling, TestLogTilingFactor) +{ + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetLogTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetLog10TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); + GetLog2TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); + GetLog2TmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestPowerTiling) +{ + std::vector shapeDims = { 512 }; + auto powerShape = ge::Shape(shapeDims); + uint32_t maxVal; + uint32_t minVal; + GetPowerMaxMinTmpSize(powerShape, powerShape, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 4 + 256); + EXPECT_EQ(minVal, 256 * 4 + 256); + GetPowerMaxMinTmpSize(powerShape, powerShape, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 6); + EXPECT_EQ(minVal, 256 * 6); + GetPowerMaxMinTmpSize(powerShape, powerShape, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); + 
EXPECT_EQ(minVal, 256 * 7 + 256); + std::vector scalar_shape = { 1 }; + auto scalarShape = ge::Shape(scalar_shape); + GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); + EXPECT_EQ(minVal, 256 * 7 + 256); + GetPowerMaxMinTmpSize(powerShape, scalarShape, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 7); + EXPECT_EQ(minVal, 256 * 7); + GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 512 * 4 * 5 + 256); + EXPECT_EQ(minVal, 256 * 5 + 256); + + std::vector shape1 = { 16 }; + auto powerShape1 = ge::Shape( shape1 ); + GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 5 + 256); + EXPECT_EQ(minVal, 256 * 5 + 256); + GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 7 + 256); + EXPECT_EQ(minVal, 256 * 7 + 256); + GetPowerMaxMinTmpSize(powerShape1, scalarShape, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 7); + EXPECT_EQ(minVal, 256 * 7); + GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4 + 256); + EXPECT_EQ(minVal, 256 * 4 + 256); + GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 7 + 256); + EXPECT_EQ(minVal, 256 * 7 + 256); + GetPowerMaxMinTmpSize(powerShape1, powerShape1, true, 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 6); + EXPECT_EQ(minVal, 256 * 6); +} + +TEST_F(TestTiling, TestPowerTilingFactorSize) +{ + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetPowerTmpBufferFactorSize(false, true, false, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 5); + EXPECT_EQ(extraBuf, 256); + GetPowerTmpBufferFactorSize(false, true, true, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 7); + EXPECT_EQ(extraBuf, 0); + 
GetPowerTmpBufferFactorSize(false, true, false, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 14); + EXPECT_EQ(extraBuf, 256); + GetPowerTmpBufferFactorSize(true, true, false, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 256); + GetPowerTmpBufferFactorSize(true, true, true, 4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 6); + EXPECT_EQ(extraBuf, 0); + GetPowerTmpBufferFactorSize(true, true, false, 2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 14); + EXPECT_EQ(extraBuf, 256); +} + +TEST_F(TestTiling, TestAcosTilingHalf16K) +{ + std::vector shapeDims = { 128, 128 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); + EXPECT_EQ(minValue, 256 * 6); +} + +TEST_F(TestTiling, TestAcosTilingFloat16K) +{ + std::vector shapeDims = { 128, 128 }; + auto acosShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestAsinhTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto asinhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAsinhMaxMinTmpSize(asinhShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); + + AscendC::GetAsinhMaxMinTmpSize(ge::Shape({32}), 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + EXPECT_EQ(maxValue, 256 * 3); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAsinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAsinhTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto asinhShape = ge::Shape(shapeDims); + uint32_t 
maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAsinhMaxMinTmpSize(asinhShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 3); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAsinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAcoshTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto acoshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAcoshMaxMinTmpSize(acoshShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetAcoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAcoshTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto acoshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAcoshMaxMinTmpSize(acoshShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAcoshTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxis) +{ + const auto shape = ge::Shape({ 8, 128 }); + const auto scalarShape = ge::Shape({1}); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); +} + +TEST_F(TestTiling, 
TestSelectWithBytesMaskTilingSameAxisLargeShape) +{ + const auto shape = ge::Shape({ 128, 128 }); + const auto scalarShape = ge::Shape({1}); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxisSmallShape) +{ + const auto shape = ge::Shape({ 1, 16 }); + const auto scalarShape = ge::Shape({1}); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxis) +{ + const auto srcShape = ge::Shape({ 8, 128 }); + const auto scalarShape = ge::Shape({1}); + const auto maskShape = ge::Shape({ 8, 160 }); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512 + 8 * 128); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512 + 8 * 128); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 128 * 8 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxisLargeShape) +{ + const auto srcShape = 
ge::Shape({ 128, 128 }); + const auto scalarShape = ge::Shape({1}); + const auto maskShape = ge::Shape({ 128, 160 }); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512 + 128 * 128); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512 + 128 * 128); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 4096 * 2 + 512); + EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); +} + +TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxisSmallShape) +{ + const auto srcShape = ge::Shape({ 1, 16 }); + const auto scalarShape = ge::Shape({1}); + const auto maskShape = ge::Shape({ 1, 32 }); + uint32_t maxValue; + uint32_t minValue; + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); + GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); + EXPECT_EQ(minValue, 1024 + 32); + EXPECT_EQ(maxValue, 1024 + 32); + GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 1024); +} + +TEST_F(TestTiling, TestLayernormTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; + + std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; + auto layernormShape = ge::Shape(shapeDims); + const bool isReuseSource = false; + optiling::LayerNormTiling tilling; + + uint32_t minValue = 0; + uint32_t maxValue = 0; + + AscendC::GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, maxValue, minValue); + EXPECT_EQ(maxValue, 3 * (128 * 
128 * 128) * typeSize + 2 * (128 * 128) * typeSize); + EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); + + AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); + EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); +} + +TEST_F(TestTiling, TestRmsnormTiling) +{ + constexpr uint32_t bLength = 4; + constexpr uint32_t sLength = 32; + constexpr uint32_t hLength = 16; + constexpr uint32_t bsLength = bLength * sLength; + constexpr uint32_t bshLength = bLength * sLength * hLength; + std::vector shapeDims = {bLength, sLength, hLength}; + auto shape = ge::Shape(shapeDims); + constexpr uint32_t typeSize = 4; + constexpr uint32_t ONE_BLK_FLOAT = 8; + + uint32_t maxValue = 0; + uint32_t minValue = 0; + // common scene + bool res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue); + const uint32_t goldenMax = (bshLength + bsLength) * typeSize; + uint32_t goldenMin = (hLength + ONE_BLK_FLOAT) * typeSize; + EXPECT_EQ(res, true); + EXPECT_EQ(maxValue, goldenMax); + EXPECT_EQ(minValue, goldenMin); + + // basic block scene 1: input shape is illegal, fail to get minSize + res = AscendC::GetRmsNormMaxMinTmpSize(shape, typeSize, maxValue, minValue, true); + EXPECT_EQ(res, false); + + constexpr uint32_t BASIC_BLK_HLENGTH = 64; + constexpr uint32_t BASIC_BLK_BSLENGTH = 8; + shapeDims[2] = BASIC_BLK_HLENGTH; + auto shape_basic_blk = ge::Shape(shapeDims);// 4,32,64 + // basic block scene 2: get minSize successfully + res = AscendC::GetRmsNormMaxMinTmpSize(shape_basic_blk, typeSize, maxValue, minValue, true); + goldenMin = (64 + 8) * typeSize; + EXPECT_EQ(res, true); + EXPECT_EQ(minValue, goldenMin); + + // basic block scene: get basic block using minTmpSize + // goldenMin should be (BASIC_BLK_HLENGTH(64) * BASIC_BLK_BSLENGTH(8) + bsLength) * typeSize + optiling::RmsNormTiling tiling; + uint32_t tmpSize = (64 + 8) * 4; // shape: 4,32,64 + res = 
AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, minValue, typeSize, tiling, true); + EXPECT_EQ(res, true); + EXPECT_EQ(tiling.get_mainBshLength(), 64); + EXPECT_EQ(tiling.get_mainBsLength(), 1); + + auto shape1 = ge::Shape({1,7,16}); + res = AscendC::GetRmsNormMaxMinTmpSize(shape1, typeSize, maxValue, minValue); + goldenMin = (8 + 16) * typeSize; + EXPECT_EQ(minValue, goldenMin); + + uint32_t stackBufferSize = 100 * 1024; + // common scene: get tiling info successfully, shape: 1,7,16 + res = AscendC::GetRmsNormTilingInfo(shape1, shape1, stackBufferSize, typeSize, tiling); + EXPECT_EQ(res, true); + EXPECT_EQ(tiling.get_mainBshLength(), 1*7*16); + EXPECT_EQ(tiling.get_mainBsLength(), 7); + + stackBufferSize = hLength; + // common scene: fail to get tiling info because of small stack buffer + res = AscendC::GetRmsNormTilingInfo(shape, shape, stackBufferSize, typeSize, tiling); + EXPECT_EQ(res, false); + + // basic block scene: get basic block tiling info successfully + stackBufferSize = 100 * 1024; // shape: 4,32,64 + res = AscendC::GetRmsNormTilingInfo(shape_basic_blk, shape_basic_blk, stackBufferSize, typeSize, tiling, true); EXPECT_EQ(res, true); EXPECT_EQ(tiling.get_mainBshLength(), 4*32*64); @@ -1232,6 +1933,39 @@ TEST_F(TestTiling, TestDeepnormTiling) EXPECT_EQ(tiling.get_oneTmpSize(), 512); } +TEST_F(TestTiling, TestExpTiling) +{ + std::vector shapeDims = {128, 128}; + auto expShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + + // float isReuseSrc = false 3 tmpBuffer + AscendC::GetExpMaxMinTmpSize(expShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 3 * 256); + EXPECT_EQ(maxValue, 3 * 128 * 128 * 4); + // float isReuseSrc = true 2 tmpBuffer + AscendC::GetExpMaxMinTmpSize(expShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 2 * 256); + EXPECT_EQ(maxValue, 2 * 128 * 128 * 4); + // half 4 tmpBuffer + AscendC::GetExpMaxMinTmpSize(expShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 4 * 
256); + EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); + AscendC::GetExpMaxMinTmpSize(expShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 4 * 256); + EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetExpTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); + AscendC::GetExpTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TestMatmulApiTilngFactorSplit1) { MatmulApiTiling tiling; @@ -1573,6 +2307,7 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM4) EXPECT_EQ(res, 0); } + TEST_F(TestTiling, TestMatmulApiTilngSingleCoreFullLoadCase) { optiling::TCubeTiling tilingData; @@ -1614,13 +2349,66 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM5) EXPECT_EQ(res, 0); } -TEST_F(TestTiling, TestLayernormGradTiling) +TEST_F(TestTiling, TestConcatTiling) { - const uint32_t stackBufferSize = 100 * 1024; + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + const uint32_t elemCount = 128; - std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; - auto layernormgradShape = ge::Shape(shapeDims); - optiling::LayerNormGradTiling tiling; + AscendC::GetConcatTmpSize(plat, elemCount, 2); +} + +TEST_F(TestTiling, TestSortTiling) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + const uint32_t elemCount = 128; + + AscendC::GetSortTmpSize(plat, elemCount, 4); +} + +TEST_F(TestTiling, TestUnPadTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; + + std::vector shapeDims = { 32, 32 }; + auto srcShape = ge::Shape(shapeDims); + optiling::UnPadTiling tiling; + + AscendC::UnPadTilingFunc(srcShape, 0, typeSize, tiling); + AscendC::UnPadTilingFunc(srcShape, stackBufferSize, typeSize, tiling); + fe::PlatFormInfos platform_info; + auto plat = 
platform_ascendc::PlatformAscendC(&platform_info); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetUnPadMaxMinTmpSize(plat, srcShape, typeSize, maxValue, minValue); +} + +TEST_F(TestTiling, TestPadTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; + + std::vector shapeDims = { 32, 32}; + std::vector ori_shape_dims = { 32, 31 }; + auto srcShape = ge::Shape(shapeDims); + auto oriSrcShape = ge::Shape(ori_shape_dims); + optiling::PadTiling tiling; + + AscendC::PadTilingFunc(srcShape, oriSrcShape, stackBufferSize, typeSize, tiling); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetPadMaxMinTmpSize(srcShape, typeSize, maxValue, minValue); +} + +TEST_F(TestTiling, TestLayernormGradTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + + std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; + auto layernormgradShape = ge::Shape(shapeDims); + optiling::LayerNormGradTiling tiling; AscendC::GetLayerNormGradNDTilingInfo(layernormgradShape, stackBufferSize, 4, false, tiling); EXPECT_EQ(tiling.get_stackBufferSize(), stackBufferSize); @@ -1665,6 +2453,23 @@ TEST_F(TestTiling, TestLayernormGradBetaTiling) EXPECT_EQ(tiling.get_stackBufferSize(), stackBufferSize / sizeof(float)); } +TEST_F(TestTiling, TestConfusionTransposeTiling) +{ + const uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 2; + + std::vector shapeDims = { 1, 2, 64, 32 }; + auto srcShape = ge::Shape(shapeDims); + optiling::ConfusionTransposeTiling tiling; + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 1, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 2, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 3, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 4, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 5, tiling); + 
AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 6, tiling); + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 7, tiling); +} + TEST_F(TestTiling, TestMatmulApiTilngL0BNoDB) { MatmulApiTiling tiling; @@ -2001,6 +2806,129 @@ TEST_F(TestTiling, TestMatmulApiTilngSetShapeZero) EXPECT_EQ(ret, -1); } +// #if __CCE_AICORE__ == 200 +// TEST_F(TestTiling, TestPlatformAscendC) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); +// EXPECT_EQ(plat.GetCoreNumVector(), 8); +// EXPECT_EQ(plat.GetCoreNumVector() + plat.GetCoreNumAic() , 18); +// } +// #endif + +// #if __CCE_AICORE__ == 220 +// extern void platfrom_stub_set_num_aic(const char *num); +// extern void platfrom_stub_set_num_aiv(const char *num); +// extern void platfrom_stub_set_num_cub(const char *num); +// extern void platfrom_stub_set_ctl(const char *num); +// extern void platfrom_stub_set_chip_version(const char *num); +// extern void platfrom_stub_set_num(uint32_t num); +// TEST_F(TestTiling, TestPlatformAscendC) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); +// uint64_t ub_size, l1_size, l0; +// uint64_t l2_bw, hbm_bw, bw; +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); +// EXPECT_EQ(ub_size, 196352); +// EXPECT_EQ(l1_size, 524032); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); +// EXPECT_EQ(l0, 65536 * 2); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); +// EXPECT_EQ(l2_bw, 110); +// EXPECT_EQ(hbm_bw, 32); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, 
bw); +// EXPECT_EQ(plat.GetCoreNum(), 48); +// EXPECT_EQ(plat.GetCoreNumAic(), 24); +// EXPECT_EQ(plat.GetCoreNumAiv(), 48); +// platfrom_stub_set_num_cub("20"); +// EXPECT_EQ(plat.GetCoreNumAic(), 20); +// platfrom_stub_set_num_aiv("40"); +// EXPECT_EQ(plat.GetCoreNumAiv(), 40); +// platfrom_stub_set_ctl("AICore"); +// EXPECT_EQ(plat.GetCoreNumAic(), 24); +// EXPECT_EQ(plat.GetCoreNumAiv(), 24); +// platfrom_stub_set_num_aic("20"); +// EXPECT_EQ(plat.GetCoreNumAic(), 20); +// EXPECT_EQ(plat.GetCoreNumAiv(), 20); +// EXPECT_EQ(bw, 0); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); + +// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 16 * 1024 * 1024); +// platfrom_stub_set_chip_version("Ascend910"); +// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2 * 1024 * 1024); +// EXPECT_EQ(plat.GetSocVersion(), platform_ascendc::SocVersion::ASCEND910); +// EXPECT_EQ(plat.GetCoreNumVector(), 0); +// } +// #endif + +// #if __CCE_AICORE__ == 300 +// extern void platfrom_stub_set_num_aic(const char *num); +// extern void platfrom_stub_set_num_aiv(const char *num); +// extern void platfrom_stub_set_num_cub(const char *num); +// extern void platfrom_stub_set_ctl(const char *num); +// extern void platfrom_stub_set_chip_version(const char *num); +// extern void platfrom_stub_set_num(uint32_t num); +// TEST_F(TestTiling, TestPlatformAscendC) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); +// uint64_t ub_size, l1_size, l0; +// uint64_t l2_bw, hbm_bw, bw; +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); +// EXPECT_EQ(ub_size, 248 * 1024); +// 
EXPECT_EQ(l1_size, 1024 * 1024); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); +// EXPECT_EQ(l0, 65536); +// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); +// EXPECT_EQ(l0, 65536 * 2); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); +// EXPECT_EQ(l2_bw, 256); +// EXPECT_EQ(hbm_bw, 17); +// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, bw); +// EXPECT_EQ(plat.GetCoreNum(), 1); +// EXPECT_EQ(plat.GetCoreNumAic(), 1); +// EXPECT_EQ(plat.GetCoreNumAiv(), 1); +// platfrom_stub_set_num_cub("1"); +// EXPECT_EQ(plat.GetCoreNumAic(), 1); +// platfrom_stub_set_num_aiv("1"); +// EXPECT_EQ(plat.GetCoreNumAiv(), 1); +// platfrom_stub_set_ctl("AICore"); +// EXPECT_EQ(plat.GetCoreNumAic(), 1); +// EXPECT_EQ(plat.GetCoreNumAiv(), 1); +// platfrom_stub_set_num_aic("2"); +// EXPECT_EQ(plat.GetCoreNumAic(), 2); +// EXPECT_EQ(plat.GetCoreNumAiv(), 2); +// EXPECT_EQ(bw, 0); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); +// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); +// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); + +// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2097152); +// EXPECT_EQ(plat.GetCoreNumVector(), 0); +// } +// #endif + TEST_F(TestTiling, TestMatmulApiTilngInt8Case1) { MatmulApiTiling tiling; @@ -2279,6 +3207,60 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case9) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestErfTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto erfShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfMaxMinTmpSize(erfShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); + 
EXPECT_EQ(minValue, 256 * 3); +} + +TEST_F(TestTiling, TestErfTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto erfShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfMaxMinTmpSize(erfShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 8); + EXPECT_EQ(minValue, 256 * 8); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetErfTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestErfcTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto erfcShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfcMaxMinTmpSize(erfcShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 7 * 4); + EXPECT_EQ(minValue, 256 * 7); +} + +TEST_F(TestTiling, TestErfcTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto erfcShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetErfcMaxMinTmpSize(erfcShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 16); + EXPECT_EQ(minValue, 256 * 16); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetErfcTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 16); + EXPECT_EQ(extraBuf, 0); +} + TEST_F(TestTiling, TestMatmulApiTilngInt8Case10) { MultiCoreMatmulTiling tiling; @@ -2357,6 +3339,95 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case13) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestCoshTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto coshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestCoshTilingFloat512) +{ + std::vector shapeDims = { 512 }; + auto coshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + 
GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 4 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestCoshTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto coshShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCoshMaxMinTmpSize(coshShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 6); + EXPECT_EQ(minValue, 256 * 6); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetCoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 6); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSinTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto sinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinMaxMinTmpSize(sinShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 2 * 256); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + GetSinMaxMinTmpSize(sinShape, 4, false, maxValue, minValue); + EXPECT_EQ(minValue, 3 * 256); + EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSinTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto sinShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetSinMaxMinTmpSize(sinShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); + EXPECT_EQ(minValue, 8 * 256); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSinTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAscendSumTiling) +{ + uint32_t n = 8; + uint32_t maxValue; + uint32_t minValue; + GetSumMaxMinTmpSize(n, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); + + maxValue = 0; + minValue = 0; + GetSumMaxMinTmpSize(n, 4, false, 
maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); +} + TEST_F(TestTiling, TestAscendSiluTiling) { std::vector shapeDims = { 512 }; @@ -2379,6 +3450,54 @@ TEST_F(TestTiling, TestAscendSwishTiling) EXPECT_EQ(maxValue, 0); } +TEST_F(TestTiling, TestAscendXorTiling) +{ + std::vector shapeDims = { 128, 128 }; + auto xorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetXorMaxMinTmpSize(xorShape, 2, true, maxValue, minValue); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 2); + EXPECT_EQ(minValue, 1 * 256); + GetXorTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFracTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto fracShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFracMaxMinTmpSize(fracShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFracTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFracTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto fracShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFracMaxMinTmpSize(fracShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 1024); + EXPECT_EQ(maxValue, 131072); + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFracTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + #if __CCE_AICORE__ == 220 extern void platfrom_stub_set_chip_version(const char *num); TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Float_Inner64) @@ -2861,6 +3980,15 @@ TEST_F(TestTiling, TestTopkTiling_TopKModeSmall310P_HALF) } #endif +TEST_F(TestTiling, TestArithProgression) +{ + uint32_t maxValue; + uint32_t minValue; + 
GetArithProgressionMaxMinTmpSize(maxValue, minValue); + EXPECT_EQ(maxValue, 0); + EXPECT_EQ(minValue, 0); +} + TEST_F(TestTiling, TestGeGLUTilingFloat) { std::vector shapeDims = { 128, 128 }; @@ -2895,6 +4023,566 @@ TEST_F(TestTiling, TestGeGLUTilingHalf) EXPECT_EQ(extraBuf, 0); } +TEST_F(TestTiling, TestLgammaTilingFp32) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + GetLgammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); + EXPECT_EQ(maxSize, 458752); + EXPECT_EQ(minSize, 1792); + + GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 524288); + EXPECT_EQ(minSize, 2048); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 2048); + EXPECT_EQ(minSize, 2048); + + GetLgammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); + EXPECT_EQ(maxSize, 1792); + EXPECT_EQ(minSize, 1792); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetLgammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 8); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestLgammaTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + + GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 128 * 128 * 2 * 13 * 2); + EXPECT_EQ(minSize, 13 * 2 * 256); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 256 * 13 * 2); + EXPECT_EQ(minSize, 256 * 13 * 2); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetLgammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 13); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestDigammaTilingFp32) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + 
uint32_t minSize; + GetDigammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); + EXPECT_EQ(maxSize, 393216); + EXPECT_EQ(minSize, 1536); + + GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 458752); + EXPECT_EQ(minSize, 1792); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); + EXPECT_EQ(maxSize, 1792); + EXPECT_EQ(minSize, 1792); + + GetDigammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); + EXPECT_EQ(maxSize, 1536); + EXPECT_EQ(minSize, 1536); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetDigammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 7); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestDigammaTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + + GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 128 * 128 * 2 * 8 * 2); + EXPECT_EQ(minSize, 8 * 2 * 256); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 256 * 8 * 2); + EXPECT_EQ(minSize, 256 * 8 * 2); + + uint32_t maxLiveNodeCnt = 0xffff; + uint32_t extraBuf = 0xffff; + GetDigammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 16); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAtanhTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto aTanhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 4 * 1); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAtanhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAtanhTilingHalf) +{ + std::vector 
shapeDims = { 128, 128 }; + auto aTanhShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 4); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + AscendC::GetAtanhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 4); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestSignTiling) +{ + std::vector shapeDims = { 128, 128 }; + auto signShape = ge::Shape(shapeDims); + uint32_t signNeedMaxSize; + uint32_t signNeedMinSize; + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetSignMaxMinTmpSize(signShape, 2, false, signNeedMaxSize, signNeedMinSize); + EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 2); + EXPECT_EQ(signNeedMinSize, 3 * 256); + + GetSignMaxMinTmpSize(signShape, 4, false, signNeedMaxSize, signNeedMinSize); + EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 4); + EXPECT_EQ(signNeedMinSize, 3 * 256); + + GetSignTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 3); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestAscendMeanTiling) +{ + uint32_t n = 8; + uint32_t maxValue; + uint32_t minValue; + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + + GetMeanMaxMinTmpSize(n, 2, 2, true, maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); + + maxValue = 0; + minValue = 0; + GetMeanMaxMinTmpSize(n, 4, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 32); + EXPECT_EQ(maxValue, 32); + + GetMeanMaxMinTmpSize(n, 2, 4, true, maxValue, minValue); + EXPECT_EQ(minValue, 96); + EXPECT_EQ(maxValue, 96); + + GetMeanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +// TEST_F(TestTiling, TestKernelContextBuildBase) +// { +// auto builder = context_ascendc::BuildKernelRunContext(2, 2); +// EXPECT_EQ(builder.kernelInputNum, 2); +// } + + +// TEST_F(TestTiling, 
TestKernelContextBuild) +// { +// gert::Shape input1_shape = {2, 1, 1, 1, 1, 1, 1, 2, 2}; +// int32_t input1_tensor_buffer[] = {0, 2, 3, 3, 1, 0, 0, 1}; +// gert::TensorData input1_tensor_data{(void*)input1_tensor_buffer, nullptr}; +// gert::Shape output_shape = {5, 3}; +// int64_t output_tensor_buffer[15]; +// gert::TensorData output_tensor_data{(void*)output_tensor_buffer, nullptr}; +// auto kernelHolder = +// context_ascendc::KernelRunContextBuilder() +// .KernelIONum(2, 2) +// .Inputs({reinterpret_cast(&input1_shape), +// reinterpret_cast(&input1_tensor_data)}) +// .Outputs({reinterpret_cast(&output_shape), reinterpret_cast(&output_tensor_data)}) +// .NodeIoNum(1, 1) +// .IrInputNum(1) +// .NodeInputTd(0, ge::DT_INT32, ge::FORMAT_ND, ge::FORMAT_ND) +// .NodeOutputTd(0, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND) +// .Build(); +// auto context = kernelHolder.GetContext(); +// EXPECT_NE(context, nullptr); +// } + +// TEST_F(TestTiling, TestTilingContextBuildWithConstValue) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; +// gert::StorageShape expert_tokens_shape = {{16}, {16}}; +// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; +// gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; +// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; +// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; + +// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; + +// std::vector expert_tokens_const_value (16, 1); +// std::vector x_const_value (1024 * 5120, 2.f); +// std::vector bias2_value (16 * 5120, 3.f); +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .SetOpNameType("name", "tpye") +// .NodeIoNum(6, 1) +// .IrInstanceNum({1, 1, 1, 1, 1, 1}) +// .AddInputTd(0, 
ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) +// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) +// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) +// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, reinterpret_cast(expert_tokens_const_value.data())) +// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) +// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) +// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) +// .AddAttrs({ +// {"activation", ge::AnyValue::CreateFrom(active_type)}, +// {"inner_precise", ge::AnyValue::CreateFrom(1)} +// }) +// .TilingData(param.get()) +// .Workspace(ws_size) +// .Build(); + +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_NE(tiling_context, nullptr); + +// } + +// TEST_F(TestTiling, TestTilingContextBuildAddInputs) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; +// std::vector inputs; +// std::vector outputs; +// context_ascendc::TensorInfo input; +// input.shape = x_shape; +// input.dType = ge::DT_FLOAT16; +// input.oriFormat = ge::FORMAT_ND; +// input.format = ge::FORMAT_ND; +// input.dataPath = "1111"; +// inputs.push_back(input); +// context_ascendc::TensorInfo output; +// output.shape = x_shape; +// output.dType = ge::DT_FLOAT16; +// output.oriFormat = ge::FORMAT_ND; +// output.format = ge::FORMAT_ND; +// output.dataPath = "222"; +// outputs.push_back(output); + +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .SetOpNameType("name", "tpye") +// .NodeIoNum(1, 1) +// .IrInstanceNum({1}) +// 
.AddInputs(inputs) +// .AddOutputs(outputs) +// .AddAttrs({ +// {"activation", ge::AnyValue::CreateFrom(active_type)}, +// {"inner_precise", ge::AnyValue::CreateFrom(1)} +// }) +// .TilingData(param.get()) +// .Workspace(ws_size) +// .Build(); + +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_NE(tiling_context, nullptr); +// } + +// TEST_F(TestTiling, TestTilingContextBuildFailed) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{-1, 5120}, {-1, 5120}}; +// std::vector x_const_value (1024 * 5120, 2.f); +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .NodeIoNum(1, 1) +// .IrInstanceNum({1, 1}) +// .CompileInfo(nullptr) +// .PlatformInfo(nullptr) +// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) +// .Workspace(ws_size) +// .Build(); + +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_EQ(tiling_context, nullptr); +// } + +// TEST_F(TestTiling, TestTilingContextBuildWithBinFile) +// { +// string active_type = "gelu"; +// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; +// gert::StorageShape expert_tokens_shape = {{16}, {16}}; +// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; +// gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; +// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; +// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; +// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; + +// std::vector expert_tokens_const_value (16, 1); + +// std::vector x_const_value (1024 * 5120, 2.f); +// std::vector bias2_value (16 * 5120, 3.f); +// auto param = gert::TilingData::CreateCap(4096); +// auto workspace_size_holer = gert::ContinuousVector::Create(4096); +// 
auto ws_size = reinterpret_cast(workspace_size_holer.get()); +// auto holder = context_ascendc::TilingContextBuilder() +// .SetOpNameType("name", "tpye") +// .NodeIoNum(6, 1) +// .IrInstanceNum({1, 1, 1, 1, 1, 1}) +// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) +// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) +// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) +// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, "./expert_tokens_data.bin") +// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) +// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) +// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) +// .AddAttrs({ +// {"activation", ge::AnyValue::CreateFrom(active_type)}, +// {"inner_precise", ge::AnyValue::CreateFrom(1)} +// }) +// .TilingData(param.get()) +// .Workspace(ws_size) +// .Build(); +// gert::TilingContext* tiling_context = holder.GetContext(); +// EXPECT_EQ(tiling_context, nullptr); +// } + +TEST_F(TestTiling, TestAxpyTiling) +{ + uint32_t maxVal = 0; + uint32_t minVal = 0; + GetAxpyMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); + EXPECT_EQ(maxVal, 0); + EXPECT_EQ(minVal, 0); + GetAxpyMaxMinTmpSize(ge::Shape({256}), 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4 * 2); + EXPECT_EQ(minVal, 256 * 4); + GetAxpyMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); + EXPECT_EQ(maxVal, 256 * 4); + EXPECT_EQ(minVal, 256 * 4); + uint32_t extraBuf = 123; + uint32_t maxLivedNodesCnt = 123; + GetAxpyTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 1); + GetAxpyTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); + EXPECT_EQ(extraBuf, 0); + EXPECT_EQ(maxLivedNodesCnt, 4); +} + +TEST_F(TestTiling, TestCeilTilingFloat) 
+{ + std::vector shapeDims = { 128, 128 }; + auto ceilShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCeilMaxMinTmpSize(ceilShape, sizeof(float), false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 1); + EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetCeilTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 1); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestCeilTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto ceilShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); + EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetCeilTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestCeilTilingHalf512) +{ + std::vector shapeDims = { 512 }; + auto ceilShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +TEST_F(TestTiling, TestFloorTilingFloat) +{ + std::vector shapeDims = { 128, 128 }; + auto floorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFloorMaxMinTmpSize(floorShape, sizeof(float), false, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFloorTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 0); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFloorTilingHalf) +{ + std::vector shapeDims = { 128, 128 }; + auto floorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); 
+ EXPECT_EQ(minValue, 256 * 2); + EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); + + uint32_t maxLiveNodeCnt; + uint32_t extraBuf; + GetFloorTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); + EXPECT_EQ(maxLiveNodeCnt, 2); + EXPECT_EQ(extraBuf, 0); +} + +TEST_F(TestTiling, TestFloorTilingHalf512) +{ + std::vector shapeDims = { 512 }; + auto floorShape = ge::Shape(shapeDims); + uint32_t maxValue = 0; + uint32_t minValue = 0; + GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); + EXPECT_EQ(maxValue, 512 * 2 * 2); + EXPECT_EQ(minValue, 256 * 2); +} + +// TEST_F(TestTiling, TestGetSocVersion) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// platform_ascendc::SocVersion ret = plat.GetSocVersion(); +// EXPECT_EQ(ret, platform_ascendc::SocVersion::RESERVED_VERSION); +// } + +// TEST_F(TestTiling, TestCoreNum) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// uint32_t ret1 = plat.GetCoreNumAic(); +// uint32_t ret2 = plat.GetCoreNumAiv(); +// EXPECT_EQ(ret1, 0); +// EXPECT_EQ(ret2, 0); +// } + +// TEST_F(TestTiling, TestGetLibApiWorkSpaceSize) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// uint32_t ret1 = plat.GetLibApiWorkSpaceSize(); +// EXPECT_EQ(ret1, static_cast(-1)); +// } +// 
TEST_F(TestTiling, TestPlatformAscendCManager) +// { +// void *handle; +// int a = 7; +// handle = &a; + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); + +// auto ret2 = platform_ascendc::PlatformAscendCManager::GetInstance(); +// } + +// TEST_F(TestTiling, TestGetVectorCoreNum) +// { +// fe::PlatFormInfos platform_info; +// auto plat = platform_ascendc::PlatformAscendC(&platform_info); + +// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, +// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) +// .stubs() +// .will(returnValue(false)); +// MOCKER_CPP(&platform_ascendc::PlatformAscendC::GetSocVersion, +// platform_ascendc::SocVersion(platform_ascendc::PlatformAscendC::*)(void) const) +// .stubs() +// .will(returnValue(platform_ascendc::SocVersion::ASCEND310P)); + +// uint32_t ret1 = plat.GetCoreNumVector(); +// EXPECT_EQ(ret1, static_cast(0)); +// MOCKER_CPP(&platform_ascendc::PlatformAscendCManager::PlatformAscendCInit) +// .stubs() +// .will(returnValue(platform_info)); +// auto ret2 = platform_ascendc::PlatformAscendCManager::GetInstance(); + +// } + TEST_F(TestTiling, TestReGluFloat16OrBf16) { const std::vector srcShapeDims = { 8, 128 }; @@ -2915,4 +4603,234 @@ TEST_F(TestTiling, TestReGluFloat32) GetReGluMaxMinTmpSize(srcShape, 4, false, maxValue, minValue); EXPECT_EQ(minValue, 256); EXPECT_EQ(maxValue, 256); +} + +#if __CCE_AICORE__ == 220 +extern void platfrom_stub_set_chip_version(const char *num); +TEST_F(TestTiling, TestBroadCast220) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend910B"); + uint32_t firstDim = 32; + uint32_t lastDim = 32; + std::vector srcShapeDims = {firstDim, 1}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {firstDim, lastDim}; + auto 
dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t halfOneBlockElementNum = 16; + constexpr uint32_t minHalfAlignSize = halfOneBlockElementNum * halfOneBlockElementNum * halfSize; + constexpr uint32_t BRCB_ONE_SIZE = 8; + uint32_t firstDimAlignNum = (firstDim + BRCB_ONE_SIZE - 1) / BRCB_ONE_SIZE * BRCB_ONE_SIZE; + uint32_t maxHalfAlignSize = firstDimAlignNum * halfOneBlockElementNum * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize); + EXPECT_EQ(maxValue, maxHalfAlignSize); + + srcShapeDims = {firstDim, 1}; + srcShape = ge::Shape(srcShapeDims); + uint32_t lastDimNotAlign = 31; + dstShapeDims = {firstDim, lastDimNotAlign}; + dstShape = ge::Shape(dstShapeDims); + + uint32_t blockDimAlignBlockNum = (lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum; + uint32_t blockDimAlign = blockDimAlignBlockNum * halfOneBlockElementNum; + uint32_t minCopyTempBufferSize = halfOneBlockElementNum * blockDimAlign * halfSize; + auto minHalfNotAlignSize = minHalfAlignSize + minCopyTempBufferSize; + + uint32_t maxCopyTempBufferSize = firstDim * blockDimAlign * halfSize; + uint32_t maxHalfNotAlignValue = maxHalfAlignSize + maxCopyTempBufferSize; + + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfNotAlignSize); + EXPECT_EQ(maxValue, maxHalfNotAlignValue); + + constexpr uint32_t int8Size = 1; + srcShapeDims = {firstDim, 1}; + srcShape = ge::Shape(srcShapeDims); + dstShapeDims = {firstDim, lastDim}; + dstShape = ge::Shape(dstShapeDims); + const uint32_t alignSrcSize = + ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + uint32_t alignDstSize = + ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + uint32_t castTempBufferSize = 
(alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); + EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); + + srcShapeDims = {firstDim, 1}; + srcShape = ge::Shape(srcShapeDims); + dstShapeDims = {firstDim, lastDimNotAlign}; + dstShape = ge::Shape(dstShapeDims); + alignDstSize = + ((firstDim * lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfNotAlignSize + castTempBufferSize); + EXPECT_EQ(maxValue, maxHalfNotAlignValue + castTempBufferSize); +} +#endif + +#if __CCE_AICORE__ == 200 +extern void platfrom_stub_set_chip_version(const char *num); +TEST_F(TestTiling, TestLastBroadCast200) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend310P"); + uint32_t firstDim = 32; + uint32_t lastDim = 32; + std::vector srcShapeDims = {firstDim, 1}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {firstDim, lastDim}; + auto dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t halfOneBlockElementNum = 16; + constexpr uint32_t MAX_BLOCK_NUM = 8; + constexpr uint32_t ONE_BLOCK_SIZE = 32; + uint32_t minTmpBufferSize = + halfOneBlockElementNum * ((lastDim + MAX_BLOCK_NUM - 1) / MAX_BLOCK_NUM) * halfSize; + uint32_t minHalfAlignSize = ONE_BLOCK_SIZE + + minTmpBufferSize; + uint32_t maxHalfAlignSize = ONE_BLOCK_SIZE + firstDim * lastDim * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize); + EXPECT_EQ(maxValue, 
maxHalfAlignSize); + + constexpr uint32_t int8Size = 1; + const uint32_t alignSrcSize = + ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + const uint32_t alignDstSize = + ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; + const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); + EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); +} + +TEST_F(TestTiling, TestFirstBroadCast200) +{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend310P"); + uint32_t firstDim = 32; + uint32_t lastDim = 32; + std::vector srcShapeDims = {1, lastDim}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {firstDim, lastDim}; + auto dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t ONE_BLOCK_SIZE = 32; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, ONE_BLOCK_SIZE); + EXPECT_EQ(maxValue, ONE_BLOCK_SIZE); + + constexpr uint32_t int8Size = 1; + constexpr uint32_t HALF_ONE_BLK_SIZE = 16; + const uint32_t alignSrcSize = ((lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t alignDstSize = + ((firstDim * lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, ONE_BLOCK_SIZE + castTempBufferSize); + EXPECT_EQ(maxValue, ONE_BLOCK_SIZE + castTempBufferSize); +} + +TEST_F(TestTiling, TestOneElementBroadCast200) 
+{ + fe::PlatFormInfos platform_info; + auto plat = platform_ascendc::PlatformAscendC(&platform_info); + platfrom_stub_set_chip_version("Ascend310P"); + uint32_t srcDim = 1; + uint32_t dstDim = 32; + std::vector srcShapeDims = {srcDim}; + auto srcShape = ge::Shape(srcShapeDims); + std::vector dstShapeDims = {dstDim}; + auto dstShape = ge::Shape(dstShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); + + constexpr uint32_t int8Size = 1; + constexpr uint32_t HALF_ONE_BLK_SIZE = 16; + constexpr uint32_t ONE_BLOCK_SIZE = 32; + const uint32_t alignSrcSize = ((srcDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t alignDstSize = ((dstDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; + const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; + GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); + EXPECT_EQ(minValue, castTempBufferSize + ONE_BLOCK_SIZE); + EXPECT_EQ(maxValue, castTempBufferSize + ONE_BLOCK_SIZE); +} +#endif + +TEST_F(TestTiling, TestReduceXorSumTilingInt16) +{ + std::vector shapeDims = { 128, 128 }; + auto shape = ge::Shape(shapeDims); + uint32_t maxSize; + uint32_t minSize; + GetReduceXorSumMaxMinTmpSize(shape, 2, true, maxSize, minSize); + EXPECT_EQ(maxSize, 65536); + EXPECT_EQ(minSize, 65536); + + GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 98304); + EXPECT_EQ(minSize, 98304); + + shapeDims = { 8 }; + shape = ge::Shape(shapeDims); + GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); + EXPECT_EQ(maxSize, 768); + EXPECT_EQ(minSize, 768); + + GetReduceXorSumMaxMinTmpSize(shape, 2, true,maxSize, minSize); + EXPECT_EQ(maxSize, 512); + EXPECT_EQ(minSize, 512); +} + +TEST_F(TestTiling, TestCumSum) +{ + 
uint32_t firstDim = 32; + uint32_t lastDim = 16; + std::vector srcShapeDims = {firstDim, lastDim}; + auto srcShape = ge::Shape(srcShapeDims); + uint32_t maxValue{0}; + uint32_t minValue{0}; + constexpr uint32_t halfSize = 2; + constexpr uint32_t transDataTo5HDAddrListSize = 16; + uint32_t minHalfSize = transDataTo5HDAddrListSize * lastDim * 3 * sizeof(uint16_t); + uint32_t alignOutter = (firstDim + transDataTo5HDAddrListSize - 1) / transDataTo5HDAddrListSize * transDataTo5HDAddrListSize; + uint32_t maxHalfSize = alignOutter * lastDim * 3 * sizeof(uint16_t); + + GetCumSumMaxMinTmpSize(srcShape, halfSize, true, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfSize); + EXPECT_EQ(maxValue, maxHalfSize); + + constexpr uint32_t floatSize = 4; + uint32_t minFloatSize = transDataTo5HDAddrListSize * lastDim * 2 * sizeof(float); + uint32_t maxFloatSize = alignOutter * lastDim * 2 * sizeof(float); + + GetCumSumMaxMinTmpSize(srcShape, floatSize, true, false, maxValue, minValue); + EXPECT_EQ(minValue, minFloatSize); + EXPECT_EQ(maxValue, maxFloatSize); + + maxHalfSize = minHalfSize = firstDim * lastDim * sizeof(float); + GetCumSumMaxMinTmpSize(srcShape, halfSize, false, false, maxValue, minValue); + EXPECT_EQ(minValue, minHalfSize); + EXPECT_EQ(maxValue, maxHalfSize); + + + GetCumSumMaxMinTmpSize(srcShape, floatSize, false, false, maxValue, minValue); + EXPECT_EQ(minValue, 0); + EXPECT_EQ(maxValue, 0); } \ No newline at end of file -- Gitee From ea034752877efc88756265abb69ff848d210b53b Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 06:26:26 +0000 Subject: [PATCH 3/8] TBuf related api add Signed-off-by: jiangchengcheng-on --- .../cmatrix_buffer/matmul_l0c_buffer.h | 143 ++++++ .../modules/feature_trait/matmul_chip_cap.h | 71 +++ .../feature_trait/matmul_feature_trait.h | 40 ++ .../feature_trait/matmul_iter_ctrl_cfg.h | 30 ++ .../input_cache/matmul_input_l1_cache.h | 48 ++ .../iterator/matmul_iterate_controller.h | 124 +++++ 
impl/matmul/modules/matmul_module.h | 41 ++ impl/matmul/modules/matmul_params.h | 470 ++++++++++++++++++ impl/matmul/modules/matmul_type_def.h | 33 ++ 9 files changed, 1000 insertions(+) create mode 100644 impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h create mode 100644 impl/matmul/modules/feature_trait/matmul_chip_cap.h create mode 100644 impl/matmul/modules/feature_trait/matmul_feature_trait.h create mode 100644 impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h create mode 100644 impl/matmul/modules/input_cache/matmul_input_l1_cache.h create mode 100644 impl/matmul/modules/iterator/matmul_iterate_controller.h create mode 100644 impl/matmul/modules/matmul_module.h create mode 100644 impl/matmul/modules/matmul_params.h create mode 100644 impl/matmul/modules/matmul_type_def.h diff --git a/impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h b/impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h new file mode 100644 index 00000000..90f55078 --- /dev/null +++ b/impl/matmul/modules/cmatrix_buffer/matmul_l0c_buffer.h @@ -0,0 +1,143 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! 
+* \file matmul_unit_flag_params.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_L0C_BUFFER_H +#define IMPL_MATMUL_MODULES_MATMUL_L0C_BUFFER_H + +#include "../matmul_module.h" +#include "../matmul_param.h" + +namespace matmul { + +enum class UNIT_FLAG_CTRL : uint8_t { + DISABLE, + RESERVED, + CHECK, + SET, +}; + +template +class MatmulL0CBuffer +{ +public: + template + __aicore__ inline void InitL0CBuffer(uint32_t lenFactor = 1) + { + MATMUL_PARAM_VAR.tpipe_->InitBuffer(MATMUL_PARAM_VAR.CO1_, lenFactor * MATMUL_PARAM_VAR.baseMN_ * sizeof(L0cT)); + } + + template + __aicore__ inline LocalTensor LoadL0CTensor() + { + return MATMUL_PARAM_VAR.CO1_.template Get(); + } + + template + __aicore__ inline LocalTensor AllocL0CLocalTensor() + { + LocalTensor co1Local; + co1Local = MATMUL_PARAM_VAR.cMatrix_; + return co1Local; + } + + template + __aicore__ inline void FreeL0CLocalTensor(LocalTensor &co1Local) + {} + + __aicore__ inline void ResetL0CEventStatus() + { + event_t eventIDFixToM = static_cast(GetTPipePtr()->FetchEventID(HardEvent::FIX_M)); + SetFlag(eventIDFixToM); + WaitFlag(eventIDFixToM); + } + + __aicore__ inline bool IsL0CLastIter(int l0CIterNum) const + { + return l0CIterNum == MATMUL_CONST_PARAM_VAR.kIter_ - 1; + } + + __aicore__ inline uint8_t GetMmadUnitFlagCtrl(bool isFinalCompute) const + { + return static_cast(isFinalCompute ? 
UNIT_FLAG_CTRL::SET : UNIT_FLAG_CTRL::CHECK); + } + + template + __aicore__ inline void SetFixpipeUnitFlag(FIX_PIPE_PARAMS& fixpipeParams) + { + fixpipeParams.unitFlag = FIX_PIPE_UNIT_FLAG; + } + +private: + constexpr static uint8_t FIX_PIPE_UNIT_FLAG = 3; +}; + +template +class MatmulL0CBuffer +{ +public: + template + __aicore__ inline void InitL0CBuffer(uint32_t lenFactor = 1) + { + if (MATMUL_PARAM_VAR.tiling_->dbL0C == 2) { + MATMUL_PARAM_VAR.tpipe_->InitBuffer( + MATMUL_PARAM_VAR.CO1_, 2, lenFactor * MATMUL_PARAM_VAR.baseMN_ * sizeof(L0cT)); + } else { + MATMUL_PARAM_VAR.tpipe_->InitBuffer( + MATMUL_PARAM_VAR.CO1_, 1, lenFactor * MATMUL_PARAM_VAR.baseMN_ * sizeof(L0cT)); + } + } + + template + __aicore__ inline LocalTensor LoadL0CTensor() + { + return MATMUL_PARAM_VAR.CO1_.template AllocTensor(); + } + + template + __aicore__ inline LocalTensor AllocL0CLocalTensor() + { + LocalTensor co1Local; + MATMUL_PARAM_VAR.CO1_.EnQue(MATMUL_PARAM_VAR.cMatrix_); + co1Local = MATMUL_PARAM_VAR.CO1_.template DeQue(); + return co1Local; + } + + template + __aicore__ inline void FreeL0CLocalTensor(LocalTensor &co1Local) + { + MATMUL_PARAM_VAR.CO1_.FreeTensor(co1Local); + } + + __aicore__ inline void ResetL0CEventStatus() + { + MATMUL_PARAM_VAR.CO1_.FreeAllEvent(); + } + + __aicore__ inline bool IsL0CLastIter(int l0CIterNum) const + { + return false; + } + + __aicore__ inline uint8_t GetMmadUnitFlagCtrl(bool isFinalCompute) const + { + return static_cast(UNIT_FLAG_CTRL::DISABLE); + } + + template + __aicore__ inline void SetFixpipeUnitFlag(FIX_PIPE_PARAMS& fixpipeParams) + {} +}; + +} +#endif // _MATMUL_L0C_BUFFER_H_ \ No newline at end of file diff --git a/impl/matmul/modules/feature_trait/matmul_chip_cap.h b/impl/matmul/modules/feature_trait/matmul_chip_cap.h new file mode 100644 index 00000000..f88bc3e5 --- /dev/null +++ b/impl/matmul/modules/feature_trait/matmul_chip_cap.h @@ -0,0 +1,71 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_chip_cap.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H +#define IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H + +namespace matmul { + +class MatmulChipCap +{ +public: + struct Feature { + bool supportUnitFlag; + }; + + __aicore__ constexpr static const Feature& GetFeatures() + { + return features[GetChipType()]; + } + +private: + enum { + CHIP_TYPE_100, + CHIP_TYPE_200, + CHIP_TYPE_220, + CHIP_TYPE_300, + CHIP_TYPE_310, + CHIP_TYPE_MAX, + }; + + __aicore__ inline constexpr static uint8_t GetChipType() + { + #if __CCE_AICORE__ == 100 + return CHIP_TYPE_100; + #elif __CCE_AICORE__ == 200 + return CHIP_TYPE_200; + #elif __CCE_AICORE__ == 220 + return CHIP_TYPE_220; + #elif __CCE_AICORE__ == 300 + return CHIP_TYPE_300; + #elif __CCE_AICORE__ == 310 + return CHIP_TYPE_310; + #else + return CHIP_TYPE_MAX; + #endif + } + +private: + constexpr static Feature features[CHIP_TYPE_MAX] = { + /* supportUnitFlag */ + /*100*/ {false,}, + /*200*/ {false,}, + /*220*/ {true,}, + /*300*/ {true,}, + /*310*/ {true,} + }; +}; + +} +#endif // _MATMUL_CHIP_CAP_H_ \ No newline at end of file diff --git a/impl/matmul/modules/feature_trait/matmul_feature_trait.h b/impl/matmul/modules/feature_trait/matmul_feature_trait.h new file mode 100644 index 00000000..e69100f7 --- /dev/null +++ b/impl/matmul/modules/feature_trait/matmul_feature_trait.h @@ -0,0 +1,40 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_feature.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H +#define IMPL_MATMUL_MODULES_MATMUL_FEATURE_TRAIT_H + +#include "../../matmul_utils.h" +#include "matmul_chip_cap.h" +#include "matmul_iter_ctrl_cfg.h" + +namespace matmul { + +template +class MatmulFeatureTrait { +public: + static constexpr MatmulIterCtrlCfg iterCtrlCfg { + .isFixedStep = DoMatmulSpecialBasicBlock(MM_CFG), + .stepM = MM_CFG.stepM, + .stepN = MM_CFG.stepN, + .iterOrder = IterateOrder::UNDEF, + }; + + __aicore__ inline constexpr static bool IsUnitFlagEnabled() + { + return EnUnitFlag(MM_CFG) && MatmulChipCap::GetFeatures().supportUnitFlag; + } +}; +} +#endif // _MATMUL_FEATURE_TRAIT_H_ \ No newline at end of file diff --git a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h new file mode 100644 index 00000000..12a7c679 --- /dev/null +++ b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h @@ -0,0 +1,30 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. 
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_iter_ctrl_cfg.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H +#define IMPL_MATMUL_MODULES_MATMUL_ITER_CTRL_CFG_H + +#include "../../../../lib/matmul/tiling.h" + +namespace matmul { + +struct MatmulIterCtrlCfg { + bool isFixedStep; + int32_t stepM; + int32_t stepN; + IterateOrder iterOrder; +}; + +} +#endif // _MATMUL_ITER_CTRL_CFG_H_ \ No newline at end of file diff --git a/impl/matmul/modules/input_cache/matmul_input_l1_cache.h b/impl/matmul/modules/input_cache/matmul_input_l1_cache.h new file mode 100644 index 00000000..763c7f71 --- /dev/null +++ b/impl/matmul/modules/input_cache/matmul_input_l1_cache.h @@ -0,0 +1,48 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! 
+* \file matmul_input_l1_cache.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_INPUT_L1_CACHE_H +#define IMPL_MATMUL_MODULES_MATMUL_INPUT_L1_CACHE_H + +#include "../matmul_module.h" + +namespace matmul { + +template +class MatmulInputL1Cache +{ +public: + __aicore__ inline void ClearAL1Cache() + { + if constexpr (!PhyPosIsL1(A_TYPE::pos)) { + if (MATMUL_PARAM_VAR.cacheProcA_ > 0) { + MATMUL_PARAM_VAR.qidA1Cache_.FreeTensor(MATMUL_PARAM_VAR.cacheHeadA1_); + MATMUL_PARAM_VAR.cacheProcA_ = 0; + } + } + } + + __aicore__ inline void ClearBL1Cache() + { + if constexpr (!PhyPosIsL1(B_TYPE::pos)) { + if (MATMUL_PARAM_VAR.cacheProcB_ > 0) { + MATMUL_PARAM_VAR.qidB1Cache_.FreeTensor(MATMUL_PARAM_VAR.cacheHeadB1_); + MATMUL_PARAM_VAR.cacheProcB_ = 0; + } + } + } +}; + +} +#endif // _MATMUL_INPUT_L1_CACHE_H_ \ No newline at end of file diff --git a/impl/matmul/modules/iterator/matmul_iterate_controller.h b/impl/matmul/modules/iterator/matmul_iterate_controller.h new file mode 100644 index 00000000..24403997 --- /dev/null +++ b/impl/matmul/modules/iterator/matmul_iterate_controller.h @@ -0,0 +1,124 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_iterate_controller.h + * \brief + */ +#ifndef IMPL_MATMUL_MODULES_MATMUL_ITERATOR_CONTROLLER_H +#define IMPL_MATMUL_MODULES_MATMUL_ITERATOR_CONTROLLER_H + +#include "../matmul_module.h" +#include "../matmul_param.h" +#include "../feature_trait/matmul_iter_ctrl_cfg.h" +#include "../input_cache/matmul_input_l1_cache.h" + +namespace matmul { + +template +class MatmulIterateController +{ + MATMUL_USE_MODULE(MatmulInputL1Cache, A_TYPE, B_TYPE); + +public: + __aicore__ inline bool MoveNext() + { + if (unlikely(MATMUL_PARAM_VAR.isFirstIter_)) { + return MoveOnFirstIterate(); + } + if constexpr (ITER_CFG.iterOrder == IterateOrder::UNDEF) { + auto& var = MATMUL_PARAM_VAR; + if (likely(var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_M))) { + return MoveOnIterateOrderM(); + } else { + ASCENDC_ASSERT((var.tiling_->iterateOrder == static_cast(IterateOrder::ORDER_N)), { + KERNEL_LOG(KERNEL_ERROR, "iterateOrder is %d , which should be ORDER_N", + var.tiling_->iterateOrder); + }); + return MoveOnIterateOrderN(); + } + } else if (ITER_CFG.iterOrder == IterateOrder::ORDER_M) { + return MoveOnIterateOrderM(); + } else { + return MoveOnIterateOrderN(); + } + } + + __aicore__ inline void Reset() { + MATMUL_PARAM_VAR.isFirstIter_ = true; + } + +private: + __aicore__ inline bool MoveOnFirstIterate() + { + auto& var = MATMUL_PARAM_VAR; + var.isFirstIter_ = false; + var.curM_ = 0; + var.curN_ = 0; + var.stepMIdx_ = 0; + var.stepNIdx_ = 0; + var.curStepM_ = + (var.mIter_ - var.curM_) > var.tiling_->stepM ? + var.tiling_->stepM : (var.mIter_ - var.curM_); + var.curStepN_ = + (var.nIter_ - var.curN_) > var.tiling_->stepN ? 
+ var.tiling_->stepN : (var.nIter_ - var.curN_); + return true; + } + + __aicore__ inline bool MoveOnIterateOrderM() + { + auto& var = MATMUL_PARAM_VAR; + // Output along M axis + if (++var.curN_ >= var.stepNIdx_ + var.curStepN_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearAL1Cache(); + var.curN_ = var.stepNIdx_; + if (++var.curM_ >= var.mIter_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearBL1Cache(); + var.curM_ = 0; + var.stepNIdx_ += var.curStepN_; + if (var.stepNIdx_ >= var.nIter_) { + return false; + } + var.curN_ = var.stepNIdx_; + var.curStepN_ = + (var.nIter_ - var.curN_) > var.tiling_->stepN ? + var.tiling_->stepN : (var.nIter_ - var.curN_); + } + } + return true; + } + + __aicore__ inline bool MoveOnIterateOrderN() + { + auto& var = MATMUL_PARAM_VAR; + if (++var.curM_ >= var.stepMIdx_ + var.curStepM_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearBL1Cache(); + var.curM_ = var.stepMIdx_; + if (++var.curN_ >= var.nIter_) { + MATMUL_MODULE(MatmulInputL1Cache).ClearAL1Cache(); + var.curN_ = 0; + var.stepMIdx_ += var.curStepM_; + if (var.stepMIdx_ >= var.mIter_) { + return false; + } + var.curM_ = var.stepMIdx_; + var.curStepM_ = + (var.mIter_ - var.curM_) > var.tiling_->stepM ? + var.tiling_->stepM : (var.mIter_ - var.curM_); + } + } + return true; + } +}; + +} + +#endif \ No newline at end of file diff --git a/impl/matmul/modules/matmul_module.h b/impl/matmul/modules/matmul_module.h new file mode 100644 index 00000000..dd151420 --- /dev/null +++ b/impl/matmul/modules/matmul_module.h @@ -0,0 +1,41 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. 
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_entity_macro.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_ENTITY_MACRO_H +#define IMPL_MATMUL_MODULES_MATMUL_ENTITY_MACRO_H + +#define MATMUL_ENTITY ENTITY + +#define MATMUL_ENTITY_IMPL_TYPE MatmulImpl + +#define MATMUL_IMPORT_MODULE(NAME, ...) private NAME + +#define MATMUL_MODULE_NAME(NAME) NAME##Module + +#define MATMUL_USE_MODULE_OF(NAME, ENTITY, ...) using MATMUL_MODULE_NAME(NAME) = NAME +#define MATMUL_USE_MODULE(NAME, ...) MATMUL_USE_MODULE_OF(NAME, MATMUL_ENTITY, ##__VA_ARGS__) + +#define MATMUL_USE_IMPORTED_MODULE(NAME, ...) \ +MATMUL_USE_MODULE_OF(NAME, MATMUL_ENTITY_IMPL_TYPE, ##__VA_ARGS__); \ +friend class NAME + +#define MATMUL_PARAMS_OF(ENTITY) static_cast(this)->var +#define MATMUL_PARAM_VAR MATMUL_PARAMS_OF(MATMUL_ENTITY) + +#define MATMUL_CONST_PARAMS_OF(ENTITY) ((const ENTITY*)(this))->var +#define MATMUL_CONST_PARAM_VAR MATMUL_CONST_PARAMS_OF(MATMUL_ENTITY) + +#define MATMUL_MODULE(NAME) (*static_cast(static_cast(this))) + +#endif // _MATMUL_ENTITY_MACRO_H_ \ No newline at end of file diff --git a/impl/matmul/modules/matmul_params.h b/impl/matmul/modules/matmul_params.h new file mode 100644 index 00000000..d114f1b8 --- /dev/null +++ b/impl/matmul/modules/matmul_params.h @@ -0,0 +1,470 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file matmul_impl.h + * \brief + */ +#ifndef IMPL_MATMUL_MODULES_PARAMS_H +#define IMPL_MATMUL_MODULES_PARAMS_H + +#include "kernel_macros.h" + +#include "lib/matmul/tiling.h" +#include "kernel_operator.h" +#include "../matmul_utils.h" +#include "matmul_type_def.h" + +namespace matmul { +/* ************************************************************************************************** + * MatmulParamsBase * + * ************************************************************************************************* */ +template +struct MatmulParamsBase { + __aicore__ inline MatmulParamsBase() {}; +}; + +template +struct MatmulParamsNorm : public MatmulParamsBase { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulParamsNorm() {}; + using SrcT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + TQue qidBias_; + typename L0cType::BUFFER CO1_; +#if __CCE_AICORE__ < 220 + TQue qidA2_; + TQue qidB2_; + TQue qidVecIn_; + TQue qidCO2_; + + typename QidType::QUE qidA1_; + typename QidType::QUE qidB1_; + typename QidType::QUE qidA1Cache_; + typename QidType::QUE qidB1Cache_; +#else + TQue qidA1_; + TQue qidB1_; + TQue qidA1Cache_; + TQue qidB1Cache_; +#endif + + LocalTensor cMatrix_; + + LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ + LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ + LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ + + SrcT aScalar_; + SrcT bScalar_; + DEBUG_CODE(int calCount_ = 0); + + TBuffAddr leftMatrix_; + TBuffAddr rightMatrix_; + TBuffAddr inputBias_; + + __gm__ SrcT* aGlobal_; + 
__gm__ SrcBT* bGlobal_; + __gm__ BiasT* biasGlobal_; + + TPipe* tpipe_; + const TCubeTiling* __restrict tiling_; + __gm__ uint8_t* cacheWorkspaceAddr; + +#if __CCE_AICORE__ < 220 + __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; + LocalTensor localWorkspace; + int nd2nz0ffset = 0; + int transOffset = 0; + int co2Offset = 0; +#endif + + int singleCoreM_; + int singleCoreN_; + int singleCoreK_; + // iterate nums in mnk axis + int mIter_; + int nIter_; + int kIter_; + + // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases + // measured in element + int baseUseM_; + int baseUseK_; + int baseUseN_; + // measured in cube block + int blockUseM_; + int blockUseK_; + int blockUseN_; + + int32_t cacheProcA_, cacheProcB_; + bool isFirstIter_; + bool isTransposeA_; // whether A matrix need to transpose + bool isTransposeB_; // whether B matrix need to transpose + // whether enbale bias, default value is false + bool enableBias_; + + int tailM_, tailK_, tailN_; + // current c matrix coordinate + int curM_, curN_; + // current c matrix step size, there could be tail steps + int curStepM_, curStepN_; + // current c matrix step block coordinate + int stepMIdx_, stepNIdx_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + uint8_t subBlockIdx_; + + int baseMK_; + int baseKN_; + int baseMN_; + + int cacheA1Size_, cacheB1Size_; + int depthA1_, depthB1_; +#if __CCE_AICORE__ >= 220 + int sMadMStep_ = 0; + int sMadNStep_ = 0; +#endif + uint64_t dataPtr_; + uint64_t tilingPtr_; +}; + +template +struct MatmulParamsNormQuant : public MatmulParamsNorm { + __aicore__ inline MatmulParamsNormQuant() {}; + TQue qidFixPipe_; + uint64_t quantScalar_ = 0; + GlobalTensor quantTensor_; + // 0: no quant, 1: deqf16, 2: vdeqf16, 3: QF322B8_PRE, 4: VQF322B8_PRE, 5: REQ8(s32->u8/s8), 6: VREQ8(s32->u8/s8) + uint8_t quantMode_ = 0; +}; + +template +struct MatmulParamsMDL : public MatmulParamsBase { + using L0cT = typename GetDstType::Type; + 
__aicore__ inline MatmulParamsMDL() {}; + using SrcT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + + TQue qidBias_; + TQue qidFixPipe_; + typename L0cType::BUFFER CO1_; + TQue qidA1_; + TQue qidB1_; +#if __CCE_AICORE__ < 220 + TQue qidA2_; + TQue qidB2_; + TQue qidVecIn_; + TQue qidCO2_; + + typename QidType::QUE qidA12UBCache_; + typename QidType::QUE qidB12UBCache_; +#endif + + LocalTensor cMatrix_; + + LocalTensor cacheA1Ping_; + LocalTensor cacheA1Pong_; + LocalTensor cacheB1Ping_; + LocalTensor cacheB1Pong_; + bool cacheA1IsCachingPing_; + bool cacheA1IsCachingPong_; + bool cacheB1IsCachingPing_; + bool cacheB1IsCachingPong_; + + DEBUG_CODE(int calCount_ = 0); + + TBuffAddr leftMatrix_; + TBuffAddr rightMatrix_; + TBuffAddr inputBias_; + + __gm__ SrcT* aGlobal_; + __gm__ SrcBT* bGlobal_; + __gm__ BiasT* biasGlobal_; + + TPipe* tpipe_; + const TCubeTiling* __restrict tiling_; + __gm__ uint8_t* cacheWorkspaceAddr; + +#if __CCE_AICORE__ < 220 + __ubuf__ uint8_t* cacheUBWorkspaceAddr = nullptr; + LocalTensor localWorkspace; + LocalTensor cacheHeadA12UB_; // Allocate and release using qidA12UBCache_ + LocalTensor cacheHeadB12UB_; // Allocate and release using qidB12UBCache_ + int nd2nz0ffset = 0; + int transOffset = 0; + int co2Offset = 0; + int32_t cacheA12UBProcA_ = 0; + int32_t cacheB12UBProcB_ = 0; +#endif + + int singleCoreM_; + int singleCoreN_; + int singleCoreK_; + // iterate nums in mnk axis + int mIter_; + int nIter_; + int kIter_; + // iterate nums in mn step axis + int mStepIter_; + int nStepIter_; + int kaStepIter_; + int kbStepIter_; + int kStepIter_; + int minStepK_; + int kaStepFactor_; + int kbStepFactor_; + + // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases + // in unit of element + int baseUseM_; + int baseUseK_; + int baseUseN_; + // in unit of cube block + int blockUseM_; + int blockUseK_; + 
int blockUseN_; + + // in unit of element + int baseUseStepM_; + int baseUseStepN_; + int baseUseStepKa_; + int baseUseStepKb_; + // in unit of cube block + int blockUseStepM_; + int blockUseStepN_; + int blockUseStepKa_; + int blockUseStepKb_; + + bool isFirstIter_; + bool isTransposeA_; // whether A matrix need to transpose + bool isTransposeB_; // whether B matrix need to transpose + // whether enbale bias, default value is false + bool enableBias_; + + // in unit of element + int tailM_, tailK_, tailN_; + // in unit of element + int tailStepM_, tailStepN_, tailStepKa_, tailStepKb_; + // current c matrix coordinate, in unit of baseMN + int curM_, curN_; + // current c matrix step size, in unit of baseMNK , there could be tail steps + int curStepM_, curStepN_; + // current c matrix step block coordinate, in unit of stepMNK + int stepMIdx_, stepNIdx_, stepKaIdx_, stepKbIdx_; + + // stepKa == kIter + bool isA1KFullLoad_, isB1KFullLoad_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + uint8_t subBlockIdx_; + + int baseMK_; + int baseKN_; + int baseMN_; + int cacheA1Factor_, cacheB1Factor_; + uint64_t quantScalar_ = 0; + uint64_t dataPtr_; + uint64_t tilingPtr_; + GlobalTensor quantTensor_; + // 0: no quant, 1: deqf16, 2: vdeqf16; + uint8_t quantMode_ = 0; + // anti quant param. 
+ SrcT antiQuantOffsetScalar_; + SrcT antiQuantScaleScalar_; + LocalTensor antiQuantOffsetTensor_; + LocalTensor antiQuantScaleTensor_; +}; + +template +struct MatmulParamsBasicBlock : public MatmulParamsNorm { + __aicore__ inline MatmulParamsBasicBlock() {}; +}; + +template +struct MatmulParamsIBShareNorm : public MatmulParamsBase { + using L0cT = typename GetDstType::Type; + __aicore__ inline MatmulParamsIBShareNorm() {}; + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + TQue qidBias_; + typename L0cType::BUFFER CO1_; + + TQue qidA2_; + TQue qidB2_; + TQue qidVecIn_; + TQue qidCO2_; + + typename QidType::QUE qidA1_; + typename QidType::QUE qidA1Cache_; + typename QidType::QUE qidB1_; + typename QidType::QUE qidB1Cache_; + + LocalTensor cMatrix_; + + LocalTensor cacheHeadA1_; // Allocate and release using qidA1Cache_ + LocalTensor cacheHeadB1_; // Allocate and release using qidB1Cache_ + LocalTensor cacheHeadBias_; // Allocate and release using qidBias_ + + SrcT aScalar_; + SrcT bScalar_; + DEBUG_CODE(int calCount_ = 0); + + TBuffAddr leftMatrix_; + TBuffAddr rightMatrix_; + TBuffAddr inputBias_; + + __gm__ SrcT* aGlobal_; + __gm__ SrcT* bGlobal_; + __gm__ BiasT* biasGlobal_; + + TPipe* tpipe_; + const TCubeTiling* __restrict tiling_; + __gm__ uint8_t* cacheWorkspaceAddr; + + int singleCoreM_; + int singleCoreN_; + int singleCoreK_; + // iterate nums in mnk axis + int mIter_; + int nIter_; + int kIter_; + + // baseUseX_ is the same as baseX in most cases, while it will be smaller than baseX when dealing with tail cases + // measured in element + int baseUseM_; + int baseUseK_; + int baseUseN_; + // measured in cube block + int blockUseM_; + int blockUseK_; + int blockUseN_; + + int32_t cacheProcA_, cacheProcB_; + bool isFirstIter_; + bool isTransposeA_; // whether A matrix need to transpose + bool isTransposeB_; // whether B matrix need to transpose + // whether enbale bias, default value is false + bool 
enableBias_; + + int tailM_, tailK_, tailN_; + // current c matrix coordinate + int curM_, curN_; + // current c matrix step size, there could be tail steps + int curStepM_, curStepN_; + // current c matrix step block coordinate + int stepMIdx_, stepNIdx_; + + bool enHF32Mode_; + int32_t hf32TransMode_; + uint8_t subBlockIdx_; + + int baseMK_; + int baseKN_; + int baseMN_; + + int cacheA1Size_, cacheB1Size_; + int depthA1_, depthB1_; + uint64_t dataPtr_; + uint64_t tilingPtr_; + + int curCacheIdx_; + GlobalCache gL1GroupCache0_; + GlobalCache gL1GroupCache1_; +}; + +/* ************************************************************************************************** + * MatmulParams * + * ************************************************************************************************* */ +template +struct MatmulParams { + __aicore__ inline MatmulParams(){}; +}; + +// CFG_NORM +#if __CCE_AICORE__ >= 220 +template +struct MatmulParams::value || + IsSameType::value) && + IsSameType::value) || + (IsSameType::value && + (IsSameType::value || + IsSameType::value)))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNorm; +}; +#else +template +struct MatmulParams::value && IsSameType::value) || + (IsSameType::value && IsSameType::value))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNorm; +}; +#endif + +#if __CCE_AICORE__ >= 220 +template +struct MatmulParams::value || + IsSameType::value) && + IsSameType::value) || + (IsSameType::value && + (IsSameType::value || + IsSameType::value)))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNormQuant; +}; +#else +template +struct MatmulParams::value && IsSameType::value) || + (IsSameType::value && IsSameType::value))>::type> { + __aicore__ inline MatmulParams(){}; + using PARAMS = MatmulParamsNormQuant; +}; +#endif + +// CFG_MDL +template +struct MatmulParams { + __aicore__ inline MatmulParams() {}; + using PARAMS = MatmulParamsMDL; +}; + +// 
MM_CFG_BB +template +struct MatmulParams { + __aicore__ inline MatmulParams() {}; + using PARAMS = MatmulParamsBasicBlock; +}; + +// CFG_IBSHARE_NORM +template +struct MatmulParams { + __aicore__ inline MatmulParams() {}; + using PARAMS = MatmulParamsIBShareNorm; +}; + +} + +#endif \ No newline at end of file diff --git a/impl/matmul/modules/matmul_type_def.h b/impl/matmul/modules/matmul_type_def.h new file mode 100644 index 00000000..88992530 --- /dev/null +++ b/impl/matmul/modules/matmul_type_def.h @@ -0,0 +1,33 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_type_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H +#define IMPL_MATMUL_MODULES_MATMUL_TYPE_DEF_H + +#include "lib/matmul/tiling.h" + +namespace matmul { +template +struct MatmulType { + constexpr static TPosition pos = POSITION; + constexpr static CubeFormat format = FORMAT; + using T = TYPE; + constexpr static bool isTrans = ISTRANS; + constexpr static LayoutMode layout = LAYOUT; + constexpr static bool ibShare = IBSHARE; +}; + +} +#endif // _MATMUL_TYPE_DEF_H_ \ No newline at end of file -- Gitee From 0a84b319e90f0cc78162bb2b78149980c6c2c999 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 06:39:38 +0000 Subject: [PATCH 4/8] rename impl/matmul/modules/matmul_params.h to impl/matmul/modules/matmul_param.h. 
Signed-off-by: jiangchengcheng-on --- impl/matmul/modules/{matmul_params.h => matmul_param.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename impl/matmul/modules/{matmul_params.h => matmul_param.h} (100%) diff --git a/impl/matmul/modules/matmul_params.h b/impl/matmul/modules/matmul_param.h similarity index 100% rename from impl/matmul/modules/matmul_params.h rename to impl/matmul/modules/matmul_param.h -- Gitee From e14d1c3fce56310bc60d63ea47c569a1975dbf64 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 06:55:32 +0000 Subject: [PATCH 5/8] del fmod not support Signed-off-by: jiangchengcheng-on --- tests/tiling/test_tiling.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index d9cfa951..f2f78fe3 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -1194,28 +1194,6 @@ TEST_F(TestTiling, TestSwiGLUFactorHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestFmodTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto fmodShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFmodMaxMinTmpSize(fmodShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); -} - -TEST_F(TestTiling, TestFmodTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto fmodShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFmodMaxMinTmpSize(fmodShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 128 * 3 * 4); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); -} - TEST_F(TestTiling, TestTruncTilingFloat) { std::vector shapeDims = { 128, 128 }; -- Gitee From c9e41e9946875ae6c0ff11157dee69841fdea7a1 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 07:04:11 +0000 Subject: [PATCH 6/8] add llt macro def Signed-off-by: jiangchengcheng-on --- tests/matmul/matmul_module_test_def.h | 25 
+++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 tests/matmul/matmul_module_test_def.h diff --git a/tests/matmul/matmul_module_test_def.h b/tests/matmul/matmul_module_test_def.h new file mode 100644 index 00000000..8448a21d --- /dev/null +++ b/tests/matmul/matmul_module_test_def.h @@ -0,0 +1,25 @@ +/** +* Copyright (c) 2024 Huawei Technologies Co., Ltd. +* This file is a part of the CANN Open Software. +* Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +* Please refer to the License for details. You may not use this file except in compliance with the License. +* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +* See LICENSE in the root of the software repository for the full text of the License. +*/ + +/*! +* \file matmul_module_test_def.h +* \brief +*/ +#ifndef IMPL_MATMUL_MODULES_MATMUL_MODULE_TEST_DEF_H +#define IMPL_MATMUL_MODULES_MATMUL_MODULE_TEST_DEF_H + + +#ifdef MATMUL_IMPORT_MODULE +#undef MATMUL_IMPORT_MODULE +#define MATMUL_IMPORT_MODULE(NAME, ...) 
public NAME +#endif + + +#endif // IMPL_MATMUL_MODULES_MATMUL_MODULE_TEST_DEF_H \ No newline at end of file -- Gitee From f0733f1e511d3d65f5524262bbaee2d0a76c0ef6 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 07:27:49 +0000 Subject: [PATCH 7/8] restore for tiling api Signed-off-by: jiangchengcheng-on --- tests/tiling/test_tiling.cpp | 1924 +--------------------------------- 1 file changed, 14 insertions(+), 1910 deletions(-) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index f2f78fe3..219b8605 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -77,25 +77,6 @@ TEST_F(TestTiling, PlatformConstructor) EXPECT_EQ(ret, 0); } -TEST_F(TestTiling, TestInt4BaseK) -{ - matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, - .l0CSize = 131072, .ubSize = 196608, .l0ASize = 65536, .l0BSize = 65536}; - MatmulApiTiling tiling(plat); - tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); - tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); - tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); - tiling.SetBias(false); - tiling.SetShape(144, 256, 32); - tiling.SetOrgShape(144, 256, 32); - tiling.SetBufferSpace(256 * 1024, 128 * 1024, -1); - optiling::TCubeTiling tilingData; - int ret = tiling.GetTiling(tilingData); - tiling.PrintTilingData(); - EXPECT_EQ(tilingData.get_baseK() % 64, 0); - EXPECT_EQ(ret, 0); -} - TEST_F(TestTiling, Tiling_310p_NotAligned) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND310P, .l1Size = 1048576, @@ -742,105 +723,6 @@ TEST_F(TestTiling, TestSetBufferSpace) EXPECT_EQ(tiling.bufferPool_.l1Size, 1024); } -TEST_F(TestTiling, TestCosTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - 
AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 3); - AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetCosTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestCosTilingFloat512) -{ - std::vector shapeDims = { 512 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetCosMaxMinTmpSize(cosShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - AscendC::GetCosMaxMinTmpSize(cosShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestCosTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto cosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetCosMaxMinTmpSize(cosShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); - EXPECT_EQ(minValue, 256 * 8); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetCosTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAtanTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAtanMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 5); - EXPECT_EQ(minValue, 256 * 5); -} - -TEST_F(TestTiling, TestAtanTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAtanMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 12); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 12); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - 
GetAtanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 12); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestClampTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetClampMaxMinTmpSize(atanShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 1); - EXPECT_EQ(minValue, 64 * 1); -} - -TEST_F(TestTiling, TestClampTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto atanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetClampMaxMinTmpSize(atanShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1); - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - GetClampTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TestSoftMaxTiling) { std::vector shapeDims = { 128, 128 }; @@ -954,6 +836,7 @@ TEST_F(TestTiling, TestSoftMaxFlashV2Tiling) SoftMaxFlashV2TilingFunc(softmaxShape, inputTypeSize, maxSumTypeSize, workLength, tilingData, true, true); EXPECT_EQ(tilingData.get_reduceM(), 64); } + TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) { std::vector shapeDims = { 8, 1024 }; @@ -982,163 +865,6 @@ TEST_F(TestTiling, TestSoftMaxFlashV2TilingBasicBlock) EXPECT_EQ(tilingData.get_reduceM(), 8); } -TEST_F(TestTiling, TestAsinTmpBufferFacotrHalfWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAsinTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 6); - EXPECT_EQ(extraBuffer, 0); -} - -TEST_F(TestTiling, TestAsinTmpBufferFacotrFloatWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAsinTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 2); - EXPECT_EQ(extraBuffer, 0); -} - -TEST_F(TestTiling, 
TestAsinTilingHalf128) -{ - std::vector shapeDims = { 128 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 256 * 6); - EXPECT_EQ(minValue, 256 * 6); -} - -TEST_F(TestTiling, TestAsinTilingFloat) -{ - std::vector shapeDims = { 32 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 256 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestAsinTilingHalf16K) -{ - std::vector shapeDims = { 128, 128 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); - EXPECT_EQ(minValue, 256 * 6); -} - -TEST_F(TestTiling, TestAsinTilingFloat16K) -{ - std::vector shapeDims = { 128, 128 }; - auto asinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAsinMaxMinTmpSize(asinShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestSinhTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto sinhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinhMaxMinTmpSize(sinhShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSinhTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto sinhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinhMaxMinTmpSize(sinhShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 4); - 
EXPECT_EQ(maxValue, 128 * 128 * 4 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestRoundTiling) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - std::vector shapeDims = { 128, 128 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - uint32_t maxLiveNodeCnt = 0; - uint32_t extraBuf = 0; - platform_ascendc::SocVersion socVersion = plat.GetSocVersion(); - GetRoundMaxMinTmpSize(plat, tanShape, 4, false, maxValue, minValue); - GetRoundTmpBufferFactorSize(plat, 4, maxLiveNodeCnt, extraBuf); - GetRoundMaxMinTmpSize(plat, tanShape, 2, false, maxValue, minValue); - GetRoundTmpBufferFactorSize(plat, 2, maxLiveNodeCnt, extraBuf); -} - -TEST_F(TestTiling, TestTanTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 4); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTanTilingFloat512) -{ - std::vector shapeDims = { 512 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTanMaxMinTmpSize(tanShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 4); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTanTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto tanShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTanMaxMinTmpSize(tanShape, 2, false, maxValue, minValue); 
- EXPECT_EQ(maxValue, 128 * 128 * 10 * 2); - EXPECT_EQ(minValue, 256 * 10); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTanTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 10); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TEstSwiGLUTilingHalf) { std::vector shapeDims = {10, 512}; @@ -1194,110 +920,6 @@ TEST_F(TestTiling, TestSwiGLUFactorHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestTruncTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto truncShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTruncMaxMinTmpSize(truncShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTruncTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTruncTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto truncShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetTruncTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestTruncTilingHalf512) -{ - std::vector shapeDims = { 512 }; - auto truncShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetTruncMaxMinTmpSize(truncShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 2 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestAcosTmpBufferFacotrHalfWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAcosTmpBufferFactorSize(2, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 6); - EXPECT_EQ(extraBuffer, 0); -} - - -TEST_F(TestTiling, 
TestAcosTmpBufferFacotrFloatWithoutBasicBlock) { - uint32_t maxLivedNodes = 0xffff; - uint32_t extraBuffer = 0xffff; - GetAcosTmpBufferFactorSize(4, maxLivedNodes, extraBuffer); - EXPECT_EQ(maxLivedNodes, 2); - EXPECT_EQ(extraBuffer, 0); -} - -TEST_F(TestTiling, TestAcosTilingHalf128) -{ - std::vector shapeDims = { 128 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 6); - EXPECT_EQ(maxValue, 256 * 6); -} - -TEST_F(TestTiling, TestAcosTilingFloat) -{ - std::vector shapeDims = { 32 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 256 * 2); -} - -TEST_F(TestTiling, TestTanhTiling) -{ - uint32_t maxVal = 0; - uint32_t minVal = 0; - GetTanhMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 128 * 4 * 1); - EXPECT_EQ(minVal, 256 * 1); - GetTanhMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4); - EXPECT_EQ(minVal, 256 * 4); - uint32_t extraBuf = 123; - uint32_t maxLivedNodesCnt = 123; - GetTanhTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 1); - GetTanhTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 4); -} - TEST_F(TestTiling, TestSigmoidTiling) { std::vector shapeDims = { 128 }; @@ -1309,329 +931,28 @@ TEST_F(TestTiling, TestSigmoidTiling) EXPECT_EQ(minVal, 256); } -TEST_F(TestTiling, TestLogTilingMaxMin) -{ - std::vector shapeDims = { 128 }; - auto logShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - GetLogMaxMinTmpSize(logShape, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetLog2MaxMinTmpSize(logShape, 4, false, maxVal, minVal); - 
EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetLog2MaxMinTmpSize(logShape, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 4 * 128); - EXPECT_EQ(minVal, 256); - GetLog10MaxMinTmpSize(logShape, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); -} - -TEST_F(TestTiling, TestLogTilingFactor) -{ - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetLogTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetLog10TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); - GetLog2TmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); - GetLog2TmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestPowerTiling) -{ - std::vector shapeDims = { 512 }; - auto powerShape = ge::Shape(shapeDims); - uint32_t maxVal; - uint32_t minVal; - GetPowerMaxMinTmpSize(powerShape, powerShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 4 + 256); - EXPECT_EQ(minVal, 256 * 4 + 256); - GetPowerMaxMinTmpSize(powerShape, powerShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 6); - EXPECT_EQ(minVal, 256 * 6); - GetPowerMaxMinTmpSize(powerShape, powerShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - std::vector scalar_shape = { 1 }; - auto scalarShape = ge::Shape(scalar_shape); - GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 2 * 14 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - GetPowerMaxMinTmpSize(powerShape, scalarShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 7); - EXPECT_EQ(minVal, 256 * 7); - GetPowerMaxMinTmpSize(powerShape, scalarShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 512 * 4 * 5 + 256); - EXPECT_EQ(minVal, 256 * 5 + 
256); - - std::vector shape1 = { 16 }; - auto powerShape1 = ge::Shape( shape1 ); - GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 5 + 256); - EXPECT_EQ(minVal, 256 * 5 + 256); - GetPowerMaxMinTmpSize(powerShape1, scalarShape, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 7 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - GetPowerMaxMinTmpSize(powerShape1, scalarShape, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 7); - EXPECT_EQ(minVal, 256 * 7); - GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4 + 256); - EXPECT_EQ(minVal, 256 * 4 + 256); - GetPowerMaxMinTmpSize(powerShape1, powerShape1, false, 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 7 + 256); - EXPECT_EQ(minVal, 256 * 7 + 256); - GetPowerMaxMinTmpSize(powerShape1, powerShape1, true, 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 6); - EXPECT_EQ(minVal, 256 * 6); -} - -TEST_F(TestTiling, TestPowerTilingFactorSize) +TEST_F(TestTiling, TestLayernormTiling) { - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetPowerTmpBufferFactorSize(false, true, false, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 5); - EXPECT_EQ(extraBuf, 256); - GetPowerTmpBufferFactorSize(false, true, true, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 7); - EXPECT_EQ(extraBuf, 0); - GetPowerTmpBufferFactorSize(false, true, false, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 14); - EXPECT_EQ(extraBuf, 256); - GetPowerTmpBufferFactorSize(true, true, false, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 256); - GetPowerTmpBufferFactorSize(true, true, true, 4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 6); - EXPECT_EQ(extraBuf, 0); - GetPowerTmpBufferFactorSize(true, true, false, 2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 14); - EXPECT_EQ(extraBuf, 256); -} + const 
uint32_t stackBufferSize = 100 * 1024; + const uint32_t typeSize = 4; -TEST_F(TestTiling, TestAcosTilingHalf16K) -{ - std::vector shapeDims = { 128, 128 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 6 * 2); - EXPECT_EQ(minValue, 256 * 6); -} + std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; + auto layernormShape = ge::Shape(shapeDims); + const bool isReuseSource = false; + optiling::LayerNormTiling tilling; -TEST_F(TestTiling, TestAcosTilingFloat16K) -{ - std::vector shapeDims = { 128, 128 }; - auto acosShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; uint32_t minValue = 0; - GetAcosMaxMinTmpSize(acosShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestAsinhTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto asinhShape = ge::Shape(shapeDims); uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAsinhMaxMinTmpSize(asinhShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); - AscendC::GetAsinhMaxMinTmpSize(ge::Shape({32}), 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - EXPECT_EQ(maxValue, 256 * 3); + AscendC::GetLayerNormMaxMinTmpSize(layernormShape, typeSize, isReuseSource, maxValue, minValue); + EXPECT_EQ(maxValue, 3 * (128 * 128 * 128) * typeSize + 2 * (128 * 128) * typeSize); + EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAsinhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); + AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); + EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); } -TEST_F(TestTiling, 
TestAsinhTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto asinhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAsinhMaxMinTmpSize(asinhShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 3); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAsinhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAcoshTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto acoshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAcoshMaxMinTmpSize(acoshShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetAcoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAcoshTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto acoshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAcoshMaxMinTmpSize(acoshShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAcoshTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxis) -{ - const auto shape = ge::Shape({ 8, 128 }); - const auto scalarShape = ge::Shape({1}); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 
+ 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxisLargeShape) -{ - const auto shape = ge::Shape({ 128, 128 }); - const auto scalarShape = ge::Shape({1}); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingSameAxisSmallShape) -{ - const auto shape = ge::Shape({ 1, 16 }); - const auto scalarShape = ge::Shape({1}); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(shape, scalarShape, 2, shape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, shape, 2, shape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxis) -{ - const auto srcShape = ge::Shape({ 8, 128 }); - const auto scalarShape = ge::Shape({1}); - const auto maskShape = ge::Shape({ 8, 160 }); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512 + 8 * 128); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512 + 8 * 128); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 128 * 8 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 8 * 2 + 512); -} - -TEST_F(TestTiling, 
TestSelectWithBytesMaskTilingDiffAxisLargeShape) -{ - const auto srcShape = ge::Shape({ 128, 128 }); - const auto scalarShape = ge::Shape({1}); - const auto maskShape = ge::Shape({ 128, 160 }); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512 + 128 * 128); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512 + 128 * 128); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 4096 * 2 + 512); - EXPECT_EQ(maxValue, 128 * 128 * 2 + 512); -} - -TEST_F(TestTiling, TestSelectWithBytesMaskTilingDiffAxisSmallShape) -{ - const auto srcShape = ge::Shape({ 1, 16 }); - const auto scalarShape = ge::Shape({1}); - const auto maskShape = ge::Shape({ 1, 32 }); - uint32_t maxValue; - uint32_t minValue; - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); - GetSelectWithBytesMaskMaxMinTmpSize(srcShape, scalarShape, 2, maskShape, 1, false, maxValue, minValue); - EXPECT_EQ(minValue, 1024 + 32); - EXPECT_EQ(maxValue, 1024 + 32); - GetSelectWithBytesMaskMaxMinTmpSize(scalarShape, srcShape, 2, maskShape, 1, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 1024); -} - -TEST_F(TestTiling, TestLayernormTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 128, 128, 128, 128, 128, 128 }; - auto layernormShape = ge::Shape(shapeDims); - const bool isReuseSource = false; - optiling::LayerNormTiling tilling; - - uint32_t minValue = 0; - uint32_t maxValue = 0; - - AscendC::GetLayerNormMaxMinTmpSize(layernormShape, 
typeSize, isReuseSource, maxValue, minValue); - EXPECT_EQ(maxValue, 3 * (128 * 128 * 128) * typeSize + 2 * (128 * 128) * typeSize); - EXPECT_EQ(minValue, 3 * 128 * typeSize + 2 * (128 * 128) * typeSize); - - AscendC::GetLayerNormNDTillingInfo(layernormShape, stackBufferSize, typeSize, isReuseSource, tilling); - EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float)); -} - -TEST_F(TestTiling, TestRmsnormTiling) +TEST_F(TestTiling, TestRmsnormTiling) { constexpr uint32_t bLength = 4; constexpr uint32_t sLength = 32; @@ -1911,39 +1232,6 @@ TEST_F(TestTiling, TestDeepnormTiling) EXPECT_EQ(tiling.get_oneTmpSize(), 512); } -TEST_F(TestTiling, TestExpTiling) -{ - std::vector shapeDims = {128, 128}; - auto expShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - - // float isReuseSrc = false 3 tmpBuffer - AscendC::GetExpMaxMinTmpSize(expShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 3 * 256); - EXPECT_EQ(maxValue, 3 * 128 * 128 * 4); - // float isReuseSrc = true 2 tmpBuffer - AscendC::GetExpMaxMinTmpSize(expShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 2 * 256); - EXPECT_EQ(maxValue, 2 * 128 * 128 * 4); - // half 4 tmpBuffer - AscendC::GetExpMaxMinTmpSize(expShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 4 * 256); - EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); - AscendC::GetExpMaxMinTmpSize(expShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 4 * 256); - EXPECT_EQ(maxValue, 4 * 128 * 128 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetExpTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); - AscendC::GetExpTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TestMatmulApiTilngFactorSplit1) { MatmulApiTiling tiling; @@ -2285,7 +1573,6 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM4) EXPECT_EQ(res, 0); } - TEST_F(TestTiling, 
TestMatmulApiTilngSingleCoreFullLoadCase) { optiling::TCubeTiling tilingData; @@ -2327,59 +1614,6 @@ TEST_F(TestTiling, TestMatmulApiTilngMultiCoreBTSCM5) EXPECT_EQ(res, 0); } -TEST_F(TestTiling, TestConcatTiling) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - const uint32_t elemCount = 128; - - AscendC::GetConcatTmpSize(plat, elemCount, 2); -} - -TEST_F(TestTiling, TestSortTiling) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - const uint32_t elemCount = 128; - - AscendC::GetSortTmpSize(plat, elemCount, 4); -} - -TEST_F(TestTiling, TestUnPadTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 32, 32 }; - auto srcShape = ge::Shape(shapeDims); - optiling::UnPadTiling tiling; - - AscendC::UnPadTilingFunc(srcShape, 0, typeSize, tiling); - AscendC::UnPadTilingFunc(srcShape, stackBufferSize, typeSize, tiling); - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetUnPadMaxMinTmpSize(plat, srcShape, typeSize, maxValue, minValue); -} - -TEST_F(TestTiling, TestPadTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 4; - - std::vector shapeDims = { 32, 32}; - std::vector ori_shape_dims = { 32, 31 }; - auto srcShape = ge::Shape(shapeDims); - auto oriSrcShape = ge::Shape(ori_shape_dims); - optiling::PadTiling tiling; - - AscendC::PadTilingFunc(srcShape, oriSrcShape, stackBufferSize, typeSize, tiling); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetPadMaxMinTmpSize(srcShape, typeSize, maxValue, minValue); -} - TEST_F(TestTiling, TestLayernormGradTiling) { const uint32_t stackBufferSize = 100 * 1024; @@ -2431,23 +1665,6 @@ TEST_F(TestTiling, TestLayernormGradBetaTiling) EXPECT_EQ(tiling.get_stackBufferSize(), stackBufferSize / sizeof(float)); } 
-TEST_F(TestTiling, TestConfusionTransposeTiling) -{ - const uint32_t stackBufferSize = 100 * 1024; - const uint32_t typeSize = 2; - - std::vector shapeDims = { 1, 2, 64, 32 }; - auto srcShape = ge::Shape(shapeDims); - optiling::ConfusionTransposeTiling tiling; - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 1, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 2, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 3, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 4, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 5, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 6, tiling); - AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, typeSize, 7, tiling); -} - TEST_F(TestTiling, TestMatmulApiTilngL0BNoDB) { MatmulApiTiling tiling; @@ -2784,129 +2001,6 @@ TEST_F(TestTiling, TestMatmulApiTilngSetShapeZero) EXPECT_EQ(ret, -1); } -// #if __CCE_AICORE__ == 200 -// TEST_F(TestTiling, TestPlatformAscendC) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); -// EXPECT_EQ(plat.GetCoreNumVector(), 8); -// EXPECT_EQ(plat.GetCoreNumVector() + plat.GetCoreNumAic() , 18); -// } -// #endif - -// #if __CCE_AICORE__ == 220 -// extern void platfrom_stub_set_num_aic(const char *num); -// extern void platfrom_stub_set_num_aiv(const char *num); -// extern void platfrom_stub_set_num_cub(const char *num); -// extern void platfrom_stub_set_ctl(const char *num); -// extern void platfrom_stub_set_chip_version(const char *num); -// extern void platfrom_stub_set_num(uint32_t num); -// TEST_F(TestTiling, TestPlatformAscendC) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); -// uint64_t ub_size, l1_size, l0; -// uint64_t l2_bw, hbm_bw, bw; 
-// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); -// EXPECT_EQ(ub_size, 196352); -// EXPECT_EQ(l1_size, 524032); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); -// EXPECT_EQ(l0, 65536 * 2); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); -// EXPECT_EQ(l2_bw, 110); -// EXPECT_EQ(hbm_bw, 32); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, bw); -// EXPECT_EQ(plat.GetCoreNum(), 48); -// EXPECT_EQ(plat.GetCoreNumAic(), 24); -// EXPECT_EQ(plat.GetCoreNumAiv(), 48); -// platfrom_stub_set_num_cub("20"); -// EXPECT_EQ(plat.GetCoreNumAic(), 20); -// platfrom_stub_set_num_aiv("40"); -// EXPECT_EQ(plat.GetCoreNumAiv(), 40); -// platfrom_stub_set_ctl("AICore"); -// EXPECT_EQ(plat.GetCoreNumAic(), 24); -// EXPECT_EQ(plat.GetCoreNumAiv(), 24); -// platfrom_stub_set_num_aic("20"); -// EXPECT_EQ(plat.GetCoreNumAic(), 20); -// EXPECT_EQ(plat.GetCoreNumAiv(), 20); -// EXPECT_EQ(bw, 0); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); - -// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 16 * 1024 * 1024); -// platfrom_stub_set_chip_version("Ascend910"); -// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2 * 1024 * 1024); -// EXPECT_EQ(plat.GetSocVersion(), platform_ascendc::SocVersion::ASCEND910); -// EXPECT_EQ(plat.GetCoreNumVector(), 0); -// } -// #endif - -// #if __CCE_AICORE__ == 300 -// extern void 
platfrom_stub_set_num_aic(const char *num); -// extern void platfrom_stub_set_num_aiv(const char *num); -// extern void platfrom_stub_set_num_cub(const char *num); -// extern void platfrom_stub_set_ctl(const char *num); -// extern void platfrom_stub_set_chip_version(const char *num); -// extern void platfrom_stub_set_num(uint32_t num); -// TEST_F(TestTiling, TestPlatformAscendC) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); -// uint64_t ub_size, l1_size, l0; -// uint64_t l2_bw, hbm_bw, bw; -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub_size); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L1, l1_size); -// EXPECT_EQ(ub_size, 248 * 1024); -// EXPECT_EQ(l1_size, 1024 * 1024); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, l0); -// EXPECT_EQ(l0, 65536); -// plat.GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, l0); -// EXPECT_EQ(l0, 65536 * 2); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::L2, l2_bw); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::HBM, hbm_bw); -// EXPECT_EQ(l2_bw, 256); -// EXPECT_EQ(hbm_bw, 17); -// plat.GetCoreMemBw(platform_ascendc::CoreMemType::UB, bw); -// EXPECT_EQ(plat.GetCoreNum(), 1); -// EXPECT_EQ(plat.GetCoreNumAic(), 1); -// EXPECT_EQ(plat.GetCoreNumAiv(), 1); -// platfrom_stub_set_num_cub("1"); -// EXPECT_EQ(plat.GetCoreNumAic(), 1); -// platfrom_stub_set_num_aiv("1"); -// EXPECT_EQ(plat.GetCoreNumAiv(), 1); -// platfrom_stub_set_ctl("AICore"); -// EXPECT_EQ(plat.GetCoreNumAic(), 1); -// EXPECT_EQ(plat.GetCoreNumAiv(), 1); -// platfrom_stub_set_num_aic("2"); -// EXPECT_EQ(plat.GetCoreNumAic(), 2); -// EXPECT_EQ(plat.GetCoreNumAiv(), 2); -// EXPECT_EQ(bw, 0); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 0, 1), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(1, 1, 0), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 1), 2); -// 
EXPECT_EQ(plat.CalcTschBlockDim(2, 2, 1), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(2, 1, 2), 1); -// EXPECT_EQ(plat.CalcTschBlockDim(3, 1, 2), 2); -// EXPECT_EQ(plat.CalcTschBlockDim(6, 1, 3), 2); - -// EXPECT_EQ(plat.GetLibApiWorkSpaceSize(), 2097152); -// EXPECT_EQ(plat.GetCoreNumVector(), 0); -// } -// #endif - TEST_F(TestTiling, TestMatmulApiTilngInt8Case1) { MatmulApiTiling tiling; @@ -3185,60 +2279,6 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case9) EXPECT_EQ(ret, 0); } -TEST_F(TestTiling, TestErfTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto erfShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfMaxMinTmpSize(erfShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); - EXPECT_EQ(minValue, 256 * 3); -} - -TEST_F(TestTiling, TestErfTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto erfShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfMaxMinTmpSize(erfShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 8); - EXPECT_EQ(minValue, 256 * 8); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetErfTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestErfcTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto erfcShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfcMaxMinTmpSize(erfcShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 7 * 4); - EXPECT_EQ(minValue, 256 * 7); -} - -TEST_F(TestTiling, TestErfcTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto erfcShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetErfcMaxMinTmpSize(erfcShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 16); - EXPECT_EQ(minValue, 256 * 16); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetErfcTmpBufferFactorSize(2, 
maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 16); - EXPECT_EQ(extraBuf, 0); -} - TEST_F(TestTiling, TestMatmulApiTilngInt8Case10) { MultiCoreMatmulTiling tiling; @@ -3317,95 +2357,6 @@ TEST_F(TestTiling, TestMatmulApiTilngInt8Case13) EXPECT_EQ(ret, 0); } -TEST_F(TestTiling, TestCoshTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto coshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestCoshTilingFloat512) -{ - std::vector shapeDims = { 512 }; - auto coshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCoshMaxMinTmpSize(coshShape, 4, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 4 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -TEST_F(TestTiling, TestCoshTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto coshShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCoshMaxMinTmpSize(coshShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 6); - EXPECT_EQ(minValue, 256 * 6); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetCoshTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 6); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSinTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto sinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinMaxMinTmpSize(sinShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 2 * 256); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - GetSinMaxMinTmpSize(sinShape, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 3 * 256); - EXPECT_EQ(maxValue, 128 * 128 * 3 * 4); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - 
-TEST_F(TestTiling, TestSinTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto sinShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetSinMaxMinTmpSize(sinShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 8 * 2); - EXPECT_EQ(minValue, 8 * 256); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSinTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAscendSumTiling) -{ - uint32_t n = 8; - uint32_t maxValue; - uint32_t minValue; - GetSumMaxMinTmpSize(n, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); - - maxValue = 0; - minValue = 0; - GetSumMaxMinTmpSize(n, 4, false, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); -} - TEST_F(TestTiling, TestAscendSiluTiling) { std::vector shapeDims = { 512 }; @@ -3428,54 +2379,6 @@ TEST_F(TestTiling, TestAscendSwishTiling) EXPECT_EQ(maxValue, 0); } -TEST_F(TestTiling, TestAscendXorTiling) -{ - std::vector shapeDims = { 128, 128 }; - auto xorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetXorMaxMinTmpSize(xorShape, 2, true, maxValue, minValue); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 2); - EXPECT_EQ(minValue, 1 * 256); - GetXorTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFracTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto fracShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFracMaxMinTmpSize(fracShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFracTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFracTilingHalf) -{ - 
std::vector shapeDims = { 128, 128 }; - auto fracShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFracMaxMinTmpSize(fracShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 1024); - EXPECT_EQ(maxValue, 131072); - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFracTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - #if __CCE_AICORE__ == 220 extern void platfrom_stub_set_chip_version(const char *num); TEST_F(TestTiling, TestTopkTiling_TopKModeNomal_isInitIndexTrue_Float_Inner64) @@ -3958,15 +2861,6 @@ TEST_F(TestTiling, TestTopkTiling_TopKModeSmall310P_HALF) } #endif -TEST_F(TestTiling, TestArithProgression) -{ - uint32_t maxValue; - uint32_t minValue; - GetArithProgressionMaxMinTmpSize(maxValue, minValue); - EXPECT_EQ(maxValue, 0); - EXPECT_EQ(minValue, 0); -} - TEST_F(TestTiling, TestGeGLUTilingFloat) { std::vector shapeDims = { 128, 128 }; @@ -4001,566 +2895,6 @@ TEST_F(TestTiling, TestGeGLUTilingHalf) EXPECT_EQ(extraBuf, 0); } -TEST_F(TestTiling, TestLgammaTilingFp32) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - GetLgammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); - EXPECT_EQ(maxSize, 458752); - EXPECT_EQ(minSize, 1792); - - GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 524288); - EXPECT_EQ(minSize, 2048); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetLgammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 2048); - EXPECT_EQ(minSize, 2048); - - GetLgammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); - EXPECT_EQ(maxSize, 1792); - EXPECT_EQ(minSize, 1792); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetLgammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 8); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestLgammaTilingHalf) -{ - std::vector 
shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - - GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 128 * 128 * 2 * 13 * 2); - EXPECT_EQ(minSize, 13 * 2 * 256); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetLgammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 256 * 13 * 2); - EXPECT_EQ(minSize, 256 * 13 * 2); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetLgammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 13); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestDigammaTilingFp32) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - GetDigammaMaxMinTmpSize(shape, 4, true, maxSize, minSize); - EXPECT_EQ(maxSize, 393216); - EXPECT_EQ(minSize, 1536); - - GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 458752); - EXPECT_EQ(minSize, 1792); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetDigammaMaxMinTmpSize(shape, 4, false, maxSize, minSize); - EXPECT_EQ(maxSize, 1792); - EXPECT_EQ(minSize, 1792); - - GetDigammaMaxMinTmpSize(shape, 4, true,maxSize, minSize); - EXPECT_EQ(maxSize, 1536); - EXPECT_EQ(minSize, 1536); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetDigammaTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 7); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestDigammaTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - - GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 128 * 128 * 2 * 8 * 2); - EXPECT_EQ(minSize, 8 * 2 * 256); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetDigammaMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 256 * 8 * 2); - EXPECT_EQ(minSize, 
256 * 8 * 2); - - uint32_t maxLiveNodeCnt = 0xffff; - uint32_t extraBuf = 0xffff; - GetDigammaTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 16); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAtanhTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto aTanhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 4 * 1); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAtanhTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAtanhTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto aTanhShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - AscendC::GetAtanhMaxMinTmpSize(aTanhShape, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 4); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - AscendC::GetAtanhTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 4); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestSignTiling) -{ - std::vector shapeDims = { 128, 128 }; - auto signShape = ge::Shape(shapeDims); - uint32_t signNeedMaxSize; - uint32_t signNeedMinSize; - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetSignMaxMinTmpSize(signShape, 2, false, signNeedMaxSize, signNeedMinSize); - EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 2); - EXPECT_EQ(signNeedMinSize, 3 * 256); - - GetSignMaxMinTmpSize(signShape, 4, false, signNeedMaxSize, signNeedMinSize); - EXPECT_EQ(signNeedMaxSize, 3 * 128 * 128 * 4); - EXPECT_EQ(signNeedMinSize, 3 * 256); - - GetSignTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 3); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestAscendMeanTiling) -{ - uint32_t n = 8; - uint32_t maxValue; - 
uint32_t minValue; - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - - GetMeanMaxMinTmpSize(n, 2, 2, true, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); - - maxValue = 0; - minValue = 0; - GetMeanMaxMinTmpSize(n, 4, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 32); - EXPECT_EQ(maxValue, 32); - - GetMeanMaxMinTmpSize(n, 2, 4, true, maxValue, minValue); - EXPECT_EQ(minValue, 96); - EXPECT_EQ(maxValue, 96); - - GetMeanTmpBufferFactorSize(4, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -// TEST_F(TestTiling, TestKernelContextBuildBase) -// { -// auto builder = context_ascendc::BuildKernelRunContext(2, 2); -// EXPECT_EQ(builder.kernelInputNum, 2); -// } - - -// TEST_F(TestTiling, TestKernelContextBuild) -// { -// gert::Shape input1_shape = {2, 1, 1, 1, 1, 1, 1, 2, 2}; -// int32_t input1_tensor_buffer[] = {0, 2, 3, 3, 1, 0, 0, 1}; -// gert::TensorData input1_tensor_data{(void*)input1_tensor_buffer, nullptr}; -// gert::Shape output_shape = {5, 3}; -// int64_t output_tensor_buffer[15]; -// gert::TensorData output_tensor_data{(void*)output_tensor_buffer, nullptr}; -// auto kernelHolder = -// context_ascendc::KernelRunContextBuilder() -// .KernelIONum(2, 2) -// .Inputs({reinterpret_cast(&input1_shape), -// reinterpret_cast(&input1_tensor_data)}) -// .Outputs({reinterpret_cast(&output_shape), reinterpret_cast(&output_tensor_data)}) -// .NodeIoNum(1, 1) -// .IrInputNum(1) -// .NodeInputTd(0, ge::DT_INT32, ge::FORMAT_ND, ge::FORMAT_ND) -// .NodeOutputTd(0, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND) -// .Build(); -// auto context = kernelHolder.GetContext(); -// EXPECT_NE(context, nullptr); -// } - -// TEST_F(TestTiling, TestTilingContextBuildWithConstValue) -// { -// string active_type = "gelu"; -// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; -// gert::StorageShape expert_tokens_shape = {{16}, {16}}; -// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; -// 
gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; -// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; -// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; - -// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; - -// std::vector expert_tokens_const_value (16, 1); -// std::vector x_const_value (1024 * 5120, 2.f); -// std::vector bias2_value (16 * 5120, 3.f); -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .SetOpNameType("name", "tpye") -// .NodeIoNum(6, 1) -// .IrInstanceNum({1, 1, 1, 1, 1, 1}) -// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) -// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) -// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) -// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, reinterpret_cast(expert_tokens_const_value.data())) -// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) -// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) -// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) -// .AddAttrs({ -// {"activation", ge::AnyValue::CreateFrom(active_type)}, -// {"inner_precise", ge::AnyValue::CreateFrom(1)} -// }) -// .TilingData(param.get()) -// .Workspace(ws_size) -// .Build(); - -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_NE(tiling_context, nullptr); - -// } - -// TEST_F(TestTiling, TestTilingContextBuildAddInputs) -// { -// string active_type = "gelu"; -// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; -// std::vector inputs; -// std::vector outputs; -// context_ascendc::TensorInfo input; -// 
input.shape = x_shape; -// input.dType = ge::DT_FLOAT16; -// input.oriFormat = ge::FORMAT_ND; -// input.format = ge::FORMAT_ND; -// input.dataPath = "1111"; -// inputs.push_back(input); -// context_ascendc::TensorInfo output; -// output.shape = x_shape; -// output.dType = ge::DT_FLOAT16; -// output.oriFormat = ge::FORMAT_ND; -// output.format = ge::FORMAT_ND; -// output.dataPath = "222"; -// outputs.push_back(output); - -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .SetOpNameType("name", "tpye") -// .NodeIoNum(1, 1) -// .IrInstanceNum({1}) -// .AddInputs(inputs) -// .AddOutputs(outputs) -// .AddAttrs({ -// {"activation", ge::AnyValue::CreateFrom(active_type)}, -// {"inner_precise", ge::AnyValue::CreateFrom(1)} -// }) -// .TilingData(param.get()) -// .Workspace(ws_size) -// .Build(); - -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_NE(tiling_context, nullptr); -// } - -// TEST_F(TestTiling, TestTilingContextBuildFailed) -// { -// string active_type = "gelu"; -// gert::StorageShape x_shape = {{-1, 5120}, {-1, 5120}}; -// std::vector x_const_value (1024 * 5120, 2.f); -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .NodeIoNum(1, 1) -// .IrInstanceNum({1, 1}) -// .CompileInfo(nullptr) -// .PlatformInfo(nullptr) -// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) -// .Workspace(ws_size) -// .Build(); - -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_EQ(tiling_context, nullptr); -// } - -// TEST_F(TestTiling, TestTilingContextBuildWithBinFile) -// { -// string 
active_type = "gelu"; -// gert::StorageShape x_shape = {{1024, 5120}, {1024, 5120}}; -// gert::StorageShape expert_tokens_shape = {{16}, {16}}; -// gert::StorageShape weight1_shape = {{16, 5120, 0}, {16, 5120, 0}}; -// gert::StorageShape bias1_shape = {{16, 0}, {16, 0}}; -// gert::StorageShape weight2_shape = {{16, 0, 5120}, {16, 0, 5120}}; -// gert::StorageShape bias2_shape = {{16, 5120}, {16, 5120}}; -// gert::StorageShape output_shape = {{1024, 5210}, {1024, 5210}}; - -// std::vector expert_tokens_const_value (16, 1); - -// std::vector x_const_value (1024 * 5120, 2.f); -// std::vector bias2_value (16 * 5120, 3.f); -// auto param = gert::TilingData::CreateCap(4096); -// auto workspace_size_holer = gert::ContinuousVector::Create(4096); -// auto ws_size = reinterpret_cast(workspace_size_holer.get()); -// auto holder = context_ascendc::TilingContextBuilder() -// .SetOpNameType("name", "tpye") -// .NodeIoNum(6, 1) -// .IrInstanceNum({1, 1, 1, 1, 1, 1}) -// .AddInputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &x_shape, reinterpret_cast(x_const_value.data())) -// .AddInputTd(1, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight1_shape) -// .AddInputTd(2, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &weight2_shape) -// .AddInputTd(3, ge::DT_INT64, ge::FORMAT_ND, ge::FORMAT_ND, &expert_tokens_shape, "./expert_tokens_data.bin") -// .AddInputTd(4, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &bias1_shape) -// .AddInputTd(5, ge::DT_BF16, ge::FORMAT_ND, ge::FORMAT_ND, &bias2_shape, reinterpret_cast(bias2_value.data())) -// .AddOutputTd(0, ge::DT_FLOAT16, ge::FORMAT_ND, ge::FORMAT_ND, &output_shape) -// .AddAttrs({ -// {"activation", ge::AnyValue::CreateFrom(active_type)}, -// {"inner_precise", ge::AnyValue::CreateFrom(1)} -// }) -// .TilingData(param.get()) -// .Workspace(ws_size) -// .Build(); -// gert::TilingContext* tiling_context = holder.GetContext(); -// EXPECT_EQ(tiling_context, nullptr); -// } - -TEST_F(TestTiling, TestAxpyTiling) -{ - uint32_t maxVal = 
0; - uint32_t minVal = 0; - GetAxpyMaxMinTmpSize(ge::Shape({128}), 4, false, maxVal, minVal); - EXPECT_EQ(maxVal, 0); - EXPECT_EQ(minVal, 0); - GetAxpyMaxMinTmpSize(ge::Shape({256}), 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4 * 2); - EXPECT_EQ(minVal, 256 * 4); - GetAxpyMaxMinTmpSize(ge::Shape({32}), 2, false, maxVal, minVal); - EXPECT_EQ(maxVal, 256 * 4); - EXPECT_EQ(minVal, 256 * 4); - uint32_t extraBuf = 123; - uint32_t maxLivedNodesCnt = 123; - GetAxpyTmpBufferFactorSize(4, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 1); - GetAxpyTmpBufferFactorSize(2, maxLivedNodesCnt, extraBuf); - EXPECT_EQ(extraBuf, 0); - EXPECT_EQ(maxLivedNodesCnt, 4); -} - -TEST_F(TestTiling, TestCeilTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto ceilShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCeilMaxMinTmpSize(ceilShape, sizeof(float), false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 1); - EXPECT_EQ(maxValue, 128 * 128 * 1 * 4); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetCeilTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 1); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestCeilTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto ceilShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetCeilTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestCeilTilingHalf512) -{ - std::vector shapeDims = { 512 }; - auto ceilShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetCeilMaxMinTmpSize(ceilShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 2 * 2); - EXPECT_EQ(minValue, 
256 * 2); -} - -TEST_F(TestTiling, TestFloorTilingFloat) -{ - std::vector shapeDims = { 128, 128 }; - auto floorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFloorMaxMinTmpSize(floorShape, sizeof(float), false, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFloorTmpBufferFactorSize(sizeof(float), maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 0); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFloorTilingHalf) -{ - std::vector shapeDims = { 128, 128 }; - auto floorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); - EXPECT_EQ(minValue, 256 * 2); - EXPECT_EQ(maxValue, 128 * 128 * 2 * 2); - - uint32_t maxLiveNodeCnt; - uint32_t extraBuf; - GetFloorTmpBufferFactorSize(2, maxLiveNodeCnt, extraBuf); - EXPECT_EQ(maxLiveNodeCnt, 2); - EXPECT_EQ(extraBuf, 0); -} - -TEST_F(TestTiling, TestFloorTilingHalf512) -{ - std::vector shapeDims = { 512 }; - auto floorShape = ge::Shape(shapeDims); - uint32_t maxValue = 0; - uint32_t minValue = 0; - GetFloorMaxMinTmpSize(floorShape, 2, false, maxValue, minValue); - EXPECT_EQ(maxValue, 512 * 2 * 2); - EXPECT_EQ(minValue, 256 * 2); -} - -// TEST_F(TestTiling, TestGetSocVersion) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// platform_ascendc::SocVersion ret = plat.GetSocVersion(); -// EXPECT_EQ(ret, platform_ascendc::SocVersion::RESERVED_VERSION); -// } - -// TEST_F(TestTiling, TestCoreNum) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// 
MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// uint32_t ret1 = plat.GetCoreNumAic(); -// uint32_t ret2 = plat.GetCoreNumAiv(); -// EXPECT_EQ(ret1, 0); -// EXPECT_EQ(ret2, 0); -// } - -// TEST_F(TestTiling, TestGetLibApiWorkSpaceSize) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// uint32_t ret1 = plat.GetLibApiWorkSpaceSize(); -// EXPECT_EQ(ret1, static_cast(-1)); -// } -// TEST_F(TestTiling, TestPlatformAscendCManager) -// { -// void *handle; -// int a = 7; -// handle = &a; - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); - -// auto ret2 = platform_ascendc::PlatformAscendCManager::GetInstance(); -// } - -// TEST_F(TestTiling, TestGetVectorCoreNum) -// { -// fe::PlatFormInfos platform_info; -// auto plat = platform_ascendc::PlatformAscendC(&platform_info); - -// MOCKER_CPP(&fe::PlatFormInfos::GetPlatformResWithLock, -// bool(fe::PlatFormInfos::*)(const std::string &, const std::string &, std::string &)) -// .stubs() -// .will(returnValue(false)); -// MOCKER_CPP(&platform_ascendc::PlatformAscendC::GetSocVersion, -// platform_ascendc::SocVersion(platform_ascendc::PlatformAscendC::*)(void) const) -// .stubs() -// .will(returnValue(platform_ascendc::SocVersion::ASCEND310P)); - -// uint32_t ret1 = plat.GetCoreNumVector(); -// EXPECT_EQ(ret1, static_cast(0)); -// MOCKER_CPP(&platform_ascendc::PlatformAscendCManager::PlatformAscendCInit) -// .stubs() -// .will(returnValue(platform_info)); -// auto ret2 = 
platform_ascendc::PlatformAscendCManager::GetInstance(); - -// } - TEST_F(TestTiling, TestReGluFloat16OrBf16) { const std::vector srcShapeDims = { 8, 128 }; @@ -4581,234 +2915,4 @@ TEST_F(TestTiling, TestReGluFloat32) GetReGluMaxMinTmpSize(srcShape, 4, false, maxValue, minValue); EXPECT_EQ(minValue, 256); EXPECT_EQ(maxValue, 256); -} - -#if __CCE_AICORE__ == 220 -extern void platfrom_stub_set_chip_version(const char *num); -TEST_F(TestTiling, TestBroadCast220) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend910B"); - uint32_t firstDim = 32; - uint32_t lastDim = 32; - std::vector srcShapeDims = {firstDim, 1}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {firstDim, lastDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t halfOneBlockElementNum = 16; - constexpr uint32_t minHalfAlignSize = halfOneBlockElementNum * halfOneBlockElementNum * halfSize; - constexpr uint32_t BRCB_ONE_SIZE = 8; - uint32_t firstDimAlignNum = (firstDim + BRCB_ONE_SIZE - 1) / BRCB_ONE_SIZE * BRCB_ONE_SIZE; - uint32_t maxHalfAlignSize = firstDimAlignNum * halfOneBlockElementNum * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize); - EXPECT_EQ(maxValue, maxHalfAlignSize); - - srcShapeDims = {firstDim, 1}; - srcShape = ge::Shape(srcShapeDims); - uint32_t lastDimNotAlign = 31; - dstShapeDims = {firstDim, lastDimNotAlign}; - dstShape = ge::Shape(dstShapeDims); - - uint32_t blockDimAlignBlockNum = (lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum; - uint32_t blockDimAlign = blockDimAlignBlockNum * halfOneBlockElementNum; - uint32_t minCopyTempBufferSize = halfOneBlockElementNum * blockDimAlign * halfSize; - auto minHalfNotAlignSize = minHalfAlignSize + 
minCopyTempBufferSize; - - uint32_t maxCopyTempBufferSize = firstDim * blockDimAlign * halfSize; - uint32_t maxHalfNotAlignValue = maxHalfAlignSize + maxCopyTempBufferSize; - - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfNotAlignSize); - EXPECT_EQ(maxValue, maxHalfNotAlignValue); - - constexpr uint32_t int8Size = 1; - srcShapeDims = {firstDim, 1}; - srcShape = ge::Shape(srcShapeDims); - dstShapeDims = {firstDim, lastDim}; - dstShape = ge::Shape(dstShapeDims); - const uint32_t alignSrcSize = - ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - uint32_t alignDstSize = - ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); - EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); - - srcShapeDims = {firstDim, 1}; - srcShape = ge::Shape(srcShapeDims); - dstShapeDims = {firstDim, lastDimNotAlign}; - dstShape = ge::Shape(dstShapeDims); - alignDstSize = - ((firstDim * lastDimNotAlign + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfNotAlignSize + castTempBufferSize); - EXPECT_EQ(maxValue, maxHalfNotAlignValue + castTempBufferSize); -} -#endif - -#if __CCE_AICORE__ == 200 -extern void platfrom_stub_set_chip_version(const char *num); -TEST_F(TestTiling, TestLastBroadCast200) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend310P"); - uint32_t firstDim = 32; - uint32_t 
lastDim = 32; - std::vector srcShapeDims = {firstDim, 1}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {firstDim, lastDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t halfOneBlockElementNum = 16; - constexpr uint32_t MAX_BLOCK_NUM = 8; - constexpr uint32_t ONE_BLOCK_SIZE = 32; - uint32_t minTmpBufferSize = - halfOneBlockElementNum * ((lastDim + MAX_BLOCK_NUM - 1) / MAX_BLOCK_NUM) * halfSize; - uint32_t minHalfAlignSize = ONE_BLOCK_SIZE + + minTmpBufferSize; - uint32_t maxHalfAlignSize = ONE_BLOCK_SIZE + firstDim * lastDim * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize); - EXPECT_EQ(maxValue, maxHalfAlignSize); - - constexpr uint32_t int8Size = 1; - const uint32_t alignSrcSize = - ((firstDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - const uint32_t alignDstSize = - ((firstDim * lastDim + halfOneBlockElementNum - 1) / halfOneBlockElementNum) * halfOneBlockElementNum; - const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfAlignSize + castTempBufferSize); - EXPECT_EQ(maxValue, maxHalfAlignSize + castTempBufferSize); -} - -TEST_F(TestTiling, TestFirstBroadCast200) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend310P"); - uint32_t firstDim = 32; - uint32_t lastDim = 32; - std::vector srcShapeDims = {1, lastDim}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {firstDim, lastDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t ONE_BLOCK_SIZE = 32; - 
GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, ONE_BLOCK_SIZE); - EXPECT_EQ(maxValue, ONE_BLOCK_SIZE); - - constexpr uint32_t int8Size = 1; - constexpr uint32_t HALF_ONE_BLK_SIZE = 16; - const uint32_t alignSrcSize = ((lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t alignDstSize = - ((firstDim * lastDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, ONE_BLOCK_SIZE + castTempBufferSize); - EXPECT_EQ(maxValue, ONE_BLOCK_SIZE + castTempBufferSize); -} - -TEST_F(TestTiling, TestOneElementBroadCast200) -{ - fe::PlatFormInfos platform_info; - auto plat = platform_ascendc::PlatformAscendC(&platform_info); - platfrom_stub_set_chip_version("Ascend310P"); - uint32_t srcDim = 1; - uint32_t dstDim = 32; - std::vector srcShapeDims = {srcDim}; - auto srcShape = ge::Shape(srcShapeDims); - std::vector dstShapeDims = {dstDim}; - auto dstShape = ge::Shape(dstShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, halfSize, false, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); - - constexpr uint32_t int8Size = 1; - constexpr uint32_t HALF_ONE_BLK_SIZE = 16; - constexpr uint32_t ONE_BLOCK_SIZE = 32; - const uint32_t alignSrcSize = ((srcDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t alignDstSize = ((dstDim + HALF_ONE_BLK_SIZE - 1) / HALF_ONE_BLK_SIZE) * HALF_ONE_BLK_SIZE; - const uint32_t castTempBufferSize = (alignSrcSize + alignDstSize) * halfSize; - GetBroadCastMaxMinTmpSize(plat, srcShape, dstShape, int8Size, false, maxValue, minValue); - EXPECT_EQ(minValue, castTempBufferSize + ONE_BLOCK_SIZE); - 
EXPECT_EQ(maxValue, castTempBufferSize + ONE_BLOCK_SIZE); -} -#endif - -TEST_F(TestTiling, TestReduceXorSumTilingInt16) -{ - std::vector shapeDims = { 128, 128 }; - auto shape = ge::Shape(shapeDims); - uint32_t maxSize; - uint32_t minSize; - GetReduceXorSumMaxMinTmpSize(shape, 2, true, maxSize, minSize); - EXPECT_EQ(maxSize, 65536); - EXPECT_EQ(minSize, 65536); - - GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 98304); - EXPECT_EQ(minSize, 98304); - - shapeDims = { 8 }; - shape = ge::Shape(shapeDims); - GetReduceXorSumMaxMinTmpSize(shape, 2, false, maxSize, minSize); - EXPECT_EQ(maxSize, 768); - EXPECT_EQ(minSize, 768); - - GetReduceXorSumMaxMinTmpSize(shape, 2, true,maxSize, minSize); - EXPECT_EQ(maxSize, 512); - EXPECT_EQ(minSize, 512); -} - -TEST_F(TestTiling, TestCumSum) -{ - uint32_t firstDim = 32; - uint32_t lastDim = 16; - std::vector srcShapeDims = {firstDim, lastDim}; - auto srcShape = ge::Shape(srcShapeDims); - uint32_t maxValue{0}; - uint32_t minValue{0}; - constexpr uint32_t halfSize = 2; - constexpr uint32_t transDataTo5HDAddrListSize = 16; - uint32_t minHalfSize = transDataTo5HDAddrListSize * lastDim * 3 * sizeof(uint16_t); - uint32_t alignOutter = (firstDim + transDataTo5HDAddrListSize - 1) / transDataTo5HDAddrListSize * transDataTo5HDAddrListSize; - uint32_t maxHalfSize = alignOutter * lastDim * 3 * sizeof(uint16_t); - - GetCumSumMaxMinTmpSize(srcShape, halfSize, true, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfSize); - EXPECT_EQ(maxValue, maxHalfSize); - - constexpr uint32_t floatSize = 4; - uint32_t minFloatSize = transDataTo5HDAddrListSize * lastDim * 2 * sizeof(float); - uint32_t maxFloatSize = alignOutter * lastDim * 2 * sizeof(float); - - GetCumSumMaxMinTmpSize(srcShape, floatSize, true, false, maxValue, minValue); - EXPECT_EQ(minValue, minFloatSize); - EXPECT_EQ(maxValue, maxFloatSize); - - maxHalfSize = minHalfSize = firstDim * lastDim * sizeof(float); - GetCumSumMaxMinTmpSize(srcShape, 
halfSize, false, false, maxValue, minValue); - EXPECT_EQ(minValue, minHalfSize); - EXPECT_EQ(maxValue, maxHalfSize); - - - GetCumSumMaxMinTmpSize(srcShape, floatSize, false, false, maxValue, minValue); - EXPECT_EQ(minValue, 0); - EXPECT_EQ(maxValue, 0); } \ No newline at end of file -- Gitee From aa7700dbbadea2170bc36b681041f294df62ca95 Mon Sep 17 00:00:00 2001 From: jiangchengcheng-on Date: Mon, 22 Jul 2024 07:49:14 +0000 Subject: [PATCH 8/8] add llt case Signed-off-by: jiangchengcheng-on --- tests/tiling/test_tiling.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 219b8605..966bb8ad 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -77,6 +77,25 @@ TEST_F(TestTiling, PlatformConstructor) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, TestInt4BaseK) +{ + matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, + .l0CSize = 131072, .ubSize = 196608, .l0ASize = 65536, .l0BSize = 65536}; + MatmulApiTiling tiling(plat); + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT4); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); + tiling.SetBias(false); + tiling.SetShape(144, 256, 32); + tiling.SetOrgShape(144, 256, 32); + tiling.SetBufferSpace(256 * 1024, 128 * 1024, -1); + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(tilingData.get_baseK() % 64, 0); + EXPECT_EQ(ret, 0); +} + TEST_F(TestTiling, Tiling_310p_NotAligned) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND310P, .l1Size = 1048576, -- Gitee