diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 7602832b7c0ec5c50e84bee1a53008e2d364145d..8372805ac6f926422b2fe9a851c3ed0338d0233f 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -654,10 +654,6 @@ template ::CheckTiling() { #ifdef ASCENDC_CPU_DEBUG - ASCENDC_ASSERT((var.tiling_.GetUsedCoreNum() > 0), { - KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetUsedCoreNum() is %d , which should be larger than 0", - var.tiling_.GetUsedCoreNum()); - }); ASCENDC_ASSERT((M_ > 0), { KERNEL_LOG(KERNEL_ERROR, "M_ is %d , which should be larger than 0", M_); }); ASCENDC_ASSERT((N_ > 0), { KERNEL_LOG(KERNEL_ERROR, "N_ is %d , which should be larger than 0", N_); }); ASCENDC_ASSERT((Ka_ > 0), { KERNEL_LOG(KERNEL_ERROR, "Ka_ is %d , which should be larger than 0", Ka_); }); diff --git a/impl/matmul/matmul_server.h b/impl/matmul/matmul_server.h index 3432d06a528ac90d7a73cfb342715cd9739a2109..a31a2d0166238b4b4479f513a33ca5ab6577b428 100644 --- a/impl/matmul/matmul_server.h +++ b/impl/matmul/matmul_server.h @@ -1,3 +1,4 @@ +<<<<<<< HEAD /** * Copyright (c) 2024 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. @@ -1062,4 +1063,1051 @@ public: __aicore__ inline MatmulServiceAux() {} }; } // namespace matmul +======= +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_server.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_SERVER_H +#define LIB_MATMUL_MATMUL_SERVER_H + +#include "../../lib/matmul/matmul.h" +#include "kernel_operator.h" + +namespace matmul { +constexpr uint16_t WORKSPACE_SYNC_ID = 15; +using namespace AscendC; +template struct IBShareCache { + __aicore__ inline IBShareCache() {}; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = uint16_t; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = GlobalCache; +}; +template __aicore__ constexpr bool IsIBShare() +{ + if (A_TYPE::ibShare == true) { + return true; + } + if (B_TYPE::ibShare == true) { + return true; + } + return false; +} + +struct MatmulMsg { + uint32_t setOrgShape : 1; + uint32_t orgM; + uint32_t orgN; + uint32_t orgKa; + uint32_t orgKb; + uint32_t orgKc; +}; + +struct ShareMatmulBase { + __aicore__ inline ShareMatmulBase() {}; +}; + +struct ShareMatmul : ShareMatmulBase { + __aicore__ inline ShareMatmul(){}; + MatmulMsg msg0; + MatmulMsg msg1; +}; + +template +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmulBase; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmul; +}; + +__aicore__ inline void clearWorkspace(__gm__ uint8_t* workspace) +{ + SetAtomicNone(); +#if __CCE_AICORE__ == 220 + if ASCEND_IS_AIC { + SetMaskNorm(); + SetLoadDataBoundary((uint64_t)0); + SetLoadDataPaddingValue((uint64_t)0); + } else { + AscendCUtils::SetMask((uint64_t)-1, (uint64_t)-1); + SetMaskNorm(); + } +#endif + +#ifdef __DAV_C220_CUBE__ + ClearWorkspaceImpl(workspace); + NotifyEvent(WORKSPACE_SYNC_ID); +#endif +} + +template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> +class MatmulService { + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + +public: + __aicore__ inline MatmulService() {} + __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace) + { + ASSERT(tpipe != nullptr && "tpipe cannot be nullptr when init kfc matmul server"); + ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server"); + ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); + ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server"); + this->instID = instID; + this->kfcCommSrv = kfc; + this->tpipe = tpipe; + this->workspace = workspace; + mul.SetSubBlockIdx(kfcCommSrv->subBlockID); + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + msgAux.msg0.setOrgShape = false; + msgAux.msg1.setOrgShape = false; + } + this->devEvtID = instID; + if constexpr (A_TYPE::ibShare == true || B_TYPE::ibShare == true) { + if (kfcCommSrv->subBlockID == 0) { + gCache.Init(); + } + } + using TILING_TYPE = typename std::remove_cv::type>::type; + if constexpr (IsSameTypeV) { + tiling_.SetTiling((TCubeTiling *)tiling); + mul.Init(tiling_.GetTiling(), nullptr); + } else if (tiling) { + tiling_.SetTiling((TCubeTiling *)tiling); + mul.Init(tiling_.GetTiling(), nullptr); + } + } + + __aicore__ inline void Init(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + return; + } else { + ASSERT(msg != nullptr && "msg cannot be nullptr when init matmul server"); + 
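// After validating the message, the code below refreshes the GM cache lines that cover the tiling data and copies it word by word into the local tmpTiling_ before re-initializing mul. +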
ASSERT(msg->tilingInfo.tilingAddr != nullptr && "tiling cannot be nullptr when init matmul server"); + auto temp1 = ((__gm__ uint32_t*)(msg->tilingInfo.tilingAddr)); + tiling_.SetTiling(&tmpTiling_); + auto temp2 = (uint32_t*)(tiling_.GetTiling()); + + constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; + GlobalTensor tilingGlobal; + for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { + Barrier(); + tilingGlobal.SetGlobalBuffer((__gm__ int64_t *)(msg->tilingInfo.tilingAddr + i)); + DataCacheCleanAndInvalid(tilingGlobal); + } + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.Init(this->tiling_.GetTiling(), nullptr); + } + } + + __aicore__ inline void SetSubBlockIdx(uint8_t idx) + { + mul.SetSubBlockIdx(idx); + } + + __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + if (mul.GetSubBlockIdx() == 0) { + msgAux.msg0.orgM = msg->orgShape.orgM; + msgAux.msg0.orgN = msg->orgShape.orgN; + msgAux.msg0.orgKa = msg->orgShape.orgKa; + msgAux.msg0.orgKb = msg->orgShape.orgKb; + msgAux.msg0.orgKc = msg->orgShape.orgKc; + msgAux.msg0.setOrgShape = true; + } else { + msgAux.msg1.orgM = msg->orgShape.orgM; + msgAux.msg1.orgN = msg->orgShape.orgN; + msgAux.msg1.orgKa = msg->orgShape.orgKa; + msgAux.msg1.orgKb = msg->orgShape.orgKb; + msgAux.msg1.orgKc = msg->orgShape.orgKc; + msgAux.msg1.setOrgShape = true; + } + } else { + mul.SetOrgShape(msg->orgShape.orgM, msg->orgShape.orgN, msg->orgShape.orgKa, msg->orgShape.orgKb, + msg->orgShape.orgKc); + } + } + + __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetTail(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetHF32(__gm__ KfcMsg* msg) + { + mul.SetHF32(static_cast(msg->body.enHF32), static_cast(msg->body.hf32TransMode)); + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorA) + return; + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcAT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcAT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeAmatrix); + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr, size); + mul.SetTensorA(scmLocal, msg->body.isTransA); + } else { + GlobalTensor aGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(msg->body.aAddr), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorA) { + return; + } + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcAT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcAT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr + offset, size); + mul.SetTensorA(scmLocal, 
msg->body.isTransA); + } else { + GlobalTensor aGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(msg->body.aAddr + offset), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetQuantVector(__gm__ KfcMsg* msg) + { + if (!msg->body.setQuant) { + return; + } + int quantMode = msg->body.quantMode; + if (quantMode == 1) { + uint64_t quantScalar = msg->body.quantScalar; + mul.SetQuantScalar(quantScalar); + } else if (quantMode == 2) { + const uint64_t size = static_cast(msg->body.quantSize); + GlobalTensor quantGlobal; + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size); + mul.SetQuantVector(quantGlobal); + } + } + + __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return; + } + if (!msg->body.setBatch) { + return; + } + mul.SetBatchNum(msg->body.batchA, msg->body.batchB); + } + + __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg) + { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + mul.SetSelfDefineData(msg->body.dataPtr); + if constexpr (!ToMatmulConfig(MM_CFG).enableReuse) { + GlobalTensor dataGlobal; + dataGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t*>(msg->body.dataPtr)); + DataCacheCleanAndInvalid(dataGlobal); + } + } + + __aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg) + { + mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorB) + return; + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcBT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcBT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeBmatrix); + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(msg->body.bAddr), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorB) { + return; + } + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcBT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcBT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr + offset, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(msg->body.bAddr + offset), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_.GetSingleCoreN(); + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename 
BIAS_TYPE::T*>(msg->body.biasAddr), size); + mul.SetBias(biasGlobal); + } + } else if (msg->body.setClearBias) { + mul.DisableBias(); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_.GetSingleCoreN(); + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr + offset, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer( + reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size); + mul.SetBias(biasGlobal); + } + } else if (msg->body.setClearBias) { + mul.DisableBias(); + } + } + + __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + size = ToMatmulConfig(MM_CFG).baseMN; + } else { + size = tiling_.GetBaseM() * tiling_.GetBaseN(); + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } else { + GlobalTensor cGlobal; + + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } + // Now release UB + if constexpr (PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync == 1) { // Synchronize + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return false; + } + + __aicore__ inline uint16_t GetInstID() + { + return instID; + } + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) { + mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, + msgAux.msg0.orgKb, msgAux.msg0.orgKc); + } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) { + mul.SetOrgShape(msgAux.msg1.orgM, msgAux.msg1.orgN, msgAux.msg1.orgKa, + msgAux.msg1.orgKb, msgAux.msg1.orgKc); + } + } + if (msg->body.isFirstIter) { + SetTensorA(msg); + SetTensorB(msg); + if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { + SetBias(msg); + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) { + SetTail(msg); + } + if constexpr (ToMatmulConfig(MM_CFG).enableQuantVector) { + SetQuantVector(msg); + } + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) || + ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { + SetBatchNum(msg); + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) { + SetSelfDefineData(msg); + } + } + } + + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize, + const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0) + { + if (msg->body.isFirstIter) { + SetTensorA(msg, batchASize, offsetA); + SetTensorB(msg, batchBSize, offsetB); + SetBias(msg, offsetBias); + SetTail(msg); + SetQuantVector(msg); + SetBatchNum(msg); + } + } + + __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return true; + } + // In the batch scenario, messages occupy 128 bytes. 
After the update, messages occupy 64 bytes. + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); +#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 + if (msg->body.setQuant == 1) { + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in + // Batch MM + } +#endif + IterateSetMessage(msg); + uint64_t size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync || msg->body.waitIterateBatch) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { + return true; + } + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); +#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 + if (msg->body.setQuant == 1) { + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in + // Batch MM + } +#endif + const uint64_t size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop; + uint64_t batchAOffset = tiling_.GetALayoutInfoD() * msg->body.batchA; + if constexpr (A_TYPE::layout != LayoutMode::SBNGD) { + batchAOffset = batchAOffset * tiling_.GetALayoutInfoS(); + } + const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop; + uint64_t batchBOffset = tiling_.GetBLayoutInfoD() * msg->body.batchB; + if constexpr (B_TYPE::layout != LayoutMode::SBNGD) { + batchBOffset = batchBOffset * tiling_.GetBLayoutInfoS(); + } + const uint64_t batchCOffset = tiling_.GetCLayoutInfoS2(); + const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB; + bool layoutGCondition = tiling_.GetCLayoutInfoG() == 1 && + (tiling_.GetBLayoutInfoG() != 1 || tiling_.GetALayoutInfoG() != 1); + int32_t layoutG = tiling_.GetBLayoutInfoG() > tiling_.GetALayoutInfoG() ? 
tiling_.GetBLayoutInfoG() : tiling_.GetALayoutInfoG(); + int32_t batchOffsetBias = tiling_.GetCLayoutInfoS2() * batchC; + if (layoutGCondition) { + batchOffsetBias = batchOffsetBias / layoutG; + } + int32_t batchOffsetC = batchOffsetBias * sizeof(typename C_TYPE::T); + if constexpr (C_TYPE::layout != LayoutMode::SBNGD) { + batchOffsetC = batchOffsetC * tiling_.GetCLayoutInfoS1(); + } + uint64_t offset = 0; + uint32_t cntIterator = 0; + for (uint32_t loopIdx = 0U; loopIdx < msg->body.batchLoop; loopIdx++) { + const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T); + const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T); + const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T); + IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + cntIterator++; + if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + offset += batchOffsetC; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (msg->body.sync || msg->body.waitIterateBatch) { + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK) { + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) { + if (msg->body.iterateFakeMsg) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { // fake msg + uint16_t eventID = static_cast(this->devEvtID * 2 + kfcCommSrv->subBlockID); + NotifyEvent(eventID); + return true; + } + } + } else { + ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); + } + SyncCubeWithVec(); + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) { + size = ToMatmulConfig(MM_CFG).singleCoreMN; + } else { + size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + uint64_t offset = 0; + uint64_t offsetSize = 0; + auto enSequentialWrite = msg->body.enSequentialWrite; + auto enAtomic = msg->body.enAtomic; + auto sync = msg->body.sync; + auto enPartialSum = msg->body.enPartialSum; + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) { + 
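// ITERATE_MODE_NORMAL is compiled out in this branch, so the caller must supply a valid output address; with enSequentialWrite each iteration then advances the write offset by one baseM * baseN block. +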
ASSERT(msg->body.cAddr != 0); // The output address must be configured. + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + offsetSize = enSequentialWrite ? ToMatmulConfig(MM_CFG).baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_.GetBaseM() * tiling_.GetBaseN()) : 0; + } + } else { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + ASSERT(msg->body.cAddr != 0); // The output address must be configured. + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + offsetSize = enSequentialWrite ? ToMatmulConfig(MM_CFG).baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_.GetBaseM() * tiling_.GetBaseN()) : 0; + } + } else if (sync == 0) { + // For asynchronous Iterate, the offset must be used for address calculation and + // the size is baseM x baseN. + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + offsetSize = ToMatmulConfig(MM_CFG).baseMN; + } else { + offsetSize = tiling_.GetBaseM() * tiling_.GetBaseN(); + } + enSequentialWrite = 1; + } + } + uint32_t cntIterator = 0; + TRACE_START(TraceId::MatMul_CALC); + // Asynchronous and configure the workspace + while (mul.Iterate(enPartialSum)) { + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (unlikely(cntIterator == 0)) { + if (unlikely(funID == KFC_Enum::MMFUN_ITERATE && sync == 1)) { + TRACE_STOP(TraceId::MatMul_CALC); + return false; // The queue is not switched, and no message needs to be returned. + } + } + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + mul.GetTensorC(scmLocal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } else { + mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } + cntIterator++; + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (cntIterator < INC_PROCESS_CHECK) { + if (funID == KFC_Enum::MMFUN_ITERATE) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + } + } + offset += offsetSize; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (sync || msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK && funID == KFC_Enum::MMFUN_ITERATE) { + NotifyEvent(eventID); + } + mul.End(); + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IterateIntraBlockPartSum(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + if (mul.GetSubBlockIdx() == 0) { + return true; + } + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) { + size = ToMatmulConfig(MM_CFG).singleCoreMN; + } else { + size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + 
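// Sub-block 0 already returned above, so only sub-block 1 issues the IterateAll; when the caller waits, both vector sub-blocks are then notified via eventID0/eventID1. +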
mul.IterateAll(cGlobal, msg->body.enAtomic, msg->body.enSequentialWrite, + msg->body.waitIterateAll, msg->body.iterateFakeMsg); + + uint16_t eventID0 = static_cast(this->devEvtID * 2 + 0); + uint16_t eventID1 = static_cast(this->devEvtID * 2 + 1); + if (msg->body.sync || msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID0); + NotifyEvent(eventID1); + } + if (!msg->body.iterateFakeMsg) { + mul.End(); + } + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IsSharedObj() + { + if constexpr (ToMatmulConfig(MM_CFG).enableInit) { + return false; + } else { + return true; + } + } + + __aicore__ inline bool SkipMsg(KFC_Enum funID, bool& freeMsg, + int &lastMsgId, const int subBlockID) + { + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + return false; + } + if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || ToMatmulConfig(MM_CFG).intraBlockPartSum) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + if (lastMsgId == subBlockID) { + freeMsg = false; + return true; + } + lastMsgId = subBlockID; + return false; + } + return false; + } else { + return false; + } + } + + __aicore__ inline bool LockMsgQueue(KFC_Enum funID, bool& freeMsg, + int &lastMsgId, const int subBlockID) + { + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + return true; + } + return false; + } + + __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) || + ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) { + if ((static_cast(funID) & static_cast(KFC_Enum::MMFUN_MASK)) == + static_cast(KFC_Enum::MMFUN_MASK)) { + if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { + return IterateIntraBlockPartSum(msg, funID); + } else { + return Iterate(msg, funID); + } + } + } + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) && + (A_TYPE::layout != LayoutMode::NONE)) { + if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) { + return IterateBatch(msg); + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableEnd) { + if (funID == KFC_Enum::MMFUN_END) { + mul.End(); + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableGetTensorC) { + if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) { + return GetTensorC(msg); + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) { + if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) { + SetOrgShape(msg); + return true; + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableInit) { + if (funID == KFC_Enum::MMFUN_INIT) { + Init(msg); + return true; + } + } + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0) && + (A_TYPE::layout != LayoutMode::NONE)) { + if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) { + return IterateNBatch(msg); + } + } + if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { + SetUserDefInfo(msg); + return true; + } + if (funID == KFC_Enum::MMFUN_SET_HF32) { + SetHF32(msg); + return true; + } + ASSERT("illegal function ID."); + return true; + } + + template __aicore__ LocalTensor GetTscmTensor(uint64_t addr, const uint64_t size) + { + LocalTensor scmLocal; + TBuffAddr scmTbuf; + scmTbuf.logicPos = (uint8_t)(TPosition::TSCM); + scmTbuf.dataLen = size * sizeof(DstT); + scmTbuf.bufferAddr = addr; +#if ASCENDC_CPU_DEBUG + scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr; +#endif + scmLocal.SetAddr(scmTbuf); + return scmLocal; + } + +private: + MatmulImpl mul; + GM_ADDR 
workspace; + KfcCommServer* kfcCommSrv; + TPipe* tpipe; + MatmulTiling tiling_; + TCubeTiling tmpTiling_; // for compatible with init interface + typename IBShareCache()>::ShareCache gCache; + typename ShareMatmulAux::MSG msgAux; + uint16_t instID; + uint16_t devEvtID; +}; + +template +__aicore__ inline constexpr bool IsSharedMatmul() +{ + return !matmul::ToMatmulConfig(MM_CFG).enableInit; +} +template , + MATMUL_POLICY_DEFAULT_OF(matmul::MatmulPolicy)> +struct MatmulInstBase { + __aicore__ inline MatmulInstBase(){}; +}; +template +struct MatmulInstShared : MatmulInstBase { + __aicore__ inline MatmulInstShared(){}; + matmul::MatmulService cubeObj[1]; +}; +template +struct MatmulInst : MatmulInstBase { + __aicore__ inline MatmulInst(){}; + matmul::MatmulService cubeObj[MIX_NUM]; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInstShared; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInst; +}; + +template , MATMUL_POLICY_DEFAULT_OF(matmul::MatmulPolicy)> +class MatmulServiceAux { + using SrcT = typename A_TYPE::T; + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + friend class KfcServer; + constexpr static bool aIbShare = A_TYPE::ibShare; + constexpr static bool bIbShare = B_TYPE::ibShare; +public: + __aicore__ inline MatmulServiceAux() {} + typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, + MATMUL_POLICY>::MATMUL cubeObj; + + // stub functions for MatmulImpl + __aicore__ inline void Init(TCubeTiling* cubeTiling, TPipe* tpipe = nullptr){}; + + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK){}; + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0){}; + __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK){}; + __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1){}; + + __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false){}; + + __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, + bool isTransposeA = false){}; + __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false){}; + + __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& rightMatrix, + bool isTransposeB = false){}; + __aicore__ inline void SetBias(const GlobalTensor& biasGlobal){}; + __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTransposeA = false){}; + __aicore__ inline void SetTensorB(const LocalTensor& rightMatrix, bool isTransposeB = false){}; + __aicore__ inline void SetBias(const LocalTensor& inputBias){}; + __aicore__ inline void SetTensorA(SrcAT aScalar){}; + __aicore__ inline void SetTensorB(SrcBT bScalar){}; + __aicore__ inline void DisableBias(){}; + __aicore__ inline void ClearBias(){}; + __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} + __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} + __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} + __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} + template __aicore__ inline void SetWorkspace(__gm__ T* addr, int size) {}; + template __aicore__ inline void SetWorkspace(GlobalTensor& addr){}; + __aicore__ inline void End(){}; + 
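// The remaining members are likewise empty stubs that mirror the MatmulImpl interface; the actual computation is carried out by the MatmulService instances held in cubeObj. +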
__aicore__ inline void SetHF32(bool enableHF32 = false, int32_t transMode = 0){}; + + template __aicore__ inline bool Iterate(bool enPartialSum = false) + { + return false; + }; + template + __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false){}; + template + __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0){}; + __aicore__ inline void WaitIterateAll() {}; + template + __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + template + __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false){}; + template + __aicore__ inline void GetTensorC(const GlobalTensor &gm, const LocalTensor &cLocal, + uint8_t enAtomic = 0, bool enSequentialWrite = false) {}; + template + __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) + { + GlobalTensor global; + return global; + }; + template + __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline GlobalTensor GetBatchTensorC(uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false) {}; + template + __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) {}; + template + __aicore__ inline void GetBatchTensorC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + template + __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + __aicore__ inline void WaitIterateBatch() {}; + __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {}; + __aicore__ inline void AsyncGetTensorC(const LocalTensor& c){}; + __aicore__ inline void WaitGetTensorC(){}; + template + __aicore__ inline MatrixOffset GetOffsetC() + { + if constexpr (isTurnOnDebug) { + static_assert(!isTurnOnDebug, "unsupported!"); + } + } +}; +} // namespace matmul +>>>>>>> e30c307 (KFC常量化适配) #endif // __MATMUL_SERVER_H__ \ No newline at end of file diff --git a/impl/matmul/matmul_utils.h b/impl/matmul/matmul_utils.h index d53e35c9058d50f465f14b9aeeb51e91862dc7cf..5ad93984c162be4f6e07e487993de8b6b199daa2 100644 --- a/impl/matmul/matmul_utils.h +++ b/impl/matmul/matmul_utils.h @@ -458,19 +458,14 @@ template __aicore__ inline void InitKfcClient(T& matmulClient, U* tiling, TPipe* tpipe, KfcCommClient* client, int instIdx, GM_ADDR workspace) { - ASSERT(tpipe != nullptr && "tpipe cannot be 
nullptr when InitKFC"); ASSERT(client != nullptr && "client cannot be nullptr when InitKFC"); ASSERT(workspace != nullptr && "workspace cannot be nullptr when InitKFC"); ASSERT(instIdx >= 0); matmulClient.client = client; matmulClient.instIdx = instIdx; - matmulClient.tpipe = tpipe; + matmulClient.cubeTiling.SetTiling((TCubeTiling*)tiling); matmulClient.mmCntAddr_ = reinterpret_cast<__gm__ KfcMsg*>(GetMatmulIncAddr(workspace, GetBlockIdxImpl(), instIdx)); - if (tiling) { - matmulClient.InitStatic((const TCubeTiling*)tiling); - } else { - matmulClient.cubeTiling.SetTiling(nullptr); - } + matmulClient.InitStatic(); matmulClient.devEvtID = instIdx * 2 + GetSubBlockIdxImpl(); } #endif diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h index 69b139c7ae3226b6bb620217f11f697c889520ae..8665bf049ae7bc32cf5891358194bc5eabcce209 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -93,7 +93,9 @@ public: template __aicore__ inline void SetWorkspace(__gm__ const T* addr, int size) { ASSERT(addr != nullptr); - ASSERT(!this->cubeTiling.IsNull()); + if constexpr (ToMatmulConfig(MM_CFG).singleCoreM == 0) { + ASSERT(!this->cubeTiling.IsNull()); + } uint64_t offset = mnIter_ * cubeTiling.GetBaseN() * cubeTiling.GetBaseM() * sizeof(DstT); cacheWorkspaceAddr = reinterpret_cast(const_cast<__gm__ T*>(addr)); @@ -808,13 +810,12 @@ private: GM_ADDR workspace); private: - __aicore__ inline void InitStatic(const TCubeTiling* cubeTiling) + __aicore__ inline void InitStatic() { + if (ToMatmulConfig(MM_CFG).singleCoreM == 0 && this->cubeTiling.IsNull()) { + return; + } ASSERT(sizeof(KfcMsg) % CACHE_LINE_SIZE == 0); - ASSERT(cubeTiling != nullptr && "cubeTiling cannot be nullptr when init matmul client"); - ASSERT(sizeof(TCubeTiling) % sizeof(uint64_t) == 0); - - this->cubeTiling.SetTiling(cubeTiling); *((uint64_t*)&kfcMsg_) = 0; *((uint64_t*)&(kfcMsg_.body)) = 0;