diff --git a/impl/matmul/modules/matmul_param.h b/impl/matmul/modules/matmul_param.h
index d114f1b89777f7b4731cb63bec60e762f7879647..8eaf80b32269533288e3301e2cb45a8680c18dc4 100644
--- a/impl/matmul/modules/matmul_param.h
+++ b/impl/matmul/modules/matmul_param.h
@@ -277,6 +277,10 @@ struct MatmulParamsMDL : public MatmulParamsBase
+#if __CCE_AICORE__ >= 220
+    int sMadMStep_ = 0;
+    int sMadNStep_ = 0;
+#endif
     uint64_t dataPtr_;
     uint64_t tilingPtr_;
     GlobalTensor quantTensor_;
diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h
index b7fac37bb906516a99636518e14adc739e7244ad..622c10ac71dc78a82179084cbbfe15af0da42292 100644
--- a/lib/matmul/matmul.h
+++ b/lib/matmul/matmul.h
@@ -98,16 +98,6 @@ public:
         bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
         const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0);
-    template
-    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
-        uint8_t enAtomic = 0, bool enSequentialWrite = false);
-
     template
     __aicore__ inline void GetTensorC(const LocalTensor& co2Local, uint8_t enAtomic = 0,
         bool enSequentialWrite = false);
@@ -263,6 +253,15 @@ private:
         const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0);
     __aicore__ inline void ComputeNormL0DB(bool enPartialSum);
     __aicore__ inline void ComputeMDLL0DB(bool enPartialSum);
+    template
+    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
+        uint8_t enAtomic = 0, bool enSequentialWrite = false);
     __aicore__ inline void CalcBatchNum(const int32_t batchNumA, const int32_t batchNumB);
     // 1, Implement CacheA. The number of caches is depthA1.
     __aicore__ inline LocalTensor LoadACache2L1(int row, int col, int useM, int useK, int proc);
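
For context, the diff follows a common C++ refactoring pattern: the `GetTensorC` entry points stay in the public section of the class, the `GetTensorCImpl` helper overloads move into the private section, and the new `sMadMStep_`/`sMadNStep_` members are compiled in only for newer core versions. The sketch below is a minimal standalone illustration of that pattern, not the AscendC API; the class name, the `TARGET_CORE_VERSION` macro, and the simplified signatures are assumptions for illustration only.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the real matmul class; names and signatures are
// simplified and do not match the AscendC library.
template <typename DstT>
class MatmulSketch {
public:
    // Public entry point: callers keep using GetTensorC unchanged.
    void GetTensorC(std::vector<DstT>& co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false)
    {
        // Forward to the private implementation overload.
        GetTensorCImpl(co2Local, enAtomic, enSequentialWrite);
    }

private:
    // Implementation detail: no longer part of the public interface, so its
    // signature can change without breaking callers.
    void GetTensorCImpl(std::vector<DstT>& co2Local, uint8_t enAtomic, bool enSequentialWrite)
    {
        (void)co2Local;
        (void)enAtomic;
        (void)enSequentialWrite;
    }

// Hypothetical version macro standing in for the core-version check in the diff:
// the extra step members exist only when building for newer targets.
#if defined(TARGET_CORE_VERSION) && TARGET_CORE_VERSION >= 220
    int sMadMStep_ = 0;
    int sMadNStep_ = 0;
#endif
};
```

Keeping the overload set private means later changes to the implementation helpers (for example, adding parameters) do not touch the public header surface that user kernels compile against.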