From 973d6e800cf1776d4afdcd4e15760f3937249a59 Mon Sep 17 00:00:00 2001
From: kangshuai
Date: Sat, 10 Aug 2024 14:48:27 +0800
Subject: [PATCH] fix GetTensorCImpl bug

---
 impl/matmul/modules/matmul_param.h |  4 ++++
 lib/matmul/matmul.h                | 19 +++++++++----------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/impl/matmul/modules/matmul_param.h b/impl/matmul/modules/matmul_param.h
index d114f1b8..8eaf80b3 100644
--- a/impl/matmul/modules/matmul_param.h
+++ b/impl/matmul/modules/matmul_param.h
@@ -277,6 +277,10 @@ struct MatmulParamsMDL : public MatmulParamsBase
+#if __CCE_AICORE__ >= 220
+    int sMadMStep_ = 0;
+    int sMadNStep_ = 0;
+#endif
     uint64_t dataPtr_;
     uint64_t tilingPtr_;
     GlobalTensor quantTensor_;
diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h
index b7fac37b..622c10ac 100644
--- a/lib/matmul/matmul.h
+++ b/lib/matmul/matmul.h
@@ -98,16 +98,6 @@ public:
     bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
         const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0);
-    template
-    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
-        uint8_t enAtomic = 0, bool enSequentialWrite = false);
-
     template
     __aicore__ inline void GetTensorC(const LocalTensor& co2Local, uint8_t enAtomic = 0,
         bool enSequentialWrite = false);
@@ -263,6 +253,15 @@ private:
     const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0);
     __aicore__ inline void ComputeNormL0DB(bool enPartialSum);
     __aicore__ inline void ComputeMDLL0DB(bool enPartialSum);
+    template
+    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
+        uint8_t enAtomic = 0, bool enSequentialWrite = false);
     __aicore__ inline void CalcBatchNum(const int32_t batchNumA, const int32_t batchNumB);
     // 1, Implement CacheA. The number of caches is depthA1.
     __aicore__ inline LocalTensor LoadACache2L1(int row, int col, int useM, int useK, int proc);
-- 
Gitee