From 973d6e800cf1776d4afdcd4e15760f3937249a59 Mon Sep 17 00:00:00 2001
From: kangshuai
Date: Sat, 10 Aug 2024 14:48:27 +0800
Subject: [PATCH] fix GetTensorCImpl bug

---
 impl/matmul/modules/matmul_param.h |  4 ++++
 lib/matmul/matmul.h                | 19 +++++++++----------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/impl/matmul/modules/matmul_param.h b/impl/matmul/modules/matmul_param.h
index d114f1b8..8eaf80b3 100644
--- a/impl/matmul/modules/matmul_param.h
+++ b/impl/matmul/modules/matmul_param.h
@@ -277,6 +277,10 @@ struct MatmulParamsMDL : public MatmulParamsBase
+#if __CCE_AICORE__ >= 220
+    int sMadMStep_ = 0;
+    int sMadNStep_ = 0;
+#endif
     uint64_t dataPtr_;
     uint64_t tilingPtr_;
     GlobalTensor quantTensor_;
diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h
index b7fac37b..622c10ac 100644
--- a/lib/matmul/matmul.h
+++ b/lib/matmul/matmul.h
@@ -98,16 +98,6 @@ public:
     bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
         const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0);
-    template
-    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
-        uint8_t enAtomic = 0, bool enSequentialWrite = false);
-
     template
     __aicore__ inline void GetTensorC(const LocalTensor& co2Local, uint8_t enAtomic = 0,
         bool enSequentialWrite = false);
@@ -263,6 +253,15 @@ private:
     const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0);
     __aicore__ inline void ComputeNormL0DB(bool enPartialSum);
     __aicore__ inline void ComputeMDLL0DB(bool enPartialSum);
+    template
+    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
+        uint8_t enAtomic = 0, bool enSequentialWrite = false);
     __aicore__ inline void CalcBatchNum(const int32_t batchNumA, const int32_t batchNumB);
     // 1, Implement CacheA. The number of caches is depthA1.
     __aicore__ inline LocalTensor LoadACache2L1(int row, int col, int useM, int useK, int proc);
-- 
Gitee