diff --git a/impl/matmul/modules/matmul_param.h b/impl/matmul/modules/matmul_param.h
index d114f1b89777f7b4731cb63bec60e762f7879647..8eaf80b32269533288e3301e2cb45a8680c18dc4 100644
--- a/impl/matmul/modules/matmul_param.h
+++ b/impl/matmul/modules/matmul_param.h
@@ -277,6 +277,10 @@ struct MatmulParamsMDL : public MatmulParamsBase
+#if __CCE_AICORE__ >= 220
+    int sMadMStep_ = 0;
+    int sMadNStep_ = 0;
+#endif
     uint64_t dataPtr_;
     uint64_t tilingPtr_;
     GlobalTensor quantTensor_;
diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h
index b7fac37bb906516a99636518e14adc739e7244ad..622c10ac71dc78a82179084cbbfe15af0da42292 100644
--- a/lib/matmul/matmul.h
+++ b/lib/matmul/matmul.h
@@ -98,16 +98,6 @@ public:
         bool enPartialSum, uint8_t enAtomic, bool enSequentialWrite, const uint32_t matrixStrideA = 0,
         const uint32_t matrixStrideB = 0, const uint32_t matrixStrideC = 0);
-    template
-    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
-        bool enSequentialWrite = false);
-    template
-    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
-        uint8_t enAtomic = 0, bool enSequentialWrite = false);
-
     template
     __aicore__ inline void GetTensorC(const LocalTensor& co2Local, uint8_t enAtomic = 0,
         bool enSequentialWrite = false);
@@ -263,6 +253,15 @@ private:
         const uint32_t matrixStrideB = 0, const int32_t batchOuterIdx = 0);
     __aicore__ inline void ComputeNormL0DB(bool enPartialSum);
     __aicore__ inline void ComputeMDLL0DB(bool enPartialSum);
+    template
+    __aicore__ inline void GetTensorCImpl(const LocalTensor& co2Local, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor& gm, uint8_t enAtomic = 0,
+        bool enSequentialWrite = false);
+    template
+    __aicore__ inline void GetTensorCImpl(const GlobalTensor &gm, const LocalTensor &co2Local,
+        uint8_t enAtomic = 0, bool enSequentialWrite = false);
     __aicore__ inline void CalcBatchNum(const int32_t batchNumA, const int32_t batchNumB);
     // 1, Implement CacheA. The number of caches is depthA1.
     __aicore__ inline LocalTensor LoadACache2L1(int row, int col, int useM, int useK, int proc);
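
For context, the diff follows a common C++ refactoring pattern: the `GetTensorC` entry points stay in the public section of the class, the `GetTensorCImpl` helper overloads move into the private section, and the new `sMadMStep_`/`sMadNStep_` members are compiled in only for newer core versions. The sketch below is a minimal standalone illustration of that pattern, not the AscendC API; the class name, the `TARGET_CORE_VERSION` macro, and the simplified signatures are assumptions for illustration only.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the real matmul class; names and signatures are
// simplified and do not match the AscendC library.
template <typename DstT>
class MatmulSketch {
public:
    // Public entry point: callers keep using GetTensorC unchanged.
    void GetTensorC(std::vector<DstT>& co2Local, uint8_t enAtomic = 0, bool enSequentialWrite = false)
    {
        // Forward to the private implementation overload.
        GetTensorCImpl(co2Local, enAtomic, enSequentialWrite);
    }

private:
    // Implementation detail: no longer part of the public interface, so its
    // signature can change without breaking callers.
    void GetTensorCImpl(std::vector<DstT>& co2Local, uint8_t enAtomic, bool enSequentialWrite)
    {
        (void)co2Local;
        (void)enAtomic;
        (void)enSequentialWrite;
    }

// Hypothetical version macro standing in for the core-version check in the diff:
// the extra step members exist only when building for newer targets.
#if defined(TARGET_CORE_VERSION) && TARGET_CORE_VERSION >= 220
    int sMadMStep_ = 0;
    int sMadNStep_ = 0;
#endif
};
```

Keeping the overload set private means later changes to the implementation helpers (for example, adding parameters) do not touch the public header surface that user kernels compile against.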