From ef57262c0ddecd31c9b80db1cc5ccc8476650b31 Mon Sep 17 00:00:00 2001
From: wang-xiangX <wangxiang184@hisilicon.com>
Date: Wed, 24 Jul 2024 15:46:43 +0800
Subject: [PATCH] Pass scalar parameters by value instead of by const reference

---
 .../activation/softmax/logsoftmax_base_impl.h | 56 +++++++++----------
 impl/activation/softmax/softmax_common.h      | 10 ++--
 .../softmax/v200/simple_softmax_impl.h        |  6 +-
 .../softmax/v200/softmax_common_impl.h        |  6 +-
 .../softmax/v200/softmax_flashv2_impl.h       | 32 +++++------
 .../softmax/v200/softmax_grad_impl.h          | 10 ++--
 impl/activation/softmax/v200/softmax_impl.h   | 26 ++++-----
 .../softmax/v220/simple_softmax_impl.h        |  6 +-
 .../softmax/v220/softmax_common_impl.h        |  4 +-
 .../softmax/v220/softmax_flashv2_impl.h       | 32 +++++------
 .../softmax/v220/softmax_grad_impl.h          | 10 ++--
 impl/activation/softmax/v220/softmax_impl.h   | 26 ++++-----
 .../softmax/v300/softmax_flashv2_impl.h       |  6 +-
 impl/activation/softmax/v300/softmax_impl.h   |  8 +--
 impl/activation/swiglu/swiglu_common_impl.h   | 14 ++---
 .../quant/ascend_quant_common_impl.h          |  2 +-
 lib/activation/swiglu.h                       |  8 +--
 17 files changed, 131 insertions(+), 131 deletions(-)
diff --git a/impl/activation/softmax/logsoftmax_base_impl.h b/impl/activation/softmax/logsoftmax_base_impl.h
index 7d436140..9a801913 100644
--- a/impl/activation/softmax/logsoftmax_base_impl.h
+++ b/impl/activation/softmax/logsoftmax_base_impl.h
@@ -57,8 +57,8 @@ constexpr float SCALAR_NATURE_LOG_10 = 0.4342944819;
 
 __aicore__ inline void LogSoftMaxGenericNZImpl(LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     const UnaryRepeatParams unaryParams;
     const uint64_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT;
@@ -72,8 +72,8 @@ __aicore__ inline void LogSoftMaxGenericNZImpl(LocalTensor<float>& dst, const Lo
 }
 
 __aicore__ inline void LogSoftMaxGenericNZReduceMaxImpl(const LocalTensor<float>& tmpBuffer0,
-    const LocalTensor<float>& tmpBuffer1, const LocalTensor<half>& maxTensor, const uint32_t& offset2,
-    const uint32_t& splitCount, uint64_t mask[2], const ReduceLastND& reduceParam)
+    const LocalTensor<float>& tmpBuffer1, const LocalTensor<half>& maxTensor, const uint32_t offset2,
+    const uint32_t splitCount, uint64_t mask[2], const ReduceLastND& reduceParam)
 {
     ReduceMaxLastNZImpl(tmpBuffer1, tmpBuffer0, mask, reduceParam);
     PipeBarrier<PIPE_V>();
@@ -84,10 +84,10 @@ __aicore__ inline void LogSoftMaxGenericNZReduceMaxImpl(const LocalTensor<float>
         { 1, 1, HALF_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE });
 }
 
-__aicore__ inline void LogSoftMaxGenericNZSubImpl(const uint32_t& splitNZBlockCount,
-    const LocalTensor<float>& tmpBuffer0, const LocalTensor<float>& tmpBuffer1, const uint32_t& splitOffset,
-    const uint32_t& lastSplitNZBlockOffset, uint64_t mask[2],
-    const uint32_t& lastBlockMaskLen, const uint32_t& splitCount)
+__aicore__ inline void LogSoftMaxGenericNZSubImpl(const uint32_t splitNZBlockCount,
+    const LocalTensor<float>& tmpBuffer0, const LocalTensor<float>& tmpBuffer1, const uint32_t splitOffset,
+    const uint32_t lastSplitNZBlockOffset, uint64_t mask[2],
+    const uint32_t lastBlockMaskLen, const uint32_t splitCount)
 {
     for (uint32_t j = 0; j < splitNZBlockCount - 1; j++) {
         Sub<float, false>(tmpBuffer0[splitOffset * j], tmpBuffer0[splitOffset * j], tmpBuffer1, MASK_PLACEHOLDER, 1,
@@ -102,8 +102,8 @@ __aicore__ inline void LogSoftMaxGenericNZSubImpl(const uint32_t& splitNZBlockCo
 }
 
 __aicore__ inline void LogSoftMaxGenericNZReduceSumImpl(const LocalTensor<float>& tmpBuffer0,
-    const LocalTensor<float>& tmpBuffer1, const LocalTensor<half>& sumTensor, const uint32_t& offset2,
-    const uint32_t& splitCount, uint64_t mask[2], const ReduceLastND& reduceParam)
+    const LocalTensor<float>& tmpBuffer1, const LocalTensor<half>& sumTensor, const uint32_t offset2,
+    const uint32_t splitCount, uint64_t mask[2], const ReduceLastND& reduceParam)
 {
     ReduceSumLastNZImpl(tmpBuffer1, tmpBuffer0, mask, reduceParam);
     PipeBarrier<PIPE_V>();
@@ -115,10 +115,10 @@ __aicore__ inline void LogSoftMaxGenericNZReduceSumImpl(const LocalTensor<float>
     PipeBarrier<PIPE_V>();
 }
 
-__aicore__ inline void LogSoftMaxGenericNZDivImpl(const uint32_t& splitNZBlockCount,
-    const LocalTensor<float>& tmpBuffer0, const LocalTensor<float>& tmpBuffer1, const uint32_t& splitOffset,
-    const uint32_t& lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t& lastBlockMaskLen,
-    const uint32_t& splitCount)
+__aicore__ inline void LogSoftMaxGenericNZDivImpl(const uint32_t splitNZBlockCount,
+    const LocalTensor<float>& tmpBuffer0, const LocalTensor<float>& tmpBuffer1, const uint32_t splitOffset,
+    const uint32_t lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t lastBlockMaskLen,
+    const uint32_t splitCount)
 {
     for (uint32_t j = 0; j < splitNZBlockCount - 1; j++) {
         Div<float, false>(tmpBuffer0[splitOffset * j], tmpBuffer0[splitOffset * j], tmpBuffer1, MASK_PLACEHOLDER, 1,
@@ -130,9 +130,9 @@ __aicore__ inline void LogSoftMaxGenericNZDivImpl(const uint32_t& splitNZBlockCo
         mask, lastBlockMaskLen, splitCount, Div<float>);
 }
 
-__aicore__ inline void LogSoftMaxGenericNZLogImpl(const uint32_t& splitNZBlockCount,
-    const LocalTensor<float>& tmpBuffer0, const LocalTensor<half>& dst, const uint32_t& splitOffset,
-    const uint32_t& splitCount, const SoftMaxTiling& tiling, const uint32_t& offset1)
+__aicore__ inline void LogSoftMaxGenericNZLogImpl(const uint32_t splitNZBlockCount,
+    const LocalTensor<float>& tmpBuffer0, const LocalTensor<half>& dst, const uint32_t splitOffset,
+    const uint32_t splitCount, const SoftMaxTiling& tiling, const uint32_t offset1)
 {
     SetMaskCount();
     SetVectorMask<float, MaskMode::COUNTER>(0, splitCount);
@@ -154,10 +154,10 @@ __aicore__ inline void LogSoftMaxGenericNZLogImpl(const uint32_t& splitNZBlockCo
     ResetMask();
 }
 
-__aicore__ inline void LogSoftMaxGenericNZExpImpl(const uint32_t& splitNZBlockCount,
-    const LocalTensor<float>& tmpBuffer0, const LocalTensor<float>& tmpBuffer1, const uint32_t& splitOffset,
-    const uint32_t& lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t& lastBlockMaskLen,
-    const uint32_t& splitCount)
+__aicore__ inline void LogSoftMaxGenericNZExpImpl(const uint32_t splitNZBlockCount,
+    const LocalTensor<float>& tmpBuffer0, const LocalTensor<float>& tmpBuffer1, const uint32_t splitOffset,
+    const uint32_t lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t lastBlockMaskLen,
+    const uint32_t splitCount)
 {
     SetMaskCount();
     SetVectorMask<float, MaskMode::COUNTER>(0, splitCount);
@@ -176,8 +176,8 @@ __aicore__ inline void LogSoftMaxGenericNZExpImpl(const uint32_t& splitNZBlockCo
 
 __aicore__ inline void LogSoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -287,8 +287,8 @@ __aicore__ inline void GenericLogNDImpl(const LocalTensor<float>& dst, const Loc
 
 __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const LogSoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const LogSoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal; // need splitM * 64
     const UnaryRepeatParams unaryParams;
@@ -314,8 +314,8 @@ __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor<float>& dst, co
 
 __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const LogSoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const LogSoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitSize];
@@ -351,7 +351,7 @@ __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor<half>& dst, con
 
 __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const LogSoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
+    const LogSoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
     const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
diff --git a/impl/activation/softmax/softmax_common.h b/impl/activation/softmax/softmax_common.h
index dd7ea1a7..228fea59 100644
--- a/impl/activation/softmax/softmax_common.h
+++ b/impl/activation/softmax/softmax_common.h
@@ -362,7 +362,7 @@ __aicore__ inline void AlignedBrcbImpl(const LocalTensor<T>& dstLocal, const Loc
 }
 
 __aicore__ inline void ContinusColumnBrcbImpl(const LocalTensor<float>& dstLocal, const LocalTensor<float>& srcLocal,
-    const uint32_t& repeat, const uint32_t& brcbCount)
+    const uint32_t repeat, const uint32_t brcbCount)
 {
     float scalarList[SCALAR_STACK_DEPTH] = {0};
     SetVectorMask<float>(brcbCount);
@@ -393,7 +393,7 @@ __aicore__ inline void ContinusColumnBrcbImpl(const LocalTensor<float>& dstLocal
 }
 
 __aicore__ inline void AlignedColumnBrcbImpl(const LocalTensor<float>& dstLocal, const LocalTensor<float>& srcLocal,
-    const uint32_t& repeat, const uint32_t& brcbCount)
+    const uint32_t repeat, const uint32_t brcbCount)
 {
     float scalarList[SCALAR_STACK_DEPTH] = {0};
     SetVectorMask<float>(brcbCount);
@@ -474,7 +474,7 @@ __aicore__ inline void BroadCastLastCompute(const LocalTensor<T>& dst, const Loc
     }
 }
 
-__aicore__ inline void CreateSpecialFormatMask(uint64_t& lowMask, const uint32_t& maskLen, const uint32_t& nzBlockCount)
+__aicore__ inline void CreateSpecialFormatMask(uint64_t& lowMask, const uint32_t maskLen, const uint32_t nzBlockCount)
 {
     // create mask in "01111111 11111111 01111111 11111111" format
     // maskLen is 1-15
@@ -494,7 +494,7 @@ __aicore__ inline void CreateSpecialFormatMask(uint64_t& lowMask, const uint32_t
 }
 
 __aicore__ inline void BinaryComputeWithSpecialMask(const LocalTensor<float>& dst, const LocalTensor<float>& src0,
-    const LocalTensor<float>& src1, uint64_t mask[2], const uint32_t& lastBlockMaskLen, const uint32_t& splitCount,
+    const LocalTensor<float>& src1, uint64_t mask[2], const uint32_t lastBlockMaskLen, const uint32_t splitCount,
     void (*func)(const LocalTensor<float>&, const LocalTensor<float>&, const LocalTensor<float>&, uint64_t*,
     const uint8_t, const BinaryRepeatParams&))
 {
@@ -526,7 +526,7 @@ __aicore__ inline void BinaryComputeWithSpecialMask(const LocalTensor<float>& ds
 }
 
 __aicore__ inline void UnaryComputeWithSpecialMask(const LocalTensor<float>& dst, const LocalTensor<float>& src,
-    uint64_t mask[2], const uint32_t& lastBlockMaskLen, const uint32_t& splitCount,
+    uint64_t mask[2], const uint32_t lastBlockMaskLen, const uint32_t splitCount,
     void (*func)(const LocalTensor<float>&, const LocalTensor<float>&, uint64_t*, const uint8_t,
     const UnaryRepeatParams&))
 {
diff --git a/impl/activation/softmax/v200/simple_softmax_impl.h b/impl/activation/softmax/v200/simple_softmax_impl.h
index 85ddba56..06b514c5 100644
--- a/impl/activation/softmax/v200/simple_softmax_impl.h
+++ b/impl/activation/softmax/v200/simple_softmax_impl.h
@@ -20,7 +20,7 @@
 namespace AscendC {
 __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount)
 {
     const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT;
     LocalTensor<float> tmpBuffer0 = workLocal;
@@ -60,7 +60,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<float>& dst,
 
 __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& inSumTensor,
     const LocalTensor<half>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -110,7 +110,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<half>& dst,
 
 __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
diff --git a/impl/activation/softmax/v200/softmax_common_impl.h b/impl/activation/softmax/v200/softmax_common_impl.h
index c28d129f..a4240166 100644
--- a/impl/activation/softmax/v200/softmax_common_impl.h
+++ b/impl/activation/softmax/v200/softmax_common_impl.h
@@ -190,7 +190,7 @@ __aicore__ inline void FirstBlockCopyImpl(const LocalTensor<float>& dst, const L
 }
 
 __aicore__ inline void BroadCastImpl(const LocalTensor<float>& dstLocal, const LocalTensor<float>& srcLocal,
-    const uint32_t& repeat, const uint32_t& brcbCount)
+    const uint32_t repeat, const uint32_t brcbCount)
 {
     float scalarList[SCALAR_STACK_DEPTH] = {0};
     SetVectorMask<float, MaskMode::NORMAL>(brcbCount);
@@ -389,7 +389,7 @@ __aicore__ inline void NewReduceSumLastNDImpl(const LocalTensor<float>& dst, con
 }
 
 __aicore__ inline void ReduceMaxSingleBlockNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& src,
-    const uint64_t& mask, const ReduceLastND& reduceParam)
+    const uint64_t mask, const ReduceLastND& reduceParam)
 {
     const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES;
     const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES;
@@ -470,7 +470,7 @@ __aicore__ inline void ReduceMaxLastNZImpl(const LocalTensor<float>& tmpBuffer1,
 }
 
 __aicore__ inline void ReduceSumSingleBlockNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& src,
-    const uint64_t& mask, const ReduceLastND& reduceParam)
+    const uint64_t mask, const ReduceLastND& reduceParam)
 {
     const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES;
     const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES;
diff --git a/impl/activation/softmax/v200/softmax_flashv2_impl.h b/impl/activation/softmax/v200/softmax_flashv2_impl.h
index 0006ef1b..465b6e21 100644
--- a/impl/activation/softmax/v200/softmax_flashv2_impl.h
+++ b/impl/activation/softmax/v200/softmax_flashv2_impl.h
@@ -21,8 +21,8 @@ namespace AscendC {
 __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& expMaxTensor,
     const LocalTensor<float>& inSumTensor, const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16
@@ -116,8 +116,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<float>& dst,
 __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<half>& inSumTensor, const LocalTensor<half>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -209,8 +209,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<half>& dst,
 __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<half>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<float>& inSumTensor, const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16
@@ -768,8 +768,8 @@ __aicore__ inline void SoftmaxFlashV2BasicBlockImpl(const LocalTensor<float>& ds
 __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<half>& dst, const LocalTensor<half>& expSumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<half>& inExpSumTensor, const LocalTensor<half>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.splitSize];
@@ -839,7 +839,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<float>& dst, c
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& expMaxTensor,
     const LocalTensor<float>& inExpSumTensor, const LocalTensor<float>& inMaxTensor,
     const LocalTensor<float>& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling,
-    const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize)
+    const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.reduceSize]; // need splitM * 64
@@ -1044,8 +1044,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateBasicBlock(const LocalTensor<float>
 
 __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<half>& dst, const LocalTensor<half>& expSumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& reduceBuffer = workLocal[tiling.splitSize];
@@ -1069,8 +1069,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<half>& dst,
 
 __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<float>& dst, const LocalTensor<float>& expSumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
 
@@ -1299,7 +1299,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<half>& dst, co
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<float>& inExpSumTensor, const LocalTensor<float>& inMaxTensor,
     const LocalTensor<float>& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling,
-    const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize)
+    const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.splitSize];
@@ -1339,8 +1339,8 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<half>& dst, co
 
 __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<half>& dst, const LocalTensor<float>& expSumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.splitSize];
diff --git a/impl/activation/softmax/v200/softmax_grad_impl.h b/impl/activation/softmax/v200/softmax_grad_impl.h
index 531ab4d7..b913c88d 100644
--- a/impl/activation/softmax/v200/softmax_grad_impl.h
+++ b/impl/activation/softmax/v200/softmax_grad_impl.h
@@ -20,7 +20,7 @@
 namespace AscendC {
 __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& gradTensor,
     const LocalTensor<half>& src, const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2],
-    const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -60,8 +60,8 @@ __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor<half>& ds
 
 __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor<float>& dst,
     const LocalTensor<float>& gradTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -268,7 +268,7 @@ __aicore__ inline void SoftmaxGradFrontNDImpl(const LocalTensor<T>& dstTensor, c
 
 __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& gradTensor,
     const LocalTensor<half>& src, const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2],
-    const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -328,7 +328,7 @@ __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor<half>& dst, co
 
 __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& gradTensor,
     const LocalTensor<float>& src, const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2],
-    const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
diff --git a/impl/activation/softmax/v200/softmax_impl.h b/impl/activation/softmax/v200/softmax_impl.h
index 2f76d21e..7e34f855 100644
--- a/impl/activation/softmax/v200/softmax_impl.h
+++ b/impl/activation/softmax/v200/softmax_impl.h
@@ -21,8 +21,8 @@ namespace AscendC {
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -110,8 +110,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<half>& dst, const
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT;
     const uint32_t splitNZBlockCount = tiling.srcK / SOFTMAX_SHAPE_NZ_BASIC_COUNT;
@@ -189,8 +189,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<float>& dst, const
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -524,8 +524,8 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor<float>& dst, const Lo
 
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal; // need splitM * 64
 
@@ -546,8 +546,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<float>& dst, const
 
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitSize];
@@ -707,7 +707,7 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor<half>& dst, const Loc
 
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
     const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
@@ -771,7 +771,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor<half>& dst, const LocalTe
 }
 
 __aicore__ inline void SingleSoftMaxImpl(const LocalTensor<half>& dst, const LocalTensor<half>& src,
-    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize,
+    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize,
     const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
@@ -797,7 +797,7 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor<half>& dst, const Loc
 }
 
 __aicore__ inline void SingleSoftMaxImpl(const LocalTensor<float>& dst, const LocalTensor<float>& src,
-    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize,
+    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize,
     const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
diff --git a/impl/activation/softmax/v220/simple_softmax_impl.h b/impl/activation/softmax/v220/simple_softmax_impl.h
index 5bbe7d10..60a70d31 100644
--- a/impl/activation/softmax/v220/simple_softmax_impl.h
+++ b/impl/activation/softmax/v220/simple_softmax_impl.h
@@ -20,7 +20,7 @@
 namespace AscendC {
 __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount)
 {
     const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT;
     LocalTensor<float> tmpBuffer0 = workLocal;
@@ -60,7 +60,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<float>& dst,
 
 __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& inSumTensor,
     const LocalTensor<half>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -110,7 +110,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<half>& dst,
 
 __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
diff --git a/impl/activation/softmax/v220/softmax_common_impl.h b/impl/activation/softmax/v220/softmax_common_impl.h
index 6dbc213c..67c64b17 100644
--- a/impl/activation/softmax/v220/softmax_common_impl.h
+++ b/impl/activation/softmax/v220/softmax_common_impl.h
@@ -415,7 +415,7 @@ __aicore__ inline void NewReduceSumLastNDImpl(const LocalTensor<float>& dst, con
 }
 
 __aicore__ inline void ReduceMaxSingleBlockNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& src,
-    const uint64_t& mask, const ReduceLastND& reduceParam)
+    const uint64_t mask, const ReduceLastND& reduceParam)
 {
     const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES;
     const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES;
@@ -496,7 +496,7 @@ __aicore__ inline void ReduceMaxLastNZImpl(const LocalTensor<float>& tmpBuffer1,
 }
 
 __aicore__ inline void ReduceSumSingleBlockNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& src,
-    const uint64_t& mask, const ReduceLastND& reduceParam)
+    const uint64_t mask, const ReduceLastND& reduceParam)
 {
     const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES;
     const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES;
diff --git a/impl/activation/softmax/v220/softmax_flashv2_impl.h b/impl/activation/softmax/v220/softmax_flashv2_impl.h
index 4ae32f6f..0c136543 100644
--- a/impl/activation/softmax/v220/softmax_flashv2_impl.h
+++ b/impl/activation/softmax/v220/softmax_flashv2_impl.h
@@ -21,8 +21,8 @@ namespace AscendC {
 __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& expMaxTensor,
     const LocalTensor<float>& inSumTensor, const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16
@@ -119,8 +119,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<float>& dst,
 __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<half>& inSumTensor, const LocalTensor<half>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -213,8 +213,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<half>& dst,
 __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor<half>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<float>& inSumTensor, const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16
@@ -763,8 +763,8 @@ __aicore__ inline void SoftmaxFlashV2BasicBlockImpl(const LocalTensor<float>& ds
 __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<half>& dst, const LocalTensor<half>& expSumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<half>& inExpSumTensor, const LocalTensor<half>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.splitSize];
@@ -834,7 +834,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<float>& dst, c
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& expMaxTensor,
     const LocalTensor<float>& inExpSumTensor, const LocalTensor<float>& inMaxTensor,
     const LocalTensor<float>& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling,
-    const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize)
+    const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.reduceSize]; // need splitM * 64
@@ -1018,8 +1018,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateBasicBlock(const LocalTensor<float>
 
 __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<half>& dst, const LocalTensor<half>& expSumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& reduceBuffer = workLocal[tiling.splitSize];
@@ -1043,8 +1043,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<half>& dst,
 
 __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<float>& dst, const LocalTensor<float>& expSumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
 
@@ -1248,7 +1248,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<half>& dst, co
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<float>& inExpSumTensor, const LocalTensor<float>& inMaxTensor,
     const LocalTensor<float>& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling,
-    const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize)
+    const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.splitSize];
@@ -1288,8 +1288,8 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<half>& dst, co
 
 __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor<half>& dst, const LocalTensor<float>& expSumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.splitSize];
diff --git a/impl/activation/softmax/v220/softmax_grad_impl.h b/impl/activation/softmax/v220/softmax_grad_impl.h
index ac12aa9d..55dc5739 100644
--- a/impl/activation/softmax/v220/softmax_grad_impl.h
+++ b/impl/activation/softmax/v220/softmax_grad_impl.h
@@ -20,7 +20,7 @@
 namespace AscendC {
 __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& gradTensor,
     const LocalTensor<half>& src, const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2],
-    const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -60,8 +60,8 @@ __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor<half>& ds
 
 __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor<float>& dst,
     const LocalTensor<float>& gradTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -257,7 +257,7 @@ __aicore__ inline void SoftmaxGradFrontNDImpl(const LocalTensor<T>& dstTensor, c
 
 __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& gradTensor,
     const LocalTensor<half>& src, const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2],
-    const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -317,7 +317,7 @@ __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor<half>& dst, co
 
 __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& gradTensor,
     const LocalTensor<float>& src, const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2],
-    const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
diff --git a/impl/activation/softmax/v220/softmax_impl.h b/impl/activation/softmax/v220/softmax_impl.h
index b3433528..55834331 100644
--- a/impl/activation/softmax/v220/softmax_impl.h
+++ b/impl/activation/softmax/v220/softmax_impl.h
@@ -21,8 +21,8 @@ namespace AscendC {
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -110,8 +110,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<half>& dst, const
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT;
     const uint32_t splitNZBlockCount = tiling.srcK / SOFTMAX_SHAPE_NZ_BASIC_COUNT;
@@ -189,8 +189,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<float>& dst, const
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor<half>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitCount, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitCount, const ReduceLastND& reduceParam)
 {
     LocalTensor<float> tmpBuffer0 = workLocal;
     LocalTensor<float> tmpBuffer1 = workLocal[tiling.splitSize];
@@ -504,8 +504,8 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor<float>& dst, const Lo
 
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal; // need splitM * 64
 
@@ -526,8 +526,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<float>& dst, const
 
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitSize];
@@ -670,7 +670,7 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor<half>& dst, const Loc
 
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
     const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
@@ -733,7 +733,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor<half>& dst, const LocalTe
 
 template <bool isReuseSource = false>
 __aicore__ inline void SingleSoftMaxImpl(const LocalTensor<half>& dst, const LocalTensor<half>& src,
-    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize,
+    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize,
     const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
@@ -760,7 +760,7 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor<half>& dst, const Loc
 
 template <bool isReuseSource = false>
 __aicore__ inline void SingleSoftMaxImpl(const LocalTensor<float>& dst, const LocalTensor<float>& src,
-    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize,
+    const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize,
     const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
diff --git a/impl/activation/softmax/v300/softmax_flashv2_impl.h b/impl/activation/softmax/v300/softmax_flashv2_impl.h
index 7c3e80b8..871e5536 100644
--- a/impl/activation/softmax/v300/softmax_flashv2_impl.h
+++ b/impl/activation/softmax/v300/softmax_flashv2_impl.h
@@ -60,8 +60,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdate(const LocalTensor<T1>& dst, const
 __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<half>& expMaxTensor,
     const LocalTensor<half>& inSumTensor, const LocalTensor<half>& inMaxTensor, const LocalTensor<float>& workLocal,
-    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2,
-    const uint32_t& splitSize, const uint32_t& reduceSize)
+    const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2,
+    const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;                // for src cast
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.splitSize]; // need splitM * 64
@@ -96,7 +96,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor<float>& dst, c
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& expMaxTensor,
     const LocalTensor<float>& inExpSumTensor, const LocalTensor<float>& inMaxTensor,
     const LocalTensor<float>& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling,
-    const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize)
+    const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const LocalTensor<float>& tmpBuffer1 = workLocal[tiling.reduceSize]; // tiling.splitM * FLOAT_REPEAT_SIZE
diff --git a/impl/activation/softmax/v300/softmax_impl.h b/impl/activation/softmax/v300/softmax_impl.h
index ea7699a7..82a55564 100644
--- a/impl/activation/softmax/v300/softmax_impl.h
+++ b/impl/activation/softmax/v300/softmax_impl.h
@@ -28,8 +28,8 @@ __aicore__ inline void SoftMaxNZImpl(const LocalTensor<T1>& dst, const LocalTens
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<half>& sumTensor,
     const LocalTensor<half>& maxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal[0];
     const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitSize];
@@ -55,8 +55,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<half>& dst, const
 template <bool isFlashV2 = false>
 __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor<float>& dst, const LocalTensor<float>& sumTensor,
     const LocalTensor<float>& maxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
-    const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize,
-    const uint32_t& reduceSize, const ReduceLastND& reduceParam)
+    const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize,
+    const uint32_t reduceSize, const ReduceLastND& reduceParam)
 {
     const LocalTensor<float>& tmpBuffer1 = workLocal[0]; // need splitM * 64
 
diff --git a/impl/activation/swiglu/swiglu_common_impl.h b/impl/activation/swiglu/swiglu_common_impl.h
index 67663654..d956eda9 100644
--- a/impl/activation/swiglu/swiglu_common_impl.h
+++ b/impl/activation/swiglu/swiglu_common_impl.h
@@ -29,7 +29,7 @@ constexpr uint32_t SWIGLU_STRIDE_DIGITS = 2;
 
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor0,
-    const LocalTensor<T> &srcTensor1, const float &scalarValue, const uint32_t calCount)
+    const LocalTensor<T> &srcTensor1, const float scalarValue, const uint32_t calCount)
 {
     // Only for AI Vector Core.
     if ASCEND_IS_AIC {
@@ -44,7 +44,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, const LocalTensor<T
 
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor0,
-    const LocalTensor<T> &srcTensor1, const float &scalarValue)
+    const LocalTensor<T> &srcTensor1, const float scalarValue)
 {
     // Only for AI Vector Core.
     if ASCEND_IS_AIC {
@@ -59,7 +59,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, const LocalTensor<T
 
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, LocalTensor<T> &srcTensor0, LocalTensor<T> &srcTensor1,
-                              const float &scalarValue)
+                              const float scalarValue)
 {
     // Only for AI Vector Core.
     if ASCEND_IS_AIC {
@@ -74,7 +74,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, LocalTensor<T> &src
 
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor0,
-                              const LocalTensor<T> &srcTensor1, const float &scalarValue,
+                              const LocalTensor<T> &srcTensor1, const float scalarValue,
                               const LocalTensor<uint8_t> &sharedTmpBuffer, const uint32_t calCount)
 {
     // Only for AI Vector Core.
@@ -140,7 +140,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor<T> &dstTensor, const LocalTensor<T
 
 template <typename T>
 __aicore__ inline void SwishCalcSimplified(
-   const LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor, const float &scalarValue)
+   const LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor, const float scalarValue)
 {
     // swish(x) = x / (1 + e^(-βx))
     // x1 = 1 + e^(-βx)
@@ -162,7 +162,7 @@ __aicore__ inline void SwishCalcSimplified(
 
 template <typename T>
 __aicore__ inline void SwiGLUImpl(const LocalTensor<T> &dst, const LocalTensor<T> &src0, const LocalTensor<T> &src1,
-                                  const float &beta, const LocalTensor<float> &sharedTmpBuffer, uint32_t calCount)
+                                  const float beta, const LocalTensor<float> &sharedTmpBuffer, uint32_t calCount)
 {
     // Calculate dstTensor = Swish(srcTensor1)
     float scalar = static_cast<float>(static_cast<float>(-1.0) * static_cast<float>(beta));
@@ -176,7 +176,7 @@ __aicore__ inline void SwiGLUImpl(const LocalTensor<T> &dst, const LocalTensor<T
 
 template <>
 __aicore__ inline void SwiGLUImpl<half>(const LocalTensor<half> &dst, const LocalTensor<half> &src0,
-                                        const LocalTensor<half> &src1, const float &beta,
+                                        const LocalTensor<half> &src1, const float beta,
                                         const LocalTensor<float> &sharedTmpBuffer, uint32_t calCount)
 {
     LocalTensor<float> tmpSrc1FloatBuffer1 = sharedTmpBuffer;
diff --git a/impl/quantization/quant/ascend_quant_common_impl.h b/impl/quantization/quant/ascend_quant_common_impl.h
index cd57ccc8..2e0d5dc2 100644
--- a/impl/quantization/quant/ascend_quant_common_impl.h
+++ b/impl/quantization/quant/ascend_quant_common_impl.h
@@ -97,7 +97,7 @@ __aicore__ inline void IsQuantParamValid(const LocalTensor<int8_t>& dstTensor, c
 template<typename T>
 __aicore__ inline void IsQuantParamValid(const LocalTensor<int8_t>& dstTensor, const LocalTensor<T>& srcTensor,
     const LocalTensor<uint8_t>& sharedTmpBuffer, const LocalTensor<T>& scaleTensor,
-    const T& offset, const uint32_t scaleCount, const uint32_t calCount)
+    const T offset, const uint32_t scaleCount, const uint32_t calCount)
 {
     ASCENDC_ASSERT((calCount <= srcTensor.GetSize()), {
         KERNEL_LOG(KERNEL_ERROR, "calCount is %u, which should not larger than srcTensor size %u.",
diff --git a/lib/activation/swiglu.h b/lib/activation/swiglu.h
index 0358304a..f5544942 100644
--- a/lib/activation/swiglu.h
+++ b/lib/activation/swiglu.h
@@ -34,7 +34,7 @@ namespace AscendC {
  */
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLU(LocalTensor<T> &dstTensor, LocalTensor<T> &srcTensor0, LocalTensor<T> &srcTensor1,
-                              const float &scalarValue)
+                              const float scalarValue)
 {
     // Only for AI Vector Core.
     if ASCEND_IS_AIC {
@@ -55,7 +55,7 @@ __aicore__ inline void SwiGLU(LocalTensor<T> &dstTensor, LocalTensor<T> &srcTens
  */
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLU(LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor0,
-    const LocalTensor<T> &srcTensor1, const float &scalarValue, const LocalTensor<uint8_t> &sharedTmpBuffer)
+    const LocalTensor<T> &srcTensor1, const float scalarValue, const LocalTensor<uint8_t> &sharedTmpBuffer)
 {
     SwiGLU<T, isReuseSource>(dstTensor, srcTensor0, srcTensor1, scalarValue, sharedTmpBuffer, srcTensor0.GetSize());
 }
@@ -71,7 +71,7 @@ __aicore__ inline void SwiGLU(LocalTensor<T> &dstTensor, const LocalTensor<T> &s
  */
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLU(LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor0,
-    const LocalTensor<T> &srcTensor1, const float &scalarValue, const uint32_t calCount)
+    const LocalTensor<T> &srcTensor1, const float scalarValue, const uint32_t calCount)
 {
     // Only for AI Vector Core.
     if ASCEND_IS_AIC {
@@ -93,7 +93,7 @@ __aicore__ inline void SwiGLU(LocalTensor<T> &dstTensor, const LocalTensor<T> &s
  */
 template <typename T, bool isReuseSource = false>
 __aicore__ inline void SwiGLU(LocalTensor<T> &dstTensor, const LocalTensor<T> &srcTensor0,
-                              const LocalTensor<T> &srcTensor1, const float &scalarValue,
+                              const LocalTensor<T> &srcTensor1, const float scalarValue,
                               const LocalTensor<uint8_t> &sharedTmpBuffer, const uint32_t calCount)
 {
     SwiGLUImpl<T, isReuseSource>(dstTensor, srcTensor0, srcTensor1, scalarValue, sharedTmpBuffer, calCount);
-- 
Gitee