From ef57262c0ddecd31c9b80db1cc5ccc8476650b31 Mon Sep 17 00:00:00 2001 From: wang-xiangX Date: Wed, 24 Jul 2024 15:46:43 +0800 Subject: [PATCH] solve const & --- .../activation/softmax/logsoftmax_base_impl.h | 56 +++++++++---------- impl/activation/softmax/softmax_common.h | 10 ++-- .../softmax/v200/simple_softmax_impl.h | 6 +- .../softmax/v200/softmax_common_impl.h | 6 +- .../softmax/v200/softmax_flashv2_impl.h | 32 +++++------ .../softmax/v200/softmax_grad_impl.h | 10 ++-- impl/activation/softmax/v200/softmax_impl.h | 26 ++++----- .../softmax/v220/simple_softmax_impl.h | 6 +- .../softmax/v220/softmax_common_impl.h | 4 +- .../softmax/v220/softmax_flashv2_impl.h | 32 +++++------ .../softmax/v220/softmax_grad_impl.h | 10 ++-- impl/activation/softmax/v220/softmax_impl.h | 26 ++++----- .../softmax/v300/softmax_flashv2_impl.h | 6 +- impl/activation/softmax/v300/softmax_impl.h | 8 +-- impl/activation/swiglu/swiglu_common_impl.h | 14 ++--- .../quant/ascend_quant_common_impl.h | 2 +- lib/activation/swiglu.h | 8 +-- 17 files changed, 131 insertions(+), 131 deletions(-) diff --git a/impl/activation/softmax/logsoftmax_base_impl.h b/impl/activation/softmax/logsoftmax_base_impl.h index 7d436140..9a801913 100644 --- a/impl/activation/softmax/logsoftmax_base_impl.h +++ b/impl/activation/softmax/logsoftmax_base_impl.h @@ -57,8 +57,8 @@ constexpr float SCALAR_NATURE_LOG_10 = 0.4342944819; __aicore__ inline void LogSoftMaxGenericNZImpl(LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { const UnaryRepeatParams unaryParams; const uint64_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT; @@ -72,8 +72,8 @@ __aicore__ inline void LogSoftMaxGenericNZImpl(LocalTensor& dst, const Lo } __aicore__ inline void LogSoftMaxGenericNZReduceMaxImpl(const LocalTensor& tmpBuffer0, - const LocalTensor& tmpBuffer1, const LocalTensor& maxTensor, const uint32_t& offset2, - const uint32_t& splitCount, uint64_t mask[2], const ReduceLastND& reduceParam) + const LocalTensor& tmpBuffer1, const LocalTensor& maxTensor, const uint32_t offset2, + const uint32_t splitCount, uint64_t mask[2], const ReduceLastND& reduceParam) { ReduceMaxLastNZImpl(tmpBuffer1, tmpBuffer0, mask, reduceParam); PipeBarrier(); @@ -84,10 +84,10 @@ __aicore__ inline void LogSoftMaxGenericNZReduceMaxImpl(const LocalTensor { 1, 1, HALF_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE }); } -__aicore__ inline void LogSoftMaxGenericNZSubImpl(const uint32_t& splitNZBlockCount, - const LocalTensor& tmpBuffer0, const LocalTensor& tmpBuffer1, const uint32_t& splitOffset, - const uint32_t& lastSplitNZBlockOffset, uint64_t mask[2], - const uint32_t& lastBlockMaskLen, const uint32_t& splitCount) +__aicore__ inline void LogSoftMaxGenericNZSubImpl(const uint32_t splitNZBlockCount, + const LocalTensor& tmpBuffer0, const LocalTensor& tmpBuffer1, const uint32_t splitOffset, + const uint32_t lastSplitNZBlockOffset, uint64_t mask[2], + const uint32_t lastBlockMaskLen, const uint32_t splitCount) { for (uint32_t j = 0; j < splitNZBlockCount - 1; j++) { Sub(tmpBuffer0[splitOffset * j], tmpBuffer0[splitOffset * j], tmpBuffer1, MASK_PLACEHOLDER, 1, @@ -102,8 +102,8 @@ __aicore__ inline void LogSoftMaxGenericNZSubImpl(const uint32_t& splitNZBlockCo } __aicore__ inline void LogSoftMaxGenericNZReduceSumImpl(const LocalTensor& tmpBuffer0, - const LocalTensor& tmpBuffer1, const LocalTensor& sumTensor, const uint32_t& offset2, - const uint32_t& splitCount, uint64_t mask[2], const ReduceLastND& reduceParam) + const LocalTensor& tmpBuffer1, const LocalTensor& sumTensor, const uint32_t offset2, + const uint32_t splitCount, uint64_t mask[2], const ReduceLastND& reduceParam) { ReduceSumLastNZImpl(tmpBuffer1, tmpBuffer0, mask, reduceParam); PipeBarrier(); @@ -115,10 +115,10 @@ __aicore__ inline void LogSoftMaxGenericNZReduceSumImpl(const LocalTensor PipeBarrier(); } -__aicore__ inline void LogSoftMaxGenericNZDivImpl(const uint32_t& splitNZBlockCount, - const LocalTensor& tmpBuffer0, const LocalTensor& tmpBuffer1, const uint32_t& splitOffset, - const uint32_t& lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t& lastBlockMaskLen, - const uint32_t& splitCount) +__aicore__ inline void LogSoftMaxGenericNZDivImpl(const uint32_t splitNZBlockCount, + const LocalTensor& tmpBuffer0, const LocalTensor& tmpBuffer1, const uint32_t splitOffset, + const uint32_t lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t lastBlockMaskLen, + const uint32_t splitCount) { for (uint32_t j = 0; j < splitNZBlockCount - 1; j++) { Div(tmpBuffer0[splitOffset * j], tmpBuffer0[splitOffset * j], tmpBuffer1, MASK_PLACEHOLDER, 1, @@ -130,9 +130,9 @@ __aicore__ inline void LogSoftMaxGenericNZDivImpl(const uint32_t& splitNZBlockCo mask, lastBlockMaskLen, splitCount, Div); } -__aicore__ inline void LogSoftMaxGenericNZLogImpl(const uint32_t& splitNZBlockCount, - const LocalTensor& tmpBuffer0, const LocalTensor& dst, const uint32_t& splitOffset, - const uint32_t& splitCount, const SoftMaxTiling& tiling, const uint32_t& offset1) +__aicore__ inline void LogSoftMaxGenericNZLogImpl(const uint32_t splitNZBlockCount, + const LocalTensor& tmpBuffer0, const LocalTensor& dst, const uint32_t splitOffset, + const uint32_t splitCount, const SoftMaxTiling& tiling, const uint32_t offset1) { SetMaskCount(); SetVectorMask(0, splitCount); @@ -154,10 +154,10 @@ __aicore__ inline void LogSoftMaxGenericNZLogImpl(const uint32_t& splitNZBlockCo ResetMask(); } -__aicore__ inline void LogSoftMaxGenericNZExpImpl(const uint32_t& splitNZBlockCount, - const LocalTensor& tmpBuffer0, const LocalTensor& tmpBuffer1, const uint32_t& splitOffset, - const uint32_t& lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t& lastBlockMaskLen, - const uint32_t& splitCount) +__aicore__ inline void LogSoftMaxGenericNZExpImpl(const uint32_t splitNZBlockCount, + const LocalTensor& tmpBuffer0, const LocalTensor& tmpBuffer1, const uint32_t splitOffset, + const uint32_t lastSplitNZBlockOffset, uint64_t mask[2], const uint32_t lastBlockMaskLen, + const uint32_t splitCount) { SetMaskCount(); SetVectorMask(0, splitCount); @@ -176,8 +176,8 @@ __aicore__ inline void LogSoftMaxGenericNZExpImpl(const uint32_t& splitNZBlockCo __aicore__ inline void LogSoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -287,8 +287,8 @@ __aicore__ inline void GenericLogNDImpl(const LocalTensor& dst, const Loc __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const LogSoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const LogSoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; // need splitM * 64 const UnaryRepeatParams unaryParams; @@ -314,8 +314,8 @@ __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor& dst, co __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const LogSoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const LogSoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer2 = workLocal[tiling.splitSize]; @@ -351,7 +351,7 @@ __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor& dst, con __aicore__ inline void LogSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const LogSoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, + const LogSoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; diff --git a/impl/activation/softmax/softmax_common.h b/impl/activation/softmax/softmax_common.h index dd7ea1a7..228fea59 100644 --- a/impl/activation/softmax/softmax_common.h +++ b/impl/activation/softmax/softmax_common.h @@ -362,7 +362,7 @@ __aicore__ inline void AlignedBrcbImpl(const LocalTensor& dstLocal, const Loc } __aicore__ inline void ContinusColumnBrcbImpl(const LocalTensor& dstLocal, const LocalTensor& srcLocal, - const uint32_t& repeat, const uint32_t& brcbCount) + const uint32_t repeat, const uint32_t brcbCount) { float scalarList[SCALAR_STACK_DEPTH] = {0}; SetVectorMask(brcbCount); @@ -393,7 +393,7 @@ __aicore__ inline void ContinusColumnBrcbImpl(const LocalTensor& dstLocal } __aicore__ inline void AlignedColumnBrcbImpl(const LocalTensor& dstLocal, const LocalTensor& srcLocal, - const uint32_t& repeat, const uint32_t& brcbCount) + const uint32_t repeat, const uint32_t brcbCount) { float scalarList[SCALAR_STACK_DEPTH] = {0}; SetVectorMask(brcbCount); @@ -474,7 +474,7 @@ __aicore__ inline void BroadCastLastCompute(const LocalTensor& dst, const Loc } } -__aicore__ inline void CreateSpecialFormatMask(uint64_t& lowMask, const uint32_t& maskLen, const uint32_t& nzBlockCount) +__aicore__ inline void CreateSpecialFormatMask(uint64_t& lowMask, const uint32_t maskLen, const uint32_t nzBlockCount) { // create mask in "01111111 11111111 01111111 11111111" format // maskLen is 1-15 @@ -494,7 +494,7 @@ __aicore__ inline void CreateSpecialFormatMask(uint64_t& lowMask, const uint32_t } __aicore__ inline void BinaryComputeWithSpecialMask(const LocalTensor& dst, const LocalTensor& src0, - const LocalTensor& src1, uint64_t mask[2], const uint32_t& lastBlockMaskLen, const uint32_t& splitCount, + const LocalTensor& src1, uint64_t mask[2], const uint32_t lastBlockMaskLen, const uint32_t splitCount, void (*func)(const LocalTensor&, const LocalTensor&, const LocalTensor&, uint64_t*, const uint8_t, const BinaryRepeatParams&)) { @@ -526,7 +526,7 @@ __aicore__ inline void BinaryComputeWithSpecialMask(const LocalTensor& ds } __aicore__ inline void UnaryComputeWithSpecialMask(const LocalTensor& dst, const LocalTensor& src, - uint64_t mask[2], const uint32_t& lastBlockMaskLen, const uint32_t& splitCount, + uint64_t mask[2], const uint32_t lastBlockMaskLen, const uint32_t splitCount, void (*func)(const LocalTensor&, const LocalTensor&, uint64_t*, const uint8_t, const UnaryRepeatParams&)) { diff --git a/impl/activation/softmax/v200/simple_softmax_impl.h b/impl/activation/softmax/v200/simple_softmax_impl.h index 85ddba56..06b514c5 100644 --- a/impl/activation/softmax/v200/simple_softmax_impl.h +++ b/impl/activation/softmax/v200/simple_softmax_impl.h @@ -20,7 +20,7 @@ namespace AscendC { __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount) { const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT; LocalTensor tmpBuffer0 = workLocal; @@ -60,7 +60,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -110,7 +110,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; diff --git a/impl/activation/softmax/v200/softmax_common_impl.h b/impl/activation/softmax/v200/softmax_common_impl.h index c28d129f..a4240166 100644 --- a/impl/activation/softmax/v200/softmax_common_impl.h +++ b/impl/activation/softmax/v200/softmax_common_impl.h @@ -190,7 +190,7 @@ __aicore__ inline void FirstBlockCopyImpl(const LocalTensor& dst, const L } __aicore__ inline void BroadCastImpl(const LocalTensor& dstLocal, const LocalTensor& srcLocal, - const uint32_t& repeat, const uint32_t& brcbCount) + const uint32_t repeat, const uint32_t brcbCount) { float scalarList[SCALAR_STACK_DEPTH] = {0}; SetVectorMask(brcbCount); @@ -389,7 +389,7 @@ __aicore__ inline void NewReduceSumLastNDImpl(const LocalTensor& dst, con } __aicore__ inline void ReduceMaxSingleBlockNZImpl(const LocalTensor& dst, const LocalTensor& src, - const uint64_t& mask, const ReduceLastND& reduceParam) + const uint64_t mask, const ReduceLastND& reduceParam) { const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES; const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES; @@ -470,7 +470,7 @@ __aicore__ inline void ReduceMaxLastNZImpl(const LocalTensor& tmpBuffer1, } __aicore__ inline void ReduceSumSingleBlockNZImpl(const LocalTensor& dst, const LocalTensor& src, - const uint64_t& mask, const ReduceLastND& reduceParam) + const uint64_t mask, const ReduceLastND& reduceParam) { const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES; const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES; diff --git a/impl/activation/softmax/v200/softmax_flashv2_impl.h b/impl/activation/softmax/v200/softmax_flashv2_impl.h index 0006ef1b..465b6e21 100644 --- a/impl/activation/softmax/v200/softmax_flashv2_impl.h +++ b/impl/activation/softmax/v200/softmax_flashv2_impl.h @@ -21,8 +21,8 @@ namespace AscendC { __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16 @@ -116,8 +116,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -209,8 +209,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16 @@ -768,8 +768,8 @@ __aicore__ inline void SoftmaxFlashV2BasicBlockImpl(const LocalTensor& ds __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inExpSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.splitSize]; @@ -839,7 +839,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, c const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inExpSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, - const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize) + const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.reduceSize]; // need splitM * 64 @@ -1044,8 +1044,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateBasicBlock(const LocalTensor __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& reduceBuffer = workLocal[tiling.splitSize]; @@ -1069,8 +1069,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; @@ -1299,7 +1299,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, co const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inExpSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, - const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize) + const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.splitSize]; @@ -1339,8 +1339,8 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, co __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.splitSize]; diff --git a/impl/activation/softmax/v200/softmax_grad_impl.h b/impl/activation/softmax/v200/softmax_grad_impl.h index 531ab4d7..b913c88d 100644 --- a/impl/activation/softmax/v200/softmax_grad_impl.h +++ b/impl/activation/softmax/v200/softmax_grad_impl.h @@ -20,7 +20,7 @@ namespace AscendC { __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2], - const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount, const ReduceLastND& reduceParam) + const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -60,8 +60,8 @@ __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor& ds __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -268,7 +268,7 @@ __aicore__ inline void SoftmaxGradFrontNDImpl(const LocalTensor& dstTensor, c __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2], - const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam) + const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -328,7 +328,7 @@ __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor& dst, co __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2], - const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam) + const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; diff --git a/impl/activation/softmax/v200/softmax_impl.h b/impl/activation/softmax/v200/softmax_impl.h index 2f76d21e..7e34f855 100644 --- a/impl/activation/softmax/v200/softmax_impl.h +++ b/impl/activation/softmax/v200/softmax_impl.h @@ -21,8 +21,8 @@ namespace AscendC { template __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -110,8 +110,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const template __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT; const uint32_t splitNZBlockCount = tiling.srcK / SOFTMAX_SHAPE_NZ_BASIC_COUNT; @@ -189,8 +189,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const template __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -524,8 +524,8 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor& dst, const Lo __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; // need splitM * 64 @@ -546,8 +546,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer2 = workLocal[tiling.splitSize]; @@ -707,7 +707,7 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor& dst, const Loc __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; @@ -771,7 +771,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTe } __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, - const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize, + const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; @@ -797,7 +797,7 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Loc } __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, - const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize, + const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; diff --git a/impl/activation/softmax/v220/simple_softmax_impl.h b/impl/activation/softmax/v220/simple_softmax_impl.h index 5bbe7d10..60a70d31 100644 --- a/impl/activation/softmax/v220/simple_softmax_impl.h +++ b/impl/activation/softmax/v220/simple_softmax_impl.h @@ -20,7 +20,7 @@ namespace AscendC { __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount) { const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT; LocalTensor tmpBuffer0 = workLocal; @@ -60,7 +60,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -110,7 +110,7 @@ __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, __aicore__ inline void SimpleSoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; diff --git a/impl/activation/softmax/v220/softmax_common_impl.h b/impl/activation/softmax/v220/softmax_common_impl.h index 6dbc213c..67c64b17 100644 --- a/impl/activation/softmax/v220/softmax_common_impl.h +++ b/impl/activation/softmax/v220/softmax_common_impl.h @@ -415,7 +415,7 @@ __aicore__ inline void NewReduceSumLastNDImpl(const LocalTensor& dst, con } __aicore__ inline void ReduceMaxSingleBlockNZImpl(const LocalTensor& dst, const LocalTensor& src, - const uint64_t& mask, const ReduceLastND& reduceParam) + const uint64_t mask, const ReduceLastND& reduceParam) { const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES; const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES; @@ -496,7 +496,7 @@ __aicore__ inline void ReduceMaxLastNZImpl(const LocalTensor& tmpBuffer1, } __aicore__ inline void ReduceSumSingleBlockNZImpl(const LocalTensor& dst, const LocalTensor& src, - const uint64_t& mask, const ReduceLastND& reduceParam) + const uint64_t mask, const ReduceLastND& reduceParam) { const uint32_t range = reduceParam.srcM / MAX_REPEAT_TIMES; const uint32_t tail = reduceParam.srcM % MAX_REPEAT_TIMES; diff --git a/impl/activation/softmax/v220/softmax_flashv2_impl.h b/impl/activation/softmax/v220/softmax_flashv2_impl.h index 4ae32f6f..0c136543 100644 --- a/impl/activation/softmax/v220/softmax_flashv2_impl.h +++ b/impl/activation/softmax/v220/softmax_flashv2_impl.h @@ -21,8 +21,8 @@ namespace AscendC { __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16 @@ -119,8 +119,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -213,8 +213,8 @@ __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, __aicore__ inline void FlashV2NZUpdateGenericImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; // default len splitM * 16 @@ -763,8 +763,8 @@ __aicore__ inline void SoftmaxFlashV2BasicBlockImpl(const LocalTensor& ds __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inExpSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.splitSize]; @@ -834,7 +834,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, c const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inExpSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, - const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize) + const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.reduceSize]; // need splitM * 64 @@ -1018,8 +1018,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateBasicBlock(const LocalTensor __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& reduceBuffer = workLocal[tiling.splitSize]; @@ -1043,8 +1043,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; @@ -1248,7 +1248,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, co const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inExpSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, - const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize) + const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.splitSize]; @@ -1288,8 +1288,8 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, co __aicore__ inline void SoftmaxFlashV2NoUpdateImpl(const LocalTensor& dst, const LocalTensor& expSumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.splitSize]; diff --git a/impl/activation/softmax/v220/softmax_grad_impl.h b/impl/activation/softmax/v220/softmax_grad_impl.h index ac12aa9d..55dc5739 100644 --- a/impl/activation/softmax/v220/softmax_grad_impl.h +++ b/impl/activation/softmax/v220/softmax_grad_impl.h @@ -20,7 +20,7 @@ namespace AscendC { __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2], - const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitCount, const ReduceLastND& reduceParam) + const uint32_t offset1, const uint32_t offset2, const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -60,8 +60,8 @@ __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor& ds __aicore__ inline void SoftMaxGradFrontGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -257,7 +257,7 @@ __aicore__ inline void SoftmaxGradFrontNDImpl(const LocalTensor& dstTensor, c __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2], - const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam) + const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -317,7 +317,7 @@ __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor& dst, co __aicore__ inline void SoftMaxGradGenericNZImpl(const LocalTensor& dst, const LocalTensor& gradTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, uint64_t mask[2], - const uint32_t& offset, const uint32_t& splitCount, const ReduceLastND& reduceParam) + const uint32_t offset, const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; diff --git a/impl/activation/softmax/v220/softmax_impl.h b/impl/activation/softmax/v220/softmax_impl.h index b3433528..55834331 100644 --- a/impl/activation/softmax/v220/softmax_impl.h +++ b/impl/activation/softmax/v220/softmax_impl.h @@ -21,8 +21,8 @@ namespace AscendC { template __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -110,8 +110,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const template __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { const uint32_t splitOffset = tiling.splitM * SOFTMAX_SHAPE_NZ_BASIC_COUNT; const uint32_t splitNZBlockCount = tiling.srcK / SOFTMAX_SHAPE_NZ_BASIC_COUNT; @@ -189,8 +189,8 @@ __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const template __aicore__ inline void SoftMaxGenericNZImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitCount, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, uint64_t mask[2], const uint32_t offset1, const uint32_t offset2, + const uint32_t splitCount, const ReduceLastND& reduceParam) { LocalTensor tmpBuffer0 = workLocal; LocalTensor tmpBuffer1 = workLocal[tiling.splitSize]; @@ -504,8 +504,8 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor& dst, const Lo __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; // need splitM * 64 @@ -526,8 +526,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer2 = workLocal[tiling.splitSize]; @@ -670,7 +670,7 @@ __aicore__ inline void SoftMaxBasicBlock(const LocalTensor& dst, const Loc __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; @@ -733,7 +733,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTe template __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, - const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize, + const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; @@ -760,7 +760,7 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Loc template __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, - const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t& offset, const uint32_t& splitSize, + const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset, const uint32_t splitSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal; diff --git a/impl/activation/softmax/v300/softmax_flashv2_impl.h b/impl/activation/softmax/v300/softmax_flashv2_impl.h index 7c3e80b8..871e5536 100644 --- a/impl/activation/softmax/v300/softmax_flashv2_impl.h +++ b/impl/activation/softmax/v300/softmax_flashv2_impl.h @@ -60,8 +60,8 @@ __aicore__ inline void SoftmaxFlashV2NoUpdate(const LocalTensor& dst, const __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, - const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, - const uint32_t& splitSize, const uint32_t& reduceSize) + const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, + const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; // for src cast const LocalTensor& tmpBuffer1 = workLocal[tiling.splitSize]; // need splitM * 64 @@ -96,7 +96,7 @@ __aicore__ inline void SoftmaxFlashV2UpdateImpl(const LocalTensor& dst, c const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& expMaxTensor, const LocalTensor& inExpSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& workLocal, const ReduceLastND& reduceParam, const SoftMaxTiling& tiling, - const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, const uint32_t& reduceSize) + const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, const uint32_t reduceSize) { const LocalTensor& tmpBuffer0 = workLocal; const LocalTensor& tmpBuffer1 = workLocal[tiling.reduceSize]; // tiling.splitM * FLOAT_REPEAT_SIZE diff --git a/impl/activation/softmax/v300/softmax_impl.h b/impl/activation/softmax/v300/softmax_impl.h index ea7699a7..82a55564 100644 --- a/impl/activation/softmax/v300/softmax_impl.h +++ b/impl/activation/softmax/v300/softmax_impl.h @@ -28,8 +28,8 @@ __aicore__ inline void SoftMaxNZImpl(const LocalTensor& dst, const LocalTens template __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer0 = workLocal[0]; const LocalTensor& tmpBuffer2 = workLocal[tiling.splitSize]; @@ -55,8 +55,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const template __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const SoftMaxTiling& tiling, const uint32_t& offset1, const uint32_t& offset2, const uint32_t& splitSize, - const uint32_t& reduceSize, const ReduceLastND& reduceParam) + const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t splitSize, + const uint32_t reduceSize, const ReduceLastND& reduceParam) { const LocalTensor& tmpBuffer1 = workLocal[0]; // need splitM * 64 diff --git a/impl/activation/swiglu/swiglu_common_impl.h b/impl/activation/swiglu/swiglu_common_impl.h index 67663654..d956eda9 100644 --- a/impl/activation/swiglu/swiglu_common_impl.h +++ b/impl/activation/swiglu/swiglu_common_impl.h @@ -29,7 +29,7 @@ constexpr uint32_t SWIGLU_STRIDE_DIGITS = 2; template __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, const LocalTensor &srcTensor0, - const LocalTensor &srcTensor1, const float &scalarValue, const uint32_t calCount) + const LocalTensor &srcTensor1, const float scalarValue, const uint32_t calCount) { // Only for AI Vector Core. if ASCEND_IS_AIC { @@ -44,7 +44,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, const LocalTensor __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, const LocalTensor &srcTensor0, - const LocalTensor &srcTensor1, const float &scalarValue) + const LocalTensor &srcTensor1, const float scalarValue) { // Only for AI Vector Core. if ASCEND_IS_AIC { @@ -59,7 +59,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, const LocalTensor __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, LocalTensor &srcTensor0, LocalTensor &srcTensor1, - const float &scalarValue) + const float scalarValue) { // Only for AI Vector Core. if ASCEND_IS_AIC { @@ -74,7 +74,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, LocalTensor &src template __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, const LocalTensor &srcTensor0, - const LocalTensor &srcTensor1, const float &scalarValue, + const LocalTensor &srcTensor1, const float scalarValue, const LocalTensor &sharedTmpBuffer, const uint32_t calCount) { // Only for AI Vector Core. @@ -140,7 +140,7 @@ __aicore__ inline void SwiGLUImpl(LocalTensor &dstTensor, const LocalTensor __aicore__ inline void SwishCalcSimplified( - const LocalTensor &dstTensor, const LocalTensor &srcTensor, const float &scalarValue) + const LocalTensor &dstTensor, const LocalTensor &srcTensor, const float scalarValue) { // swish(x) = x / (1 + e^(-βx)) // x1 = 1 + e^(-βx) @@ -162,7 +162,7 @@ __aicore__ inline void SwishCalcSimplified( template __aicore__ inline void SwiGLUImpl(const LocalTensor &dst, const LocalTensor &src0, const LocalTensor &src1, - const float &beta, const LocalTensor &sharedTmpBuffer, uint32_t calCount) + const float beta, const LocalTensor &sharedTmpBuffer, uint32_t calCount) { // Calculate dstTensor = Swish(srcTensor1) float scalar = static_cast(static_cast(-1.0) * static_cast(beta)); @@ -176,7 +176,7 @@ __aicore__ inline void SwiGLUImpl(const LocalTensor &dst, const LocalTensor __aicore__ inline void SwiGLUImpl(const LocalTensor &dst, const LocalTensor &src0, - const LocalTensor &src1, const float &beta, + const LocalTensor &src1, const float beta, const LocalTensor &sharedTmpBuffer, uint32_t calCount) { LocalTensor tmpSrc1FloatBuffer1 = sharedTmpBuffer; diff --git a/impl/quantization/quant/ascend_quant_common_impl.h b/impl/quantization/quant/ascend_quant_common_impl.h index cd57ccc8..2e0d5dc2 100644 --- a/impl/quantization/quant/ascend_quant_common_impl.h +++ b/impl/quantization/quant/ascend_quant_common_impl.h @@ -97,7 +97,7 @@ __aicore__ inline void IsQuantParamValid(const LocalTensor& dstTensor, c template __aicore__ inline void IsQuantParamValid(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const LocalTensor& scaleTensor, - const T& offset, const uint32_t scaleCount, const uint32_t calCount) + const T offset, const uint32_t scaleCount, const uint32_t calCount) { ASCENDC_ASSERT((calCount <= srcTensor.GetSize()), { KERNEL_LOG(KERNEL_ERROR, "calCount is %u, which should not larger than srcTensor size %u.", diff --git a/lib/activation/swiglu.h b/lib/activation/swiglu.h index 0358304a..f5544942 100644 --- a/lib/activation/swiglu.h +++ b/lib/activation/swiglu.h @@ -34,7 +34,7 @@ namespace AscendC { */ template __aicore__ inline void SwiGLU(LocalTensor &dstTensor, LocalTensor &srcTensor0, LocalTensor &srcTensor1, - const float &scalarValue) + const float scalarValue) { // Only for AI Vector Core. if ASCEND_IS_AIC { @@ -55,7 +55,7 @@ __aicore__ inline void SwiGLU(LocalTensor &dstTensor, LocalTensor &srcTens */ template __aicore__ inline void SwiGLU(LocalTensor &dstTensor, const LocalTensor &srcTensor0, - const LocalTensor &srcTensor1, const float &scalarValue, const LocalTensor &sharedTmpBuffer) + const LocalTensor &srcTensor1, const float scalarValue, const LocalTensor &sharedTmpBuffer) { SwiGLU(dstTensor, srcTensor0, srcTensor1, scalarValue, sharedTmpBuffer, srcTensor0.GetSize()); } @@ -71,7 +71,7 @@ __aicore__ inline void SwiGLU(LocalTensor &dstTensor, const LocalTensor &s */ template __aicore__ inline void SwiGLU(LocalTensor &dstTensor, const LocalTensor &srcTensor0, - const LocalTensor &srcTensor1, const float &scalarValue, const uint32_t calCount) + const LocalTensor &srcTensor1, const float scalarValue, const uint32_t calCount) { // Only for AI Vector Core. if ASCEND_IS_AIC { @@ -93,7 +93,7 @@ __aicore__ inline void SwiGLU(LocalTensor &dstTensor, const LocalTensor &s */ template __aicore__ inline void SwiGLU(LocalTensor &dstTensor, const LocalTensor &srcTensor0, - const LocalTensor &srcTensor1, const float &scalarValue, + const LocalTensor &srcTensor1, const float scalarValue, const LocalTensor &sharedTmpBuffer, const uint32_t calCount) { SwiGLUImpl(dstTensor, srcTensor0, srcTensor1, scalarValue, sharedTmpBuffer, calCount); -- Gitee