diff --git a/cmake/kernel_headers.cmake b/cmake/kernel_headers.cmake
index 5c5a256621cfdf4b7299e6d9d51b736ae82454d9..376dc2ba07493f721de1273b7ced06d4bd22223d 100644
--- a/cmake/kernel_headers.cmake
+++ b/cmake/kernel_headers.cmake
@@ -82,7 +82,7 @@ file(CREATE_LINK ../normalization/kernel_operator_layernormgradbeta_intf.h
 file(CREATE_LINK ../normalization/layernormgradbeta.h
     ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/layernormgrad/layernormgradbeta.h SYMBOLIC)
-# layernorm
+# matmul
 file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matrix)
 file(CREATE_LINK ../matmul ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matrix/matmul SYMBOLIC)
 file(CREATE_LINK matmul/matmul_intf.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matmul_intf.h SYMBOLIC)
diff --git a/docs/README.md b/docs/README.md
index 9c5a21c79c6adc3df83835375ae99fe22548eed8..aaa4830cc5c3c4ae9a359122783b22b5de41b216 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -22,7 +22,7 @@
   Element-wise quantization, e.g. quantizing half/float data types to int8_t.
-  Data normalization
+  Data normalization
   BatchNorm
   For the samples in each batch, normalizes every input feature along the batch dimension.
@@ -46,6 +46,10 @@
   RmsNorm
   Implements RmsNorm normalization for input data of shape [B,S,H].
+
+  GroupNorm
+  Splits the input data into groups along the channel dimension and normalizes each group.
+
   Activation functions
   AdjustSoftMaxRes
diff --git a/impl/normalization/groupnorm/groupnorm_common_impl.h b/impl/normalization/groupnorm/groupnorm_common_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8dd23e0c1c63b684f34bacee87d91201e27dc88
--- /dev/null
+++ b/impl/normalization/groupnorm/groupnorm_common_impl.h
@@ -0,0 +1,467 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_common_impl.h
+ * \brief
+ */
+
+#ifndef IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H
+#define IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H
+
+#include "kernel_tensor.h"
+#include "kernel_pop_stack_buffer.h"
+#include "kernel_tiling/kernel_tiling.h"
+
+namespace AscendC {
+namespace {
+constexpr uint32_t GROUPNORM_MASK_MAX_VAL = 64;
+constexpr uint32_t GROUPNORM_MASK_SMALLEST_VAL = 8;
+constexpr uint32_t GROUPNORM_MASK_STEP_VAL = 8;
+constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 8;
+} // namespace
+
+template <typename T> struct GroupNormParams
+{
+    __aicore__ GroupNormParams(){};
+    LocalTensor<T> tempTensorA;
+    LocalTensor<T> tempTensorB;
+    LocalTensor<T> tempTensorC;
+    LocalTensor<T> meanTmpTensor;
+    LocalTensor<T> varianceTmpTensor;
+};
+
+__aicore__ inline uint32_t GetGroupNormWholeReduceMask1(const GroupNormTiling& tiling)
+{
+    uint32_t mask1{0};
+    if (tiling.dhwAlignSize > GROUPNORM_MASK_MAX_VAL) {
+        mask1 = GROUPNORM_MASK_MAX_VAL;
+        while (mask1 != 0 && tiling.dhwAlignSize % mask1 != 0) {
+            mask1 -= GROUPNORM_MASK_STEP_VAL;
+        }
+        return mask1;
+    }
+    return tiling.dhwAlignSize;
+}
+
+__aicore__ inline void GetGroupNormOutputMean(const LocalTensor<float>& x_in,
+    const LocalTensor<float>& tmp, const LocalTensor<float>& mean,
+    const GroupNormTiling& tiling)
+{
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        ReduceSum(mean[i], x_in[buffIndex], tmp[buffIndex], tiling.dhwAlignSize);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Muls(mean, mean, tiling.factor, tiling.bsCurLength);
+
+    // mean will be read back through GetValue() as a scalar value
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+}
+
+__aicore__ inline void GetGroupNormOutputVar(const LocalTensor<float>& x_in,
+    const LocalTensor<float>& tmp1, const LocalTensor<float>& tmp2,
+    const LocalTensor<float>& mean, const LocalTensor<float>& var, const GroupNormTiling& tiling)
+{
+    for (uint32_t i = 0; i < tiling.d * tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.hwAlignSize;
+        Adds(tmp1[buffIndex], x_in[buffIndex], -1.0f * mean.GetValue(i / tiling.d), tiling.hw);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tmp2, tmp1, tmp1, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        ReduceSum(var[i], tmp2[buffIndex], tmp2[buffIndex], tiling.dhwAlignSize);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Muls(var, var, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GetGroupNormOutputPre(const LocalTensor<float>& inout,
+    const LocalTensor<float>& tmp, const LocalTensor<float>& variance,
+    const GroupNormTiling& tiling, const float epsilon)
+{
+    Adds(tmp, variance, epsilon, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    Ln(tmp, tmp, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    // Multiply by -0.5f to convert the logarithmic result to the logarithm of the reciprocal of the standard deviation
+    Muls(tmp, tmp, -0.5f, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    Exp(tmp, tmp, tiling.bsCurLength);
+
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    // pre norm
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Muls(inout[buffIndex], inout[buffIndex], tmp.GetValue(i), tiling.dhwAlignSize);
+    }
+
+    // tmp will be written later
+    auto eventIdSToV = GetTPipePtr()->FetchEventID(HardEvent::S_V);
+    SetFlag<HardEvent::S_V>(eventIdSToV);
+    WaitFlag<HardEvent::S_V>(eventIdSToV);
+
+    PipeBarrier<PIPE_V>();
+}
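+
+// Note: the Adds -> Ln -> Muls(-0.5f) -> Exp chain above evaluates exp(-0.5 * ln(var + eps)),
+// which equals (var + eps)^(-1/2), the reciprocal of the standard deviation. For example,
+// var + eps = 4.0f yields exp(-0.5 * ln(4.0)) = exp(-0.6931...) = 0.5 = 1 / sqrt(4.0).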
+
+__aicore__ inline void GetGroupNormOutput(const LocalTensor<float>& inout,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const GroupNormTiling& tiling, const int32_t loopCount)
+{
+    size_t channelIndex = loopCount * tiling.meanVarRoundSize * tiling.d;
+    for (uint32_t channel_offset = 0; channel_offset < tiling.bsCurLength * tiling.d; ++channel_offset) {
+        Muls(inout[channel_offset * tiling.hwAlignSize], inout[channel_offset * tiling.hwAlignSize],
+            gamma.GetValue(channelIndex % tiling.c), tiling.hw);
+        channelIndex += 1;
+    }
+    PipeBarrier<PIPE_V>();
+
+    channelIndex = loopCount * tiling.meanVarRoundSize * tiling.d;
+    for (uint32_t channel_offset = 0; channel_offset < tiling.bsCurLength * tiling.d; ++channel_offset) {
+        Adds(inout[channel_offset * tiling.hwAlignSize], inout[channel_offset * tiling.hwAlignSize],
+            beta.GetValue(channelIndex % tiling.c), tiling.hw);
+        channelIndex += 1;
+    }
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GroupNormExe(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& gamma, const LocalTensor<half>& beta,
+    const LocalTensor<half>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const half epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(tempTensorA, 0.0f, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorB, inputX, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputMean(tempTensorB, tempTensorC, outputMean, tiling);
+
+    GetGroupNormOutputVar(tempTensorB, tempTensorB, tempTensorC, outputMean, outputVariance, tiling);
+
+    GetGroupNormOutputPre(tempTensorB, tempTensorA, outputVariance, tiling, static_cast<float>(epsilon));
+
+    Cast(tempTensorA, gamma, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorC, beta, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutput(tempTensorB, tempTensorA, tempTensorC, tiling, loopCount);
+
+    Cast(output, tempTensorB, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GroupNormExe(const LocalTensor<float>& inputX,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const LocalTensor<float>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const float epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+
+    GetGroupNormOutputMean(inputX, output, outputMean, tiling);
+
+    Duplicate(output, 0.0f, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputVar(inputX, output, tempTensorC, outputMean, outputVariance, tiling);
+
+    GetGroupNormOutputPre(output, tempTensorA, outputVariance, tiling, epsilon);
+
+    GetGroupNormOutput(output, gamma, beta, tiling, loopCount);
+}
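+
+// The float path above uses the output tensor itself as scratch for the centered input, and
+// with isReuseSource the tiling also reuses inputX as tempTensorA (numberOfTmpBuf drops from
+// 3 to 2); the half path cannot reuse its operands because the math is carried out in float.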
+
+__aicore__ inline void GroupNormExeSmallShape(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& gamma, const LocalTensor<half>& beta,
+    const LocalTensor<half>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const half epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(tempTensorA, 0.0f, tiling.inputRoundSize * tiling.numberOfTmpBuf);
+    PipeBarrier<PIPE_V>();
+
+    Cast(tempTensorB, inputX, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    uint32_t mask1 = GetGroupNormWholeReduceMask1(tiling);
+    ASCENDC_ASSERT((mask1 > 0), { KERNEL_LOG(KERNEL_ERROR, "mask1 must > 0!"); });
+
+    uint32_t repeat1 = tiling.dhwAlignSize / mask1 * tiling.meanVarRoundSize;
+    uint32_t mask2 = tiling.dhwAlignSize / mask1 * GROUPNORM_MASK_SMALLEST_VAL;
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorC, tempTensorB, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputMean, tempTensorC, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputMean, outputMean, tiling.factor, tiling.bsCurLength);
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Adds(tempTensorB[buffIndex], tempTensorB[buffIndex], -1.0f * outputMean.GetValue(i), tiling.hw, tiling.d,
+            {1, 1, static_cast<uint8_t>(tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE),
+            static_cast<uint8_t>(tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE)});
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tempTensorC, tempTensorB, tempTensorB, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorA, tempTensorC, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputVariance, tempTensorA, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputVariance, outputVariance, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputPre(tempTensorB, tempTensorA, outputVariance, tiling, static_cast<float>(epsilon));
+
+    Cast(tempTensorA, gamma, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorC, beta, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutput(tempTensorB, tempTensorA, tempTensorC, tiling, loopCount);
+
+    Cast(output, tempTensorB, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+}
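+
+// Worked example for the two-stage reduction above: with dhwAlignSize = 256 floats per group,
+// mask1 = 64, so each repeat of the first WholeReduceSum folds one 64-element chunk into a
+// partial sum (256 / 64 = 4 partials per group); mask2 = 4 * GROUPNORM_MASK_SMALLEST_VAL = 32
+// then lets the second WholeReduceSum collapse those partials into one sum per group, which
+// the Muls with tiling.factor turns into the mean.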
+
+__aicore__ inline void GroupNormExeSmallShape(const LocalTensor<float>& inputX,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const LocalTensor<float>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const float epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(output, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+    Duplicate(tempTensorC, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+    uint32_t mask1 = GetGroupNormWholeReduceMask1(tiling);
+    ASCENDC_ASSERT((mask1 > 0), { KERNEL_LOG(KERNEL_ERROR, "mask1 must > 0!"); });
+
+    uint32_t repeat1 = tiling.dhwAlignSize / mask1 * tiling.meanVarRoundSize;
+    uint32_t mask2 = tiling.dhwAlignSize / mask1 * GROUPNORM_MASK_SMALLEST_VAL;
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorC, inputX, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputMean, tempTensorC, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputMean, outputMean, tiling.factor, tiling.bsCurLength);
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    auto repeatStride = tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE;
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Adds(output[buffIndex], inputX[buffIndex], -1.0f * outputMean.GetValue(i), tiling.hw, tiling.d,
+            {1, 1, static_cast<uint8_t>(repeatStride), static_cast<uint8_t>(repeatStride)});
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tempTensorC, output, output, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    Duplicate(tempTensorA, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorA, tempTensorC, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputVariance, tempTensorA, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputVariance, outputVariance, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    GetGroupNormOutputPre(output, tempTensorA, outputVariance, tiling, epsilon);
+
+    GetGroupNormOutput(output, gamma, beta, tiling, loopCount);
+}
+
+template <bool isReuseSource = false>
+__aicore__ inline void GetGroupNormNDTensorInfo(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& outputMean, const LocalTensor<half>& outputVariance,
+    const LocalTensor<float>& stackBuffer, const GroupNormTiling& tiling, GroupNormParams<float>& params)
+{
+    params.tempTensorA = stackBuffer[tiling.firstTmpStartPos];
+    params.tempTensorB = stackBuffer[tiling.secondTmpStartPos];
+    params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos];
+    params.meanTmpTensor = stackBuffer[tiling.meanTmpTensorPos];
+    params.varianceTmpTensor = stackBuffer[tiling.varianceTmpTensorPos];
+
+    ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), {
+        KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)",
+            tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize);
+    });
+    ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), {
+        KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tmpBufSize is (%d)",
+            stackBuffer.GetSize(), tiling.tmpBufSize);
+    });
+}
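+
+// Stack buffer layout used by the half path above: [mean | variance | tmpA | tmpB | tmpC],
+// i.e. the float mean/variance intermediates sit at meanTmpTensorPos/varianceTmpTensorPos at
+// the front of the buffer, and the three work buffers start at first/second/thirdTmpStartPos.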
"stackBuffer.GetSize is (%d) should >= tmpBufSize is (%d)", + stackBuffer.GetSize(), tiling.tmpBufSize); + }); +} + +__aicore__ inline void GetOutputMeanVariance(const LocalTensor& outputMean, + const LocalTensor& outputVariance, const GroupNormTiling& tiling, const GroupNormParams& params) +{ + Cast(outputMean, params.meanTmpTensor, RoundMode::CAST_NONE, tiling.n * tiling.g); + Cast(outputVariance, params.varianceTmpTensor, RoundMode::CAST_NONE, tiling.n * tiling.g); +} + +template +__aicore__ inline void GroupNormNDCommon(const LocalTensor& inputX, + const LocalTensor& gamma, const LocalTensor& beta, + const LocalTensor& output, const LocalTensor& outputMean, const LocalTensor& outputVariance, + const T epsilon, GroupNormTiling& tiling, const GroupNormParams& params) +{ + uint32_t inputOffset = 0; + uint32_t mvOffset = 0; + + for (uint32_t index = 0; index < tiling.loopRound; index++) { + if (tiling.smallShape) { + GroupNormExeSmallShape(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, index); + } else { + GroupNormExe(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, index); + } + + inputOffset += tiling.inputRoundSize; + mvOffset += tiling.meanVarRoundSize; + } + + if (tiling.inputTailSize > 0) { + tiling.bshCurLength = tiling.inputTailSize; + tiling.bsCurLength = tiling.meanVarTailSize; + + inputOffset = tiling.inputTailPos; + mvOffset = tiling.meanVarTailPos; + + if (tiling.smallShape) { + GroupNormExeSmallShape(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, tiling.loopRound); + } else { + GroupNormExe(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, tiling.loopRound); + } + + // revert to normal round size from tail size, for the next iteration calculation + tiling.bshCurLength = tiling.inputRoundSize; + tiling.bsCurLength = tiling.meanVarRoundSize; + } + + if constexpr (sizeof(T) == sizeof(half)) { + GetOutputMeanVariance(outputMean, outputVariance, tiling, params); + } +} + +template +__aicore__ inline void GroupNormImpl(const LocalTensor& output, + const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputX, const LocalTensor& gamma, const LocalTensor& beta, + const LocalTensor& sharedTmpBuffer, const T epsilon, GroupNormTiling& tiling) +{ + ASCENDC_ASSERT((tiling.oneTmpSize > 0), { KERNEL_LOG(KERNEL_ERROR, "tiling.oneTmpSize must > 0!"); }); + + if ASCEND_IS_AIC { + return; + } + LocalTensor stackBuffer = sharedTmpBuffer.ReinterpretCast(); + ASCENDC_ASSERT((stackBuffer.GetSize() > 0),{ KERNEL_LOG(KERNEL_ERROR, "sharedTmpBuffer Size must > 0!"); }); + + GroupNormParams params; + GetGroupNormNDTensorInfo(inputX, outputMean, outputVariance, stackBuffer, tiling, params); + + GroupNormNDCommon(inputX, gamma, beta, output, outputMean, outputVariance, epsilon, tiling, params); +} + +template +__aicore__ inline void GroupNormImpl(const LocalTensor& output, + const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputX, const LocalTensor& gamma, const LocalTensor& beta, + const T epsilon, GroupNormTiling& tiling) +{ + LocalTensor sharedTmpBuffer; + bool ans = PopStackBuffer(sharedTmpBuffer); + ASCENDC_ASSERT((ans), { 
KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); + + GroupNormImpl(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon, tiling); +} + +} // namespace AscendC +#endif // IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H \ No newline at end of file diff --git a/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..369df14cbb8295180504bec61b500aa4b0265ccc --- /dev/null +++ b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp @@ -0,0 +1,204 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file groupnorm_tiling_impl.cpp + * \brief + */ + +#include "lib/normalization/groupnorm_tiling.h" +#include "impl/host_log.h" + +namespace optiling { + REGISTER_TILING_DATA_CLASS(GroupNormTilingOpApi, GroupNormTiling); +} // namespace optiling +namespace AscendC { + namespace { + constexpr uint32_t GROUPNORM_SIZEOF_FLOAT = 4; + constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2; + constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 32; + constexpr uint32_t GROUPNORM_THREE_TIMES = 3; + constexpr uint32_t GROUPNORM_TWO_TIMES = 2; + constexpr uint32_t GROUPNORM_ONE_NUMBER = 1; + constexpr uint32_t GROUPNORM_ZERO_NUMBER = 0; + constexpr float GROUPNORM_ONE_FLOAT_VALUE = 1.0f; + + constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64; + constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8; + constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255; + constexpr uint32_t GROUPNORM_REDUCESUM1_DST_REPEAT_STRIDE = 8; + constexpr uint32_t GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION = 8; + constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64; + constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8; + + uint32_t GetGroupNormTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource, + uint32_t groupNum, const bool isMaxValue) + { + ASCENDC_HOST_ASSERT(typeSize > 0, return 0, "typeSize must be greater than 0."); + ASCENDC_HOST_ASSERT(groupNum > 0, return 0, "groupNum must be greater than 0."); + + std::vector shapeDims = srcShape.GetDims(); + const uint32_t n = static_cast(shapeDims[0]); + const uint32_t c = static_cast(shapeDims[1]); + const uint32_t h = static_cast(shapeDims[2]); + const uint32_t w = static_cast(shapeDims[3]); + ASCENDC_HOST_ASSERT(c != 0, return 0, "the value of c should not be zero!"); + uint32_t mvTmpLen = n * groupNum * sizeof(float); + uint32_t hwLen = h * w * typeSize; + uint32_t dhwLen = {0}; + + mvTmpLen = (mvTmpLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE * + GROUPNORM_ONE_BLK_SIZE; + + if (isMaxValue) { + dhwLen = n * c * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE * + GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float)); + } else { + dhwLen = c / groupNum * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE * + GROUPNORM_ONE_BLK_SIZE / typeSize * 
diff --git a/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..369df14cbb8295180504bec61b500aa4b0265ccc
--- /dev/null
+++ b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp
@@ -0,0 +1,204 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tiling_impl.cpp
+ * \brief
+ */
+
+#include "lib/normalization/groupnorm_tiling.h"
+#include "impl/host_log.h"
+
+namespace optiling {
+REGISTER_TILING_DATA_CLASS(GroupNormTilingOpApi, GroupNormTiling);
+} // namespace optiling
+namespace AscendC {
+namespace {
+constexpr uint32_t GROUPNORM_SIZEOF_FLOAT = 4;
+constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2;
+constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 32;
+constexpr uint32_t GROUPNORM_THREE_TIMES = 3;
+constexpr uint32_t GROUPNORM_TWO_TIMES = 2;
+constexpr uint32_t GROUPNORM_ONE_NUMBER = 1;
+constexpr uint32_t GROUPNORM_ZERO_NUMBER = 0;
+constexpr float GROUPNORM_ONE_FLOAT_VALUE = 1.0f;
+
+constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64;
+constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8;
+constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255;
+constexpr uint32_t GROUPNORM_REDUCESUM1_DST_REPEAT_STRIDE = 8;
+constexpr uint32_t GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION = 8;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8;
+
+uint32_t GetGroupNormTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+    uint32_t groupNum, const bool isMaxValue)
+{
+    ASCENDC_HOST_ASSERT(typeSize > 0, return 0, "typeSize must be greater than 0.");
+    ASCENDC_HOST_ASSERT(groupNum > 0, return 0, "groupNum must be greater than 0.");
+
+    std::vector<int64_t> shapeDims = srcShape.GetDims();
+    const uint32_t n = static_cast<uint32_t>(shapeDims[0]);
+    const uint32_t c = static_cast<uint32_t>(shapeDims[1]);
+    const uint32_t h = static_cast<uint32_t>(shapeDims[2]);
+    const uint32_t w = static_cast<uint32_t>(shapeDims[3]);
+    ASCENDC_HOST_ASSERT(c != 0, return 0, "the value of c should not be zero!");
+    uint32_t mvTmpLen = n * groupNum * sizeof(float);
+    uint32_t hwLen = h * w * typeSize;
+    uint32_t dhwLen = 0;
+
+    mvTmpLen = (mvTmpLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+        GROUPNORM_ONE_BLK_SIZE;
+
+    if (isMaxValue) {
+        dhwLen = n * c * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+            GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float));
+    } else {
+        dhwLen = c / groupNum * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+            GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float));
+    }
+
+    if (isReuseSource && (typeSize == GROUPNORM_SIZEOF_FLOAT)) {
+        return GROUPNORM_TWO_TIMES * dhwLen + GROUPNORM_TWO_TIMES * mvTmpLen;
+    }
+    return GROUPNORM_THREE_TIMES * dhwLen + GROUPNORM_TWO_TIMES * mvTmpLen;
+}
+} // namespace
+
+void GetGroupNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+    const uint32_t groupNum, uint32_t& maxValue, uint32_t& minValue)
+{
+    maxValue = GetGroupNormTmpSize(srcShape, typeSize, isReuseSource, groupNum, true);
+    minValue = GetGroupNormTmpSize(srcShape, typeSize, isReuseSource, groupNum, false);
+}
+
+void GetGroupNormNDTilingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize,
+    const bool isReuseSource, const uint32_t groupNum, optiling::GroupNormTiling& tiling)
+{
+    ASCENDC_HOST_ASSERT(typeSize > 0, return, "typeSize must be greater than 0.");
+    ASCENDC_HOST_ASSERT(groupNum > 0, return, "groupNum must be greater than 0.");
+
+    std::vector<int64_t> shapeDims = srcShape.GetDims();
+    const uint32_t n = static_cast<uint32_t>(shapeDims[0]);
+    const uint32_t c = static_cast<uint32_t>(shapeDims[1]);
+    const uint32_t h = static_cast<uint32_t>(shapeDims[2]);
+    const uint32_t w = static_cast<uint32_t>(shapeDims[3]);
+    const uint32_t g = groupNum;
+    const uint32_t d = c / groupNum;
+    ASCENDC_HOST_ASSERT(c != 0, return, "the value of c should not be zero!");
+    // align HW to 32 bytes
+    const uint32_t hwAlignSize = (typeSize * h * w + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) /
+        GROUPNORM_ONE_BLK_SIZE * GROUPNORM_ONE_BLK_SIZE / typeSize;
+
+    const uint32_t dhwAlignSize = d * hwAlignSize;
+
+    const uint32_t inputXSize = n * c * hwAlignSize;
+    const uint32_t meanVarSize = n * g;
+
+    const uint32_t oneBlockNum = GROUPNORM_ONE_BLK_SIZE / GROUPNORM_SIZEOF_FLOAT;
+    const uint32_t meanTmpTensorSize = (meanVarSize + oneBlockNum - GROUPNORM_ONE_NUMBER) / oneBlockNum * oneBlockNum;
+
+    uint32_t meanVarTotalSize = 2 * meanTmpTensorSize;
+    if (typeSize == GROUPNORM_SIZEOF_FLOAT) {
+        meanVarTotalSize = GROUPNORM_ZERO_NUMBER;
+    }
+
+    uint32_t numberOfTmpBuf = GROUPNORM_THREE_TIMES;
+    if (isReuseSource && (typeSize == GROUPNORM_SIZEOF_FLOAT)) {
+        numberOfTmpBuf = GROUPNORM_TWO_TIMES;
+    }
+
+    const uint32_t tmpBufSize = stackBufferSize / GROUPNORM_ONE_BLK_SIZE * GROUPNORM_ONE_BLK_SIZE /
+        GROUPNORM_SIZEOF_FLOAT;
+    uint32_t oneTmpSize = (tmpBufSize - meanVarTotalSize) / numberOfTmpBuf;
+
+    // To let the mean/variance tensors be consumed with Add directly (no GetValue), each
+    // iteration must process a multiple of 8 groups.
+    uint32_t bsCurLength = oneTmpSize / (GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION * d * hwAlignSize) *
+        GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+
+    // check whether the small-shape path applies
+    uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE;
+    while ((dhwAlignSize / (GROUPNORM_ONE_BLK_SIZE / GROUPNORM_SIZEOF_FLOAT)) % k != 0) {
+        k--;
+    }
+    const bool smallShape = (hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) &&
+        (hwAlignSize * d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k);
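+    // e.g. H = W = 8 with float gives hwAlignSize = 64 <= 64, and with d = 4 channels per
+    // group dhwAlignSize = 256, so k stays 8 and 256 <= 64 * 8 holds: the shape is "small".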
+
+    // Constraints from the level-0 ReduceSum interface: derive the mask/repeat of the two
+    // ReduceSum calls from DHW, together with the valid range of DHW/bsCurLength.
+    if (smallShape) {
+        uint32_t mask1{GROUPNORM_MAX_MASK_VAL};
+        if (dhwAlignSize > GROUPNORM_MAX_MASK_VAL) {
+            while (mask1 != 0 && dhwAlignSize % mask1 != 0) {
+                mask1 -= GROUPNORM_STEP_MASK_VAL;
+            }
+        } else {
+            mask1 = dhwAlignSize;
+        }
+        ASCENDC_HOST_ASSERT(mask1 > 0, return, "mask1 must be greater than 0.");
+        const uint32_t maxBsCurLength = (GROUPNORM_MAX_REPEAT_VAL / (dhwAlignSize / mask1) /
+            GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION) * GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+        if (maxBsCurLength < bsCurLength) {
+            bsCurLength = maxBsCurLength;
+        }
+    }
+
+    // For half, gamma/beta are cast into a tmp buffer of oneTmpSize floats, so the buffer
+    // must be able to hold all C channel values.
+    if (typeSize == GROUPNORM_SIZEOF_HALF && bsCurLength * dhwAlignSize < c) {
+        return;
+    }
+
+    oneTmpSize = bsCurLength * d * hwAlignSize;
+
+    if (oneTmpSize > inputXSize) {
+        bsCurLength = meanVarSize;
+        oneTmpSize = inputXSize;
+    }
+
+    ASCENDC_HOST_ASSERT((oneTmpSize != GROUPNORM_ZERO_NUMBER), return, "the oneTmpSize should not be zero!");
+    if (oneTmpSize == GROUPNORM_ZERO_NUMBER) {
+        return;
+    }
+
+    const uint32_t inputRoundSize = oneTmpSize;
+    const uint32_t inputTailSize = inputXSize % oneTmpSize;
+
+    const uint32_t meanVarRoundSize = inputRoundSize / dhwAlignSize;
+    const uint32_t meanVarTailSize = inputTailSize / dhwAlignSize;
+
+    tiling.set_n(n);
+    tiling.set_c(c);
+    tiling.set_hw(h * w);
+    tiling.set_g(g);
+    tiling.set_d(d);
+    tiling.set_hwAlignSize(hwAlignSize);
+    tiling.set_dhwAlignSize(dhwAlignSize);
+    tiling.set_inputXSize(inputXSize);
+    tiling.set_meanVarSize(meanVarSize);
+    tiling.set_numberOfTmpBuf(numberOfTmpBuf);
+    tiling.set_meanTmpTensorPos(GROUPNORM_ZERO_NUMBER);
+    tiling.set_meanTmpTensorSize(meanTmpTensorSize);
+    tiling.set_varianceTmpTensorPos(meanTmpTensorSize);
+    tiling.set_varianceTmpTensorSize(meanTmpTensorSize);
+    tiling.set_tmpBufSize(tmpBufSize);
+    tiling.set_oneTmpSize(oneTmpSize);
+    tiling.set_firstTmpStartPos(meanVarTotalSize);
+    tiling.set_secondTmpStartPos(meanVarTotalSize + oneTmpSize);
+    tiling.set_thirdTmpStartPos(meanVarTotalSize + GROUPNORM_TWO_TIMES * oneTmpSize);
+    tiling.set_loopRound(inputXSize / oneTmpSize);
+    tiling.set_inputRoundSize(inputRoundSize);
+    tiling.set_inputTailSize(inputTailSize);
+    tiling.set_inputTailPos(inputXSize - inputTailSize);
+    tiling.set_meanVarRoundSize(meanVarRoundSize);
+    tiling.set_meanVarTailSize(meanVarTailSize);
+    tiling.set_meanVarTailPos(meanVarSize - meanVarTailSize);
+    tiling.set_bshCurLength(inputRoundSize);
+    tiling.set_bsCurLength(bsCurLength);
+    tiling.set_factor(GROUPNORM_ONE_FLOAT_VALUE / (d * h * w));
+    tiling.set_smallShape(smallShape);
+}
+} // namespace AscendC
\ No newline at end of file
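A hedged host-side sketch of how the two entry points combine (shape and group count mirror the unit test at the bottom of this patch; `PrepareGroupNormTiling` is a hypothetical helper):

```cpp
#include "lib/normalization/groupnorm_tiling.h"

void PrepareGroupNormTiling(optiling::GroupNormTiling& tiling)
{
    ge::Shape srcShape({2, 16, 8, 8});  // [N, C, H, W]
    const uint32_t typeSize = sizeof(float);
    const uint32_t groupNum = 4;

    uint32_t maxValue = 0;
    uint32_t minValue = 0;
    AscendC::GetGroupNormMaxMinTmpSize(srcShape, typeSize, false, groupNum, maxValue, minValue);

    // Any stack buffer of at least minValue bytes works; here we assume the
    // caller reserves maxValue bytes of temporary space.
    AscendC::GetGroupNormNDTilingInfo(srcShape, maxValue, typeSize, false, groupNum, tiling);
}
```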
diff --git a/lib/normalization/groupnorm.h b/lib/normalization/groupnorm.h
new file mode 100644
index 0000000000000000000000000000000000000000..a80458c9e7c3b94bb698e985b2db18f9bc779469
--- /dev/null
+++ b/lib/normalization/groupnorm.h
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_H
+#define LIB_NORMALIZATION_GROUPNORM_H
+#if __CCE_AICORE__ == 220
+
+#include "kernel_tensor.h"
+#include "../../impl/normalization/groupnorm/groupnorm_common_impl.h"
+#include "kernel_tiling/kernel_tiling.h"
+namespace AscendC {
+#pragma begin_pipe(V)
+/*!
+ * \brief Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization.
+ *
+ * \note supported data types: half and float
+ *
+ * \param [out] output, output LocalTensor, shape is [n, C, H, W]
+ * \param [out] outputMean, output LocalTensor, shape is [n, groupNum]
+ * \param [out] outputVariance, output LocalTensor, shape is [n, groupNum]
+ * \param [in] inputX, input LocalTensor, shape is [n, C, H, W]
+ * \param [in] gamma, input LocalTensor, shape is [C]
+ * \param [in] beta, input LocalTensor, shape is [C]
+ * \param [in] sharedTmpBuffer, input local temporary Tensor
+ * \param [in] epsilon, small constant added to the variance for numerical stability
+ * \param [in] tiling, GroupNorm tiling
+ */
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean,
+    const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma,
+    const LocalTensor<T>& beta, const LocalTensor<uint8_t>& sharedTmpBuffer, const T epsilon, GroupNormTiling& tiling)
+{
+    GroupNormImpl<T, isReuseSource>(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon,
+        tiling);
+}
+
+/*!
+ * \brief Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization.
+ *
+ * \note supported data types: half and float
+ *
+ * \param [out] output, output LocalTensor, shape is [n, C, H, W]
+ * \param [out] outputMean, output LocalTensor, shape is [n, groupNum]
+ * \param [out] outputVariance, output LocalTensor, shape is [n, groupNum]
+ * \param [in] inputX, input LocalTensor, shape is [n, C, H, W]
+ * \param [in] gamma, input LocalTensor, shape is [C]
+ * \param [in] beta, input LocalTensor, shape is [C]
+ * \param [in] epsilon, small constant added to the variance for numerical stability
+ * \param [in] tiling, GroupNorm tiling
+ */
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean,
+    const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma,
+    const LocalTensor<T>& beta, const T epsilon, GroupNormTiling& tiling)
+{
+    GroupNormImpl<T, isReuseSource>(output, outputMean, outputVariance, inputX, gamma, beta, epsilon, tiling);
+}
+#pragma end_pipe
+} // namespace AscendC
+#endif
+#endif // LIB_NORMALIZATION_GROUPNORM_H
\ No newline at end of file
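For orientation, a minimal kernel-side sketch of calling the second overload (the wrapper name and the assumption that all operands already reside in the unified buffer are ours; see the unit test below for a complete setup):

```cpp
#include "kernel_operator.h"
#include "lib/normalization/groupnorm.h"

// Hypothetical wrapper: y, mean, var, x, gamma, beta are LocalTensor<half> views that a real
// kernel would first fill via DataCopy/DataCopyPad; the tiling comes from the host side.
__aicore__ inline void CallGroupNorm(const AscendC::LocalTensor<half>& y,
    const AscendC::LocalTensor<half>& mean, const AscendC::LocalTensor<half>& var,
    const AscendC::LocalTensor<half>& x, const AscendC::LocalTensor<half>& gamma,
    const AscendC::LocalTensor<half>& beta, GroupNormTiling& tiling)
{
    const half epsilon = 0.001;
    // Without an explicit sharedTmpBuffer this overload pops its scratch
    // space from the framework's stack buffer internally.
    AscendC::GroupNorm<half, false>(y, mean, var, x, gamma, beta, epsilon, tiling);
}
```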
diff --git a/lib/normalization/groupnorm_tiling.h b/lib/normalization/groupnorm_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc5e0da8f149729b58d34a29e6c26d41b9640727
--- /dev/null
+++ b/lib/normalization/groupnorm_tiling.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tiling.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_TILING_H
+#define LIB_NORMALIZATION_GROUPNORM_TILING_H
+#include "graph/tensor.h"
+#include "groupnorm_tilingdata.h"
+namespace AscendC {
+/*!
+ * \brief Calculates the max and min tmp buffer sizes for the GroupNorm interface.
+ * \param [in] srcShape: input shape
+ * \param [in] typeSize: data type size: sizeof(TYPE)
+ * \param [in] isReuseSource: indicates whether to reuse the source tensor.
+ *             When isReuseSource is enabled, the src tensor is used as a tmp buffer for the calculation.
+ * \param [in] groupNum: number of groups to separate the channels into
+ * \param [out] maxValue: max size required for the tmp buffer
+ * \param [out] minValue: min size required for the tmp buffer
+ */
+void GetGroupNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+    const uint32_t groupNum, uint32_t& maxValue, uint32_t& minValue);
+
+/*!
+ * \brief Calculates the tiling params for the GroupNorm interface.
+ *
+ * \note stackBufferSize should be greater than the min tmp size reported by GetGroupNormMaxMinTmpSize;
+ *       if the src shape is illegal or stackBufferSize is not big enough, the tiling is left unfilled.
+ *
+ * \param [in] srcShape input shape
+ * \param [in] stackBufferSize input stack buffer size in units of bytes, used as the tmp buffer size for tiling
+ * \param [in] typeSize data type size: sizeof(TYPE)
+ * \param [in] isReuseSource indicates whether intermediate variables can reuse the input memory
+ * \param [in] groupNum: number of groups to separate the channels into
+ * \param [out] tiling GroupNorm tiling
+ */
+void GetGroupNormNDTilingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize,
+    const bool isReuseSource, const uint32_t groupNum, optiling::GroupNormTiling& tiling);
+} // namespace AscendC
+#endif // LIB_NORMALIZATION_GROUPNORM_TILING_H
diff --git a/lib/normalization/groupnorm_tilingdata.h b/lib/normalization/groupnorm_tilingdata.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f12c3358ac2c787eefe1085afa2f44febbd99f9
--- /dev/null
+++ b/lib/normalization/groupnorm_tilingdata.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tilingdata.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
+#define LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(GroupNormTiling)
+    TILING_DATA_FIELD_DEF(uint32_t, n);
+    TILING_DATA_FIELD_DEF(uint32_t, c);
+    TILING_DATA_FIELD_DEF(uint32_t, hw);
+    TILING_DATA_FIELD_DEF(uint32_t, g);
+    TILING_DATA_FIELD_DEF(uint32_t, d);
+    TILING_DATA_FIELD_DEF(uint32_t, hwAlignSize);
+    TILING_DATA_FIELD_DEF(uint32_t, dhwAlignSize);
+    TILING_DATA_FIELD_DEF(uint32_t, inputXSize);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarSize);
+    TILING_DATA_FIELD_DEF(uint32_t, numberOfTmpBuf);
+    TILING_DATA_FIELD_DEF(uint32_t, meanTmpTensorPos);
+    TILING_DATA_FIELD_DEF(uint32_t, meanTmpTensorSize);
+    TILING_DATA_FIELD_DEF(uint32_t, varianceTmpTensorPos);
+    TILING_DATA_FIELD_DEF(uint32_t, varianceTmpTensorSize);
+    TILING_DATA_FIELD_DEF(uint32_t, tmpBufSize);
+    TILING_DATA_FIELD_DEF(uint32_t, oneTmpSize);
+    TILING_DATA_FIELD_DEF(uint32_t, firstTmpStartPos);
+    TILING_DATA_FIELD_DEF(uint32_t, secondTmpStartPos);
+    TILING_DATA_FIELD_DEF(uint32_t, thirdTmpStartPos);
+    TILING_DATA_FIELD_DEF(uint32_t, loopRound);
+    TILING_DATA_FIELD_DEF(uint32_t, inputRoundSize);
+    TILING_DATA_FIELD_DEF(uint32_t, inputTailSize);
+    TILING_DATA_FIELD_DEF(uint32_t, inputTailPos);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarRoundSize);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarTailSize);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarTailPos);
+    TILING_DATA_FIELD_DEF(uint32_t, bshCurLength);
+    TILING_DATA_FIELD_DEF(uint32_t, bsCurLength);
+    TILING_DATA_FIELD_DEF(float, factor);
+    TILING_DATA_FIELD_DEF(bool, smallShape);
+END_TILING_DATA_DEF;
+} // namespace optiling
+#endif // LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6d2e7694d3ebae87d71c1eefb1cf935f55ce6e28..80d3de2f3926b0a84fc46dd652a930f87a3072b2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -78,6 +78,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES
     ${ASCENDC_TESTS_DIR}/normalization/batchnorm/test_operator_batchnorm.cpp
     ${ASCENDC_TESTS_DIR}/normalization/deepnorm/test_operator_deepnorm.cpp
     ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernorm.cpp
+    ${ASCENDC_TESTS_DIR}/normalization/groupnorm/test_operator_groupnorm.cpp
     # ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgrad.cpp
     ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgradbeta.cpp
     ${ASCENDC_TESTS_DIR}/normalization/rmsnorm/test_operator_rmsnorm.cpp
diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66088fdea6caefc167e6b72252d3a325ab4e437f
--- /dev/null
+++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
@@ -0,0 +1,312 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file test_operator_groupnorm.cpp
+ * \brief
+ */
+
+#include <gtest/gtest.h>
+#define private public
+#define protected public
+#include "kernel_operator.h"
+
+using namespace std;
+using namespace AscendC;
+constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2;
+
+constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64;
+constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8;
+constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255;
+constexpr uint32_t GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION = 8;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8;
+
+__aicore__ inline void GetGroupNormNDTilingInfo(const ShapeInfo& inputShapeInfo, const uint32_t stackBufferSize,
+    const uint32_t typeSize, const bool isReuseSource, const uint32_t groupNum, GroupNormTiling& tiling)
+{
+    uint32_t n = inputShapeInfo.shape[0];
+    uint32_t c = inputShapeInfo.shape[1];
+    uint32_t h = inputShapeInfo.shape[2];
+    uint32_t w = inputShapeInfo.shape[3];
+
+    tiling.n = n;
+    tiling.c = c;
+    tiling.hw = h * w;
+    tiling.g = groupNum;
+    tiling.d = c / tiling.g;
+    tiling.hwAlignSize = (typeSize * tiling.hw + ONE_BLK_SIZE - 1) /
+        ONE_BLK_SIZE * ONE_BLK_SIZE / typeSize;
+    tiling.dhwAlignSize = tiling.d * tiling.hwAlignSize;
+
+    tiling.inputXSize = n * c * tiling.hwAlignSize;
+    tiling.meanVarSize = n * tiling.g;
+
+    uint32_t oneBlockNum = ONE_BLK_SIZE / B32_BYTE_SIZE;
+    tiling.meanTmpTensorPos = 0;
+    tiling.meanTmpTensorSize = (tiling.meanVarSize + oneBlockNum - 1) / oneBlockNum * oneBlockNum;
+    tiling.varianceTmpTensorPos = tiling.meanTmpTensorSize;
+    tiling.varianceTmpTensorSize = tiling.meanTmpTensorSize;
+
+    uint32_t meanVarTotalSize = tiling.meanTmpTensorSize + tiling.varianceTmpTensorSize;
+    if (typeSize == B32_BYTE_SIZE) {
+        meanVarTotalSize = 0;
+    }
+
+    tiling.numberOfTmpBuf = THREE_OF_STACK_BUFFER;
+    if (isReuseSource && (typeSize == B32_BYTE_SIZE)) {
+        tiling.numberOfTmpBuf = TWO_OF_STACK_BUFFER;
+    }
+
+    tiling.tmpBufSize = stackBufferSize / ONE_BLK_SIZE * ONE_BLK_SIZE / B32_BYTE_SIZE;
+    tiling.oneTmpSize = (tiling.tmpBufSize - meanVarTotalSize) / tiling.numberOfTmpBuf;
+
+    // To let the mean/variance tensors be consumed with Add directly (no GetValue), each
+    // iteration must process a multiple of 8 groups.
+    tiling.bsCurLength = tiling.oneTmpSize / (GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION * tiling.d * tiling.hwAlignSize) *
+        GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+
+    // check whether the small-shape path applies
+    uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE;
+    while ((tiling.dhwAlignSize / (ONE_BLK_SIZE / B32_BYTE_SIZE)) % k != 0) {
+        k--;
+    }
+    tiling.smallShape = (tiling.hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) &&
+        (tiling.hwAlignSize * tiling.d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k);
+
+    // Constraints from the level-0 ReduceSum interface: derive the mask/repeat of the two
+    // ReduceSum calls from DHW, together with the valid range of DHW/bsCurLength.
+    if (tiling.smallShape) {
+        uint32_t mask1{GROUPNORM_MAX_MASK_VAL};
+        if (tiling.dhwAlignSize > GROUPNORM_MAX_MASK_VAL) {
+            while (mask1 != 0 && tiling.dhwAlignSize % mask1 != 0) {
+                mask1 -= GROUPNORM_STEP_MASK_VAL;
+            }
+        } else {
+            mask1 = tiling.dhwAlignSize;
+        }
+        uint32_t max_bsCurLength = (GROUPNORM_MAX_REPEAT_VAL / (tiling.dhwAlignSize / mask1) /
+            GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION) * GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+        if (max_bsCurLength < tiling.bsCurLength) {
+            tiling.bsCurLength = max_bsCurLength;
+        }
+    }
+
+    if (typeSize == GROUPNORM_SIZEOF_HALF && tiling.bsCurLength * tiling.dhwAlignSize < c) {
+        return;
+    }
+
+    tiling.oneTmpSize = tiling.bsCurLength * tiling.d * tiling.hwAlignSize;
+
+    if (tiling.oneTmpSize > tiling.inputXSize) {
+        tiling.bsCurLength = tiling.meanVarSize;
+        tiling.oneTmpSize = tiling.inputXSize;
+    }
+
+    if (tiling.oneTmpSize == 0) {
+        return;
+    }
+
+    tiling.firstTmpStartPos = meanVarTotalSize;
+    tiling.secondTmpStartPos = tiling.firstTmpStartPos + tiling.oneTmpSize;
+    tiling.thirdTmpStartPos = tiling.secondTmpStartPos + tiling.oneTmpSize;
+
+    tiling.loopRound = tiling.inputXSize / tiling.oneTmpSize;
+
+    tiling.inputRoundSize = tiling.oneTmpSize;
+    tiling.inputTailSize = tiling.inputXSize % tiling.oneTmpSize;
+    tiling.inputTailPos = tiling.inputXSize - tiling.inputTailSize;
+
+    tiling.meanVarRoundSize = tiling.inputRoundSize / tiling.dhwAlignSize;
+    tiling.meanVarTailSize = tiling.inputTailSize / tiling.dhwAlignSize;
+    tiling.meanVarTailPos = tiling.meanVarSize - tiling.meanVarTailSize;
+
+    tiling.bshCurLength = tiling.inputRoundSize;
+
+    tiling.factor = 1.0f / (tiling.d * tiling.hw);
+    cout << tiling.n << ", " << tiling.c << ", " << tiling.hw << ", " << tiling.g << ", " << tiling.hwAlignSize << endl;
+    cout << "inputXSize: " << tiling.inputXSize << endl;
+    cout << "meanVarSize: " << tiling.meanVarSize << endl;
+    cout << "numberOfTmpBuf: " << tiling.numberOfTmpBuf << endl;
+    cout << "meanTmpTensorPos: " << tiling.meanTmpTensorPos << endl;
+    cout << "varianceTmpTensorPos: " << tiling.varianceTmpTensorPos << endl;
+    cout << "oneTmpSize: " << tiling.oneTmpSize << endl;
+    cout << "firstTmpStartPos: " << tiling.firstTmpStartPos << endl;
+    cout << "thirdTmpStartPos: " << tiling.thirdTmpStartPos << endl;
+    cout << "bsCurLength: " << tiling.bsCurLength << endl;
+    cout << "bshCurLength: " << tiling.bshCurLength << endl;
+    cout << "factor: " << tiling.factor << endl;
+    cout << "hwAlignSize: " << tiling.hwAlignSize << endl;
+    cout << "smallShape: " << tiling.smallShape << endl;
+}
+
+// __aicore__ inline void MainGroupnormTest(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm, GM_ADDR outputGm,
+//     GM_ADDR outputMeanGm, GM_ADDR outputVarianceGm, uint32_t n, uint32_t c, uint32_t h, uint32_t w, uint32_t g)
+template <typename dataType, bool isReuseSource = false>
+__aicore__ inline void MainGroupnormTest(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm, GM_ADDR outputGm,
+    uint32_t n, uint32_t c, uint32_t h, uint32_t w, uint32_t g)
+{
+    dataType epsilon = 0.001;
+    DataFormat dataFormat = DataFormat::ND;
+
+    GlobalTensor<dataType> inputXGlobal;
+    GlobalTensor<dataType> gammGlobal;
+    GlobalTensor<dataType> betaGlobal;
+    GlobalTensor<dataType> outputGlobal;
+    // GlobalTensor<dataType> outputMeanGlobal;
+    // GlobalTensor<dataType> outputVarianceGlobal;
+
+    uint32_t bshLength = n * c * h * w;
+    uint32_t bsLength = g * n;
+
+    inputXGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(inputXGm), bshLength);
+    gammGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(gammGm), c);
+    betaGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(betaGm), c);
+
+    outputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputGm), bshLength);
+    // outputMeanGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputMeanGm), bsLength);
+    // outputVarianceGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputVarianceGm), bsLength);
+
+    TPipe pipe;
+    TQue<TPosition::VECIN, 1> inQueueX;
+    TQue<TPosition::VECIN, 1> inQueueGamma;
+    TQue<TPosition::VECIN, 1> inQueueBeta;
+    TQue<TPosition::VECOUT, 1> outQueue;
+    // TQue<TPosition::VECOUT, 1> outQueueMean;
+    // TQue<TPosition::VECOUT, 1> outQueueVariance;
+    TBuf<TPosition::VECCALC> meanBuffer, varBuffer;
+
+    uint32_t hwAlignSize = (sizeof(dataType) * h * w + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE /
+        sizeof(dataType);
+    pipe.InitBuffer(inQueueX, 1, sizeof(dataType) * n * c * hwAlignSize);
+    pipe.InitBuffer(inQueueGamma, 1, (sizeof(dataType) * c + 31) / 32 * 32);
+    pipe.InitBuffer(inQueueBeta, 1, (sizeof(dataType) * c + 31) / 32 * 32);
+    pipe.InitBuffer(outQueue, 1, sizeof(dataType) * n * c * hwAlignSize);
+    // pipe.InitBuffer(outQueueMean, 1, (sizeof(dataType) * g * n + 31) / 32 * 32);
+    // pipe.InitBuffer(outQueueVariance, 1, (sizeof(dataType) * g * n + 31) / 32 * 32);
+    pipe.InitBuffer(meanBuffer, (sizeof(dataType) * g * n + 31) / 32 * 32);
+    pipe.InitBuffer(varBuffer, (sizeof(dataType) * g * n + 31) / 32 * 32);
+
+    LocalTensor<dataType> inputXLocal = inQueueX.AllocTensor<dataType>();
+    LocalTensor<dataType> gammaLocal = inQueueGamma.AllocTensor<dataType>();
+    LocalTensor<dataType> betaLocal = inQueueBeta.AllocTensor<dataType>();
+    LocalTensor<dataType> outputLocal = outQueue.AllocTensor<dataType>();
+    // LocalTensor<dataType> meanLocal = outQueueMean.AllocTensor<dataType>();
+    // LocalTensor<dataType> varianceLocal = outQueueVariance.AllocTensor<dataType>();
+    LocalTensor<dataType> meanLocal = meanBuffer.Get<dataType>();
+    LocalTensor<dataType> varianceLocal = varBuffer.Get<dataType>();
+
+    DataCopyParams copyParams{static_cast<uint16_t>(n * c), static_cast<uint16_t>(h * w * sizeof(dataType)), 0, 0};
+    DataCopyPadParams padParams{true, 0, static_cast<uint8_t>(hwAlignSize - h * w), 0};
+    DataCopyPad(inputXLocal, inputXGlobal, copyParams, padParams);
+    // DataCopy(inputXLocal, inputXGlobal, bshLength);
+    DataCopyParams copyParamsGamma{1, static_cast<uint16_t>(c * sizeof(dataType)), 0, 0};
+    DataCopyPadParams padParamsGamma{false, 0, 0, 0};
+    DataCopyPad(gammaLocal, gammGlobal, copyParamsGamma, padParamsGamma);
+    DataCopyPad(betaLocal, betaGlobal, copyParamsGamma, padParamsGamma);
+
+    // DataCopy(gammaLocal, gammGlobal, c);
+    // DataCopy(betaLocal, betaGlobal, c);
+    PipeBarrier<PIPE_ALL>();
+
+    uint32_t stackBufferSize = 0;
+    {
+        LocalTensor<uint8_t> stackBuffer;
+        bool ans = PopStackBuffer<uint8_t, TPosition::LCM>(stackBuffer);
+        stackBufferSize = stackBuffer.GetSize();
+    }
+
+    GroupNormTiling groupNormTiling;
+    uint32_t inputShape[4] = {n, c, h, w};
+    ShapeInfo shapeInfo{ (uint8_t)4, inputShape, (uint8_t)4, inputShape, dataFormat };
+
+    GetGroupNormNDTilingInfo(shapeInfo, stackBufferSize, sizeof(dataType), isReuseSource, g, groupNormTiling);
+
+    GroupNorm<dataType, isReuseSource>(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal,
+        (dataType)epsilon, groupNormTiling);
+    PipeBarrier<PIPE_ALL>();
+
+    // DataCopy(outputGlobal, outputLocal, bshLength);
+    DataCopyPad(outputGlobal, outputLocal, copyParams);
+    // DataCopy(outputMeanGlobal, meanLocal, bsLength);
+    // DataCopy(outputVarianceGlobal, varianceLocal, bsLength);
+
+    inQueueX.FreeTensor(inputXLocal);
+    inQueueGamma.FreeTensor(gammaLocal);
+    inQueueBeta.FreeTensor(betaLocal);
+    outQueue.FreeTensor(outputLocal);
+    // outQueueMean.FreeTensor(meanLocal);
+    // outQueueVariance.FreeTensor(varianceLocal);
+    PipeBarrier<PIPE_ALL>();
+}
+
+struct groupnormTestParams {
+    uint32_t n;
+    uint32_t c;
+    uint32_t h;
+    uint32_t w;
+    uint32_t g;
+    uint32_t typeSize;
+    void (*cal_func)(uint8_t*, uint8_t*, uint8_t*, uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);
+};
+
+class groupnormTestSuite : public testing::Test, public testing::WithParamInterface<groupnormTestParams> {
+protected:
+    static void SetUpTestCase()
+    {
+        std::cout << "groupnormTestSuite SetUpTestCase" << std::endl;
+    }
+    static void TearDownTestCase()
+    {
+        std::cout << "groupnormTestSuite TearDownTestCase" << std::endl;
+    }
+    virtual void SetUp() {}
+    virtual void TearDown() {}
+};
+
+INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_groupnorm, groupnormTestSuite,
+    ::testing::Values(
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(float), MainGroupnormTest<float> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(half), MainGroupnormTest<half> },
+        groupnormTestParams { 2, 16, 9, 9, 4, sizeof(float), MainGroupnormTest<float> },
+        groupnormTestParams { 2, 16, 9, 9, 4, sizeof(half), MainGroupnormTest<half> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(float), MainGroupnormTest<float, true> },
+        groupnormTestParams { 2, 16, 9, 9, 4, sizeof(float), MainGroupnormTest<float, true> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(half), MainGroupnormTest<half, true> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(half), MainGroupnormTest<half, true> }
+    ));
+
+TEST_P(groupnormTestSuite, GroupnormTestCase)
+{
+    auto param = GetParam();
+
+    uint32_t n = param.n;
+    uint32_t c = param.c;
+    uint32_t h = param.h;
+    uint32_t w = param.w;
+    uint32_t g = param.g;
+    uint32_t typeSize = param.typeSize;
+
+    uint32_t bshLength = n * c * h * w;
+    uint32_t bsLength = n * c / g;
+
+    uint8_t inputXGm[bshLength * typeSize] { 0x00 };
+    uint8_t gammGm[c * typeSize] { 0x00 };
+    uint8_t betaGm[c * typeSize] { 0x00 };
+
+    uint8_t outputGm[bshLength * typeSize] {0x00};
+    // uint8_t outputMeanGm[bsLength * typeSize] {0x00};
+    // uint8_t outputVarianceGm[bsLength * typeSize] {0x00};
+
+    param.cal_func(inputXGm, gammGm, betaGm, outputGm, n, c, h, w, g);
+
+    for (uint32_t i = 0; i < bshLength * typeSize; i++) {
+        EXPECT_EQ(outputGm[i], 0x00);
+    }
+}
\ No newline at end of file
diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp
index ec41be3a6f16c1445dd5f234d25ca81221eccb0e..1013bdcac23b29ad29716d3d8e4321a551392789 100644
--- a/tests/tiling/test_tiling.cpp
+++ b/tests/tiling/test_tiling.cpp
@@ -1396,6 +1396,28 @@ TEST_F(TestTiling, TestLayernormTiling)
     EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float));
 }
 
+TEST_F(TestTiling, TestGroupnormTiling)
+{
+    const uint32_t stackBufferSize = 100 * 1024;
+    const uint32_t typeSize = 4;
+    const uint32_t groupNum = 4;
+
+    std::vector<int64_t> shapeDims = { 16, 16, 8, 8 };
+    auto groupnormShape = ge::Shape(shapeDims);
+    const bool isReuseSource = false;
+    optiling::GroupNormTiling tilling;
+
+    uint32_t minValue = 0;
+    uint32_t maxValue = 0;
+
+    AscendC::GetGroupNormMaxMinTmpSize(groupnormShape, typeSize, isReuseSource, groupNum, maxValue, minValue);
+    EXPECT_EQ(maxValue, 3 * (16 * 16 * 8 * 8) * typeSize + 2 * groupNum * 16 * typeSize);
+    EXPECT_EQ(minValue, 3 * (16 / 4 * 8 * 8) * typeSize + 2 * groupNum * 16 * typeSize);
+
+    AscendC::GetGroupNormNDTilingInfo(groupnormShape, stackBufferSize, typeSize, isReuseSource, groupNum, tilling);
+    EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float));
+}
+
 TEST_F(TestTiling, TestRmsnormTiling)
 {
     constexpr uint32_t bLength = 4;