diff --git a/cmake/kernel_headers.cmake b/cmake/kernel_headers.cmake
index 5c5a256621cfdf4b7299e6d9d51b736ae82454d9..376dc2ba07493f721de1273b7ced06d4bd22223d 100644
--- a/cmake/kernel_headers.cmake
+++ b/cmake/kernel_headers.cmake
@@ -82,7 +82,7 @@ file(CREATE_LINK ../normalization/kernel_operator_layernormgradbeta_intf.h
file(CREATE_LINK ../normalization/layernormgradbeta.h
${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/layernormgrad/layernormgradbeta.h SYMBOLIC)
-# layernorm
+# matmul
file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matrix)
file(CREATE_LINK ../matmul ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matrix/matmul SYMBOLIC)
file(CREATE_LINK matmul/matmul_intf.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matmul_intf.h SYMBOLIC)
diff --git a/docs/README.md b/docs/README.md
index 9c5a21c79c6adc3df83835375ae99fe22548eed8..aaa4830cc5c3c4ae9a359122783b22b5de41b216 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -22,7 +22,7 @@
 Performs element-wise quantization, e.g. quantizing half/float data types to the int8_t data type. |
- | Data normalization |
+ Data normalization |
 BatchNorm |
 For the samples in each batch, normalizes every input feature along the batch dimension. |
@@ -46,6 +46,10 @@
 RmsNorm |
 Applies RmsNorm normalization to input data of shape [B,S,H]. |
+
+ | GroupNorm |
+ Splits the input along the channel dimension into groups and normalizes each group. |
+
 | Activation functions |
 AdjustSoftMaxRes |
diff --git a/impl/normalization/groupnorm/groupnorm_common_impl.h b/impl/normalization/groupnorm/groupnorm_common_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8dd23e0c1c63b684f34bacee87d91201e27dc88
--- /dev/null
+++ b/impl/normalization/groupnorm/groupnorm_common_impl.h
@@ -0,0 +1,467 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_common_impl.h
+ * \brief
+ */
+
+#ifndef IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H
+#define IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H
+
+#include "kernel_tensor.h"
+#include "kernel_pop_stack_buffer.h"
+#include "kernel_tiling/kernel_tiling.h"
+
+namespace AscendC {
+ namespace {
+ constexpr uint32_t GROUPNORM_MASK_MAX_VAL = 64;
+ constexpr uint32_t GROUPNORM_MASK_SMALLEST_VAL = 8;
+ constexpr uint32_t GROUPNORM_MASK_STEP_VAL = 8;
+ constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 8;
+ } // namespace
+
+template <typename T> struct GroupNormParams
+{
+    __aicore__ GroupNormParams(){};
+    LocalTensor<T> tempTensorA;
+    LocalTensor<T> tempTensorB;
+    LocalTensor<T> tempTensorC;
+    LocalTensor<T> meanTmpTensor;
+    LocalTensor<T> varianceTmpTensor;
+};
+
+__aicore__ inline uint32_t GetGroupNormWholeReduceMask1(const GroupNormTiling& tiling)
+{
+ uint32_t mask1{0};
+ if (tiling.dhwAlignSize > GROUPNORM_MASK_MAX_VAL) {
+ mask1 = GROUPNORM_MASK_MAX_VAL;
+ while (mask1 != 0 && tiling.dhwAlignSize % mask1 != 0) {
+ mask1 -= GROUPNORM_MASK_STEP_VAL;
+ }
+ return mask1;
+ }
+ return tiling.dhwAlignSize;
+}
+
+__aicore__ inline void GetGroupNormOutputMean(const LocalTensor<float>& x_in,
+    const LocalTensor<float>& tmp, const LocalTensor<float>& mean,
+    const GroupNormTiling& tiling)
+{
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        ReduceSum(mean[i], x_in[buffIndex], tmp[buffIndex], tiling.dhwAlignSize);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Muls(mean, mean, tiling.factor, tiling.bsCurLength);
+
+    // mean will be read via GetValue() to obtain scalar values
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+}
+
+__aicore__ inline void GetGroupNormOutputVar(const LocalTensor<float>& x_in,
+    const LocalTensor<float>& tmp1, const LocalTensor<float>& tmp2,
+    const LocalTensor<float>& mean, const LocalTensor<float>& var, const GroupNormTiling& tiling)
+{
+    for (uint32_t i = 0; i < tiling.d * tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.hwAlignSize;
+        Adds(tmp1[buffIndex], x_in[buffIndex], -1.0f * mean.GetValue(i / tiling.d), tiling.hw);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tmp2, tmp1, tmp1, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        ReduceSum(var[i], tmp2[buffIndex], tmp2[buffIndex], tiling.dhwAlignSize);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Muls(var, var, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GetGroupNormOutputPre(const LocalTensor<float>& inout,
+    const LocalTensor<float>& tmp, const LocalTensor<float>& variance,
+    const GroupNormTiling& tiling, const float epsilon)
+{
+    Adds(tmp, variance, epsilon, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    Ln(tmp, tmp, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    // multiply by -0.5f so that Exp yields the reciprocal of the standard deviation
+    Muls(tmp, tmp, -0.5f, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    Exp(tmp, tmp, tiling.bsCurLength);
+
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    // pre norm: scale each group by 1 / sqrt(var + epsilon)
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Muls(inout[buffIndex], inout[buffIndex], tmp.GetValue(i), tiling.dhwAlignSize);
+    }
+
+    // tmp will be written again later
+    auto eventIdSToV = GetTPipePtr()->FetchEventID(HardEvent::S_V);
+    SetFlag<HardEvent::S_V>(eventIdSToV);
+    WaitFlag<HardEvent::S_V>(eventIdSToV);
+
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GetGroupNormOutput(const LocalTensor<float>& inout,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const GroupNormTiling& tiling, const int32_t loopCount)
+{
+    size_t channelIndex = loopCount * tiling.meanVarRoundSize * tiling.d;
+    for (uint32_t channel_offset = 0; channel_offset < tiling.bsCurLength * tiling.d; ++channel_offset) {
+        Muls(inout[channel_offset * tiling.hwAlignSize], inout[channel_offset * tiling.hwAlignSize],
+            gamma.GetValue(channelIndex % tiling.c), tiling.hw);
+        channelIndex += 1;
+    }
+    PipeBarrier<PIPE_V>();
+
+    channelIndex = loopCount * tiling.meanVarRoundSize * tiling.d;
+    for (uint32_t channel_offset = 0; channel_offset < tiling.bsCurLength * tiling.d; ++channel_offset) {
+        Adds(inout[channel_offset * tiling.hwAlignSize], inout[channel_offset * tiling.hwAlignSize],
+            beta.GetValue(channelIndex % tiling.c), tiling.hw);
+        channelIndex += 1;
+    }
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GroupNormExe(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& gamma, const LocalTensor<half>& beta,
+    const LocalTensor<half>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const half epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(tempTensorA, 0.0f, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorB, inputX, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputMean(tempTensorB, tempTensorC, outputMean, tiling);
+
+    GetGroupNormOutputVar(tempTensorB, tempTensorB, tempTensorC, outputMean, outputVariance, tiling);
+
+    GetGroupNormOutputPre(tempTensorB, tempTensorA, outputVariance, tiling, static_cast<float>(epsilon));
+
+    Cast(tempTensorA, gamma, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorC, beta, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutput(tempTensorB, tempTensorA, tempTensorC, tiling, loopCount);
+
+    Cast(output, tempTensorB, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+}
+
+
+__aicore__ inline void GroupNormExe(const LocalTensor<float>& inputX,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const LocalTensor<float>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const float epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+
+    GetGroupNormOutputMean(inputX, output, outputMean, tiling);
+
+    Duplicate(output, 0.0f, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputVar(inputX, output, tempTensorC, outputMean, outputVariance, tiling);
+
+    GetGroupNormOutputPre(output, tempTensorA, outputVariance, tiling, epsilon);
+
+    GetGroupNormOutput(output, gamma, beta, tiling, loopCount);
+}
+
+__aicore__ inline void GroupNormExeSmallShape(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& gamma, const LocalTensor<half>& beta,
+    const LocalTensor<half>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const half epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(tempTensorA, 0.0f, tiling.inputRoundSize * tiling.numberOfTmpBuf);
+    PipeBarrier<PIPE_V>();
+
+    Cast(tempTensorB, inputX, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    uint32_t mask1 = GetGroupNormWholeReduceMask1(tiling);
+    ASCENDC_ASSERT((mask1 > 0), { KERNEL_LOG(KERNEL_ERROR, "mask1 must > 0!"); });
+
+    uint32_t repeat1 = tiling.dhwAlignSize / mask1 * tiling.meanVarRoundSize;
+    uint32_t mask2 = tiling.dhwAlignSize / mask1 * GROUPNORM_MASK_SMALLEST_VAL;
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorC, tempTensorB, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE, mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputMean, tempTensorC, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputMean, outputMean, tiling.factor, tiling.bsCurLength);
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Adds(tempTensorB[buffIndex], tempTensorB[buffIndex], -1.0f * outputMean.GetValue(i), tiling.hw, tiling.d,
+            {1, 1, static_cast<uint8_t>(tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE), static_cast<uint8_t>(tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE)});
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tempTensorC, tempTensorB, tempTensorB, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorA, tempTensorC, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE, mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputVariance, tempTensorA, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputVariance, outputVariance, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputPre(tempTensorB, tempTensorA, outputVariance, tiling, static_cast<float>(epsilon));
+
+    Cast(tempTensorA, gamma, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorC, beta, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutput(tempTensorB, tempTensorA, tempTensorC, tiling, loopCount);
+
+    Cast(output, tempTensorB, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GroupNormExeSmallShape(const LocalTensor<float>& inputX,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const LocalTensor<float>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const float epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(output, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+    Duplicate(tempTensorC, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+    uint32_t mask1 = GetGroupNormWholeReduceMask1(tiling);
+    ASCENDC_ASSERT((mask1 > 0), { KERNEL_LOG(KERNEL_ERROR, "mask1 must > 0!"); });
+
+    uint32_t repeat1 = tiling.dhwAlignSize / mask1 * tiling.meanVarRoundSize;
+    uint32_t mask2 = tiling.dhwAlignSize / mask1 * GROUPNORM_MASK_SMALLEST_VAL;
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorC, inputX, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE, mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputMean, tempTensorC, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputMean, outputMean, tiling.factor, tiling.bsCurLength);
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    auto repeatStride = tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE;
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Adds(output[buffIndex], inputX[buffIndex], -1.0f * outputMean.GetValue(i), tiling.hw, tiling.d,
+            {1, 1, static_cast<uint8_t>(repeatStride), static_cast<uint8_t>(repeatStride)});
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tempTensorC, output, output, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    Duplicate(tempTensorA, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorA, tempTensorC, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE, mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputVariance, tempTensorA, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputVariance, outputVariance, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    GetGroupNormOutputPre(output, tempTensorA, outputVariance, tiling, epsilon);
+
+    GetGroupNormOutput(output, gamma, beta, tiling, loopCount);
+}
+
+template <bool isReuseSource>
+__aicore__ inline void GetGroupNormNDTensorInfo(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& outputMean, const LocalTensor<half>& outputVariance,
+    const LocalTensor<float>& stackBuffer, const GroupNormTiling& tiling, GroupNormParams<float>& params)
+{
+    params.tempTensorA = stackBuffer[tiling.firstTmpStartPos];
+    params.tempTensorB = stackBuffer[tiling.secondTmpStartPos];
+    params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos];
+    params.meanTmpTensor = stackBuffer[tiling.meanTmpTensorPos];
+    params.varianceTmpTensor = stackBuffer[tiling.varianceTmpTensorPos];
+
+    ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), {
+        KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)",
+            tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize);
+    });
+    ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), {
+        KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tmpBufSize is (%d)",
+            stackBuffer.GetSize(), tiling.tmpBufSize);
+    });
+}
+
+template <bool isReuseSource>
+__aicore__ inline void GetGroupNormNDTensorInfo(const LocalTensor<float>& inputX,
+    const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const LocalTensor<float>& stackBuffer, const GroupNormTiling& tiling, GroupNormParams<float>& params)
+{
+    params.meanTmpTensor = outputMean;
+    params.varianceTmpTensor = outputVariance;
+
+    if constexpr (isReuseSource) {
+        params.tempTensorA = inputX;
+        params.tempTensorB = stackBuffer[tiling.firstTmpStartPos];
+        params.tempTensorC = stackBuffer[tiling.secondTmpStartPos];
+
+        ASCENDC_ASSERT((tiling.secondTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), {
+            KERNEL_LOG(KERNEL_ERROR, "secondTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)",
+                tiling.secondTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize);
+        });
+    } else {
+        params.tempTensorA = stackBuffer[tiling.firstTmpStartPos];
+        params.tempTensorB = stackBuffer[tiling.secondTmpStartPos];
+        params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos];
+
+        ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), {
+            KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)",
+                tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize);
+        });
+    }
+
+    ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), {
+        KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tmpBufSize is (%d)",
+            stackBuffer.GetSize(), tiling.tmpBufSize);
+    });
+}
+
+__aicore__ inline void GetOutputMeanVariance(const LocalTensor<half>& outputMean,
+    const LocalTensor<half>& outputVariance, const GroupNormTiling& tiling, const GroupNormParams<float>& params)
+{
+ Cast(outputMean, params.meanTmpTensor, RoundMode::CAST_NONE, tiling.n * tiling.g);
+ Cast(outputVariance, params.varianceTmpTensor, RoundMode::CAST_NONE, tiling.n * tiling.g);
+}
+
+template <typename T>
+__aicore__ inline void GroupNormNDCommon(const LocalTensor<T>& inputX,
+    const LocalTensor<T>& gamma, const LocalTensor<T>& beta,
+    const LocalTensor<T>& output, const LocalTensor<T>& outputMean, const LocalTensor<T>& outputVariance,
+    const T epsilon, GroupNormTiling& tiling, const GroupNormParams<float>& params)
+{
+ uint32_t inputOffset = 0;
+ uint32_t mvOffset = 0;
+
+ for (uint32_t index = 0; index < tiling.loopRound; index++) {
+ if (tiling.smallShape) {
+ GroupNormExeSmallShape(inputX[inputOffset], gamma, beta, output[inputOffset],
+ params.meanTmpTensor[mvOffset],
+ params.varianceTmpTensor[mvOffset], epsilon, tiling, params, index);
+ } else {
+ GroupNormExe(inputX[inputOffset], gamma, beta, output[inputOffset],
+ params.meanTmpTensor[mvOffset],
+ params.varianceTmpTensor[mvOffset], epsilon, tiling, params, index);
+ }
+
+ inputOffset += tiling.inputRoundSize;
+ mvOffset += tiling.meanVarRoundSize;
+ }
+
+ if (tiling.inputTailSize > 0) {
+ tiling.bshCurLength = tiling.inputTailSize;
+ tiling.bsCurLength = tiling.meanVarTailSize;
+
+ inputOffset = tiling.inputTailPos;
+ mvOffset = tiling.meanVarTailPos;
+
+ if (tiling.smallShape) {
+ GroupNormExeSmallShape(inputX[inputOffset], gamma, beta, output[inputOffset],
+ params.meanTmpTensor[mvOffset],
+ params.varianceTmpTensor[mvOffset], epsilon, tiling, params, tiling.loopRound);
+ } else {
+ GroupNormExe(inputX[inputOffset], gamma, beta, output[inputOffset],
+ params.meanTmpTensor[mvOffset],
+ params.varianceTmpTensor[mvOffset], epsilon, tiling, params, tiling.loopRound);
+ }
+
+ // revert to normal round size from tail size, for the next iteration calculation
+ tiling.bshCurLength = tiling.inputRoundSize;
+ tiling.bsCurLength = tiling.meanVarRoundSize;
+ }
+
+ if constexpr (sizeof(T) == sizeof(half)) {
+ GetOutputMeanVariance(outputMean, outputVariance, tiling, params);
+ }
+}
+
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNormImpl(const LocalTensor<T>& output,
+    const LocalTensor<T>& outputMean, const LocalTensor<T>& outputVariance,
+    const LocalTensor<T>& inputX, const LocalTensor<T>& gamma, const LocalTensor<T>& beta,
+    const LocalTensor<uint8_t>& sharedTmpBuffer, const T epsilon, GroupNormTiling& tiling)
+{
+    ASCENDC_ASSERT((tiling.oneTmpSize > 0), { KERNEL_LOG(KERNEL_ERROR, "tiling.oneTmpSize must > 0!"); });
+
+    if ASCEND_IS_AIC {
+        return;
+    }
+    LocalTensor<float> stackBuffer = sharedTmpBuffer.ReinterpretCast<float>();
+    ASCENDC_ASSERT((stackBuffer.GetSize() > 0), { KERNEL_LOG(KERNEL_ERROR, "sharedTmpBuffer Size must > 0!"); });
+
+    GroupNormParams<float> params;
+    GetGroupNormNDTensorInfo<isReuseSource>(inputX, outputMean, outputVariance, stackBuffer, tiling, params);
+
+    GroupNormNDCommon(inputX, gamma, beta, output, outputMean, outputVariance, epsilon, tiling, params);
+}
+
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNormImpl(const LocalTensor<T>& output,
+    const LocalTensor<T>& outputMean, const LocalTensor<T>& outputVariance,
+    const LocalTensor<T>& inputX, const LocalTensor<T>& gamma, const LocalTensor<T>& beta,
+    const T epsilon, GroupNormTiling& tiling)
+{
+    LocalTensor<uint8_t> sharedTmpBuffer;
+    bool ans = PopStackBuffer(sharedTmpBuffer);
+    ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); });
+
+    GroupNormImpl<T, isReuseSource>(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon, tiling);
+}
+
+} // namespace AscendC
+#endif // IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H
\ No newline at end of file
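
Reviewer note: the kernel above computes GroupNorm in three passes (per-group mean, centered variance, per-channel gamma/beta). Below is a minimal host-side C++ reference sketch of that math for cross-checking; it is not part of the patch, and GroupNormRef is a hypothetical name.

// Hypothetical host-side reference (plain C++): mirrors the pipeline of
// GetGroupNormOutputMean / GetGroupNormOutputVar / GetGroupNormOutputPre / GetGroupNormOutput.
#include <cmath>
#include <cstdint>
#include <vector>

// x, y: [n, c, hw]; gamma, beta: [c]; mean, var: [n * g], sized by the caller.
void GroupNormRef(const std::vector<float>& x, const std::vector<float>& gamma,
                  const std::vector<float>& beta, std::vector<float>& y,
                  std::vector<float>& mean, std::vector<float>& var,
                  uint32_t n, uint32_t c, uint32_t hw, uint32_t g, float eps)
{
    const uint32_t d = c / g;              // channels per group, tiling.d
    const float factor = 1.0f / (d * hw);  // tiling.factor
    for (uint32_t i = 0; i < n * g; ++i) { // one (batch, group) pair per iteration
        const float* in = &x[i * d * hw];
        float m = 0.0f;
        for (uint32_t j = 0; j < d * hw; ++j) { m += in[j]; }
        m *= factor;                       // ReduceSum followed by Muls(factor)
        float v = 0.0f;
        for (uint32_t j = 0; j < d * hw; ++j) { v += (in[j] - m) * (in[j] - m); }
        v *= factor;                       // Adds(-mean), Mul, ReduceSum, Muls(factor)
        mean[i] = m;
        var[i] = v;
        // GetGroupNormOutputPre computes rstd as Exp(-0.5 * Ln(var + eps)) = 1 / sqrt(var + eps)
        const float rstd = std::exp(-0.5f * std::log(v + eps));
        for (uint32_t j = 0; j < d * hw; ++j) {
            const uint32_t ch = (i * d + j / hw) % c; // channelIndex % tiling.c in GetGroupNormOutput
            y[i * d * hw + j] = (in[j] - m) * rstd * gamma[ch] + beta[ch];
        }
    }
}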
diff --git a/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..369df14cbb8295180504bec61b500aa4b0265ccc
--- /dev/null
+++ b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp
@@ -0,0 +1,204 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tiling_impl.cpp
+ * \brief
+ */
+
+#include "lib/normalization/groupnorm_tiling.h"
+#include "impl/host_log.h"
+
+namespace optiling {
+ REGISTER_TILING_DATA_CLASS(GroupNormTilingOpApi, GroupNormTiling);
+} // namespace optiling
+namespace AscendC {
+ namespace {
+ constexpr uint32_t GROUPNORM_SIZEOF_FLOAT = 4;
+ constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2;
+ constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 32;
+ constexpr uint32_t GROUPNORM_THREE_TIMES = 3;
+ constexpr uint32_t GROUPNORM_TWO_TIMES = 2;
+ constexpr uint32_t GROUPNORM_ONE_NUMBER = 1;
+ constexpr uint32_t GROUPNORM_ZERO_NUMBER = 0;
+ constexpr float GROUPNORM_ONE_FLOAT_VALUE = 1.0f;
+
+ constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64;
+ constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8;
+ constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255;
+ constexpr uint32_t GROUPNORM_REDUCESUM1_DST_REPEAT_STRIDE = 8;
+ constexpr uint32_t GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION = 8;
+ constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64;
+ constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8;
+
+ uint32_t GetGroupNormTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+ uint32_t groupNum, const bool isMaxValue)
+ {
+ ASCENDC_HOST_ASSERT(typeSize > 0, return 0, "typeSize must be greater than 0.");
+ ASCENDC_HOST_ASSERT(groupNum > 0, return 0, "groupNum must be greater than 0.");
+
+        std::vector<int64_t> shapeDims = srcShape.GetDims();
+        const uint32_t n = static_cast<uint32_t>(shapeDims[0]);
+        const uint32_t c = static_cast<uint32_t>(shapeDims[1]);
+        const uint32_t h = static_cast<uint32_t>(shapeDims[2]);
+        const uint32_t w = static_cast<uint32_t>(shapeDims[3]);
+        ASCENDC_HOST_ASSERT(c != 0, return 0, "the value of c should not be zero!");
+        uint32_t mvTmpLen = n * groupNum * sizeof(float);
+        uint32_t hwLen = h * w * typeSize;
+        uint32_t dhwLen = 0;
+
+ mvTmpLen = (mvTmpLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+ GROUPNORM_ONE_BLK_SIZE;
+
+ if (isMaxValue) {
+ dhwLen = n * c * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+ GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float));
+ } else {
+ dhwLen = c / groupNum * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+ GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float));
+ }
+
+ if (isReuseSource && (typeSize == GROUPNORM_SIZEOF_FLOAT)) {
+ return GROUPNORM_TWO_TIMES * dhwLen + GROUPNORM_TWO_TIMES * mvTmpLen;
+ }
+ return GROUPNORM_THREE_TIMES * dhwLen + GROUPNORM_TWO_TIMES * mvTmpLen;
+ }
+    } // namespace
+
+ void GetGroupNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+ const uint32_t groupNum, uint32_t& maxValue, uint32_t& minValue)
+ {
+ maxValue = GetGroupNormTmpSize(srcShape, typeSize, isReuseSource, groupNum, true);
+ minValue = GetGroupNormTmpSize(srcShape, typeSize, isReuseSource, groupNum, false);
+ }
+
+ void GetGroupNormNDTilingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize,
+ const bool isReuseSource, const uint32_t groupNum, optiling::GroupNormTiling& tiling)
+ {
+ ASCENDC_HOST_ASSERT(typeSize > 0, return, "typeSize must be greater than 0.");
+ ASCENDC_HOST_ASSERT(groupNum > 0, return, "groupNum must be greater than 0.");
+
+        std::vector<int64_t> shapeDims = srcShape.GetDims();
+        const uint32_t n = static_cast<uint32_t>(shapeDims[0]);
+        const uint32_t c = static_cast<uint32_t>(shapeDims[1]);
+        const uint32_t h = static_cast<uint32_t>(shapeDims[2]);
+        const uint32_t w = static_cast<uint32_t>(shapeDims[3]);
+        const uint32_t g = groupNum;
+        const uint32_t d = c / groupNum;
+        ASCENDC_HOST_ASSERT(c != 0, return, "the value of c should not be zero!");
+        // align HW to 32 bytes
+        const uint32_t hwAlignSize = (typeSize * h * w + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) /
+            GROUPNORM_ONE_BLK_SIZE * GROUPNORM_ONE_BLK_SIZE / typeSize;
+
+ const uint32_t dhwAlignSize = d * hwAlignSize;
+
+ const uint32_t inputXSize = n * c * hwAlignSize;
+ const uint32_t meanVarSize = n * g;
+
+ const uint32_t oneBlockNum = GROUPNORM_ONE_BLK_SIZE / GROUPNORM_SIZEOF_FLOAT;
+ const uint32_t meanTmpTensorSize = (meanVarSize + oneBlockNum - GROUPNORM_ONE_NUMBER) / oneBlockNum * oneBlockNum;
+
+ uint32_t meanVarTotalSize = 2 * meanTmpTensorSize;
+ if (typeSize == GROUPNORM_SIZEOF_FLOAT) {
+ meanVarTotalSize = GROUPNORM_ZERO_NUMBER;
+ }
+
+ uint32_t numberOfTmpBuf = GROUPNORM_THREE_TIMES;
+ if (isReuseSource && (typeSize == GROUPNORM_SIZEOF_FLOAT)) {
+ numberOfTmpBuf = GROUPNORM_TWO_TIMES;
+ }
+
+ const uint32_t tmpBufSize = stackBufferSize / GROUPNORM_ONE_BLK_SIZE * GROUPNORM_ONE_BLK_SIZE / GROUPNORM_SIZEOF_FLOAT;
+ uint32_t oneTmpSize = (tmpBufSize - meanVarTotalSize) / numberOfTmpBuf;
+
+        // So that MeanVarTensor can be processed with Add directly instead of GetValue, each iteration must cover a multiple of 8 groups
+ uint32_t bsCurLength = oneTmpSize / (GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION * d * hwAlignSize) *
+ GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION;
+
+        // Check whether the smallShape path applies
+ uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE;
+ while ((dhwAlignSize / (GROUPNORM_ONE_BLK_SIZE / GROUPNORM_SIZEOF_FLOAT)) % k != 0) {
+ k--;
+ }
+ const bool smallShape = (hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) &&
+ (hwAlignSize * d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k);
+
+        // Constraints from the level-0 ReduceSum interface: derive the mask/repeat of the two ReduceSum passes from DHW, and the valid range of DHW/bsCurLength
+ if (smallShape) {
+ uint32_t mask1{GROUPNORM_MAX_MASK_VAL};
+ if (dhwAlignSize > GROUPNORM_MAX_MASK_VAL) {
+ while (mask1 != 0 && dhwAlignSize % mask1 != 0) {
+ mask1 -= GROUPNORM_STEP_MASK_VAL;
+ }
+ } else {
+ mask1 = dhwAlignSize;
+ }
+ ASCENDC_HOST_ASSERT(mask1 > 0, return, "mask1 must be greater than 0.");
+ const uint32_t maxBsCurLength = (GROUPNORM_MAX_REPEAT_VAL / (dhwAlignSize / mask1) /
+ GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION) * GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION;
+ if (maxBsCurLength < bsCurLength) {
+ bsCurLength = maxBsCurLength;
+ }
+ }
+
+ if (typeSize == GROUPNORM_SIZEOF_HALF && bsCurLength * dhwAlignSize < c) {
+ return;
+ }
+
+ oneTmpSize = bsCurLength * d * hwAlignSize;
+
+ if (oneTmpSize > inputXSize) {
+ bsCurLength = meanVarSize;
+ oneTmpSize = inputXSize;
+ }
+
+ ASCENDC_HOST_ASSERT((oneTmpSize != GROUPNORM_ZERO_NUMBER), return, "the oneTmpSize should not be zero!");
+ if (oneTmpSize == GROUPNORM_ZERO_NUMBER) {
+ return;
+ }
+
+ const uint32_t inputRoundSize = oneTmpSize;
+ const uint32_t inputTailSize = inputXSize % oneTmpSize;
+
+ const uint32_t meanVarRoundSize = inputRoundSize / dhwAlignSize;
+ const uint32_t meanVarTailSize = inputTailSize / dhwAlignSize;
+
+ tiling.set_n(n);
+ tiling.set_c(c);
+ tiling.set_hw(h * w);
+ tiling.set_g(g);
+ tiling.set_d(d);
+ tiling.set_hwAlignSize(hwAlignSize);
+ tiling.set_dhwAlignSize(dhwAlignSize);
+ tiling.set_inputXSize(inputXSize);
+ tiling.set_meanVarSize(meanVarSize);
+ tiling.set_numberOfTmpBuf(numberOfTmpBuf);
+ tiling.set_meanTmpTensorPos(GROUPNORM_ZERO_NUMBER);
+ tiling.set_meanTmpTensorSize(meanTmpTensorSize);
+ tiling.set_varianceTmpTensorPos(meanTmpTensorSize);
+ tiling.set_varianceTmpTensorSize(meanTmpTensorSize);
+ tiling.set_tmpBufSize(tmpBufSize);
+ tiling.set_oneTmpSize(oneTmpSize);
+ tiling.set_firstTmpStartPos(meanVarTotalSize);
+ tiling.set_secondTmpStartPos(meanVarTotalSize + oneTmpSize);
+ tiling.set_thirdTmpStartPos(meanVarTotalSize + GROUPNORM_TWO_TIMES * oneTmpSize);
+ tiling.set_loopRound(inputXSize / oneTmpSize);
+ tiling.set_inputRoundSize(inputRoundSize);
+ tiling.set_inputTailSize(inputTailSize);
+ tiling.set_inputTailPos(inputXSize - inputTailSize);
+ tiling.set_meanVarRoundSize(meanVarRoundSize);
+ tiling.set_meanVarTailSize(meanVarTailSize);
+ tiling.set_meanVarTailPos(meanVarSize - meanVarTailSize);
+ tiling.set_bshCurLength(inputRoundSize);
+ tiling.set_bsCurLength(bsCurLength);
+ tiling.set_factor(GROUPNORM_ONE_FLOAT_VALUE / (d * h * w));
+ tiling.set_smallShape(smallShape);
+ }
+} // namespace AscendC
\ No newline at end of file
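
Reviewer note: a worked example of the 32-byte alignment arithmetic used by GetGroupNormNDTilingInfo above, for a hypothetical [2,16,3,3] half input with 4 groups. Standalone C++ sketch, not part of the patch; constants are copied from the file.

// Standalone sketch of the tiling alignment math (assumed shape, illustration only).
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t ONE_BLK = 32;  // GROUPNORM_ONE_BLK_SIZE
    const uint32_t n = 2, c = 16, h = 3, w = 3, g = 4, typeSize = 2;  // half input
    const uint32_t d = c / g;     // 4 channels per group
    // half, 3x3: 18 bytes round up to one 32-byte block -> 16 elements per (n, channel) row
    const uint32_t hwAlign = (typeSize * h * w + ONE_BLK - 1) / ONE_BLK * ONE_BLK / typeSize;
    const uint32_t dhwAlign = d * hwAlign;
    // prints hwAlignSize=16 dhwAlignSize=64 inputXSize=512 factor=0.027778
    printf("hwAlignSize=%u dhwAlignSize=%u inputXSize=%u factor=%f\n",
           hwAlign, dhwAlign, n * c * hwAlign, 1.0f / (d * h * w));
    return 0;
}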
diff --git a/lib/normalization/groupnorm.h b/lib/normalization/groupnorm.h
new file mode 100644
index 0000000000000000000000000000000000000000..a80458c9e7c3b94bb698e985b2db18f9bc779469
--- /dev/null
+++ b/lib/normalization/groupnorm.h
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_H
+#define LIB_NORMALIZATION_GROUPNORM_H
+#if __CCE_AICORE__ == 220
+
+#include "kernel_tensor.h"
+#include "../../impl/normalization/groupnorm/groupnorm_common_impl.h"
+#include "kernel_tiling/kernel_tiling.h"
+namespace AscendC {
+#pragma begin_pipe(V)
+/*!
+ * \brief Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization.
+ *
+ * \note support data type: half and float
+ *
+ * \param [out] output, output LocalTensor, shape is [n, C, H, W]
+ * \param [out] outputMean, output LocalTensor, shape is [n, groupNum]
+ * \param [out] outputVariance, output LocalTensor, shape is [n, groupNum]
+ * \param [in] inputX, input LocalTensor, shape is [n, C, H, W]
+ * \param [in] gamma, input LocalTensor, shape is [C]
+ * \param [in] beta, input LocalTensor, shape is [C]
+ * \param [in] sharedTmpBuffer, input local temporary Tensor
+ * \param [in] epsilon, small value added to the variance to avoid division by zero
+ * \param [in] tiling, GroupNorm tiling parameters
+ */
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean,
+    const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma,
+    const LocalTensor<T>& beta, const LocalTensor<uint8_t>& sharedTmpBuffer, const T epsilon, GroupNormTiling& tiling)
+{
+    GroupNormImpl<T, isReuseSource>(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon,
+        tiling);
+}
+
+/*!
+ * \brief Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization.
+ *
+ * \note support data type: half and float
+ *
+ * \param [out] output, output LocalTensor, shape is [n, C, H, W]
+ * \param [out] outputMean, output LocalTensor, shape is [n, groupNum]
+ * \param [out] outputVariance, output LocalTensor, shape is [n, groupNum]
+ * \param [in] inputX, input LocalTensor, shape is [n, C, H, W]
+ * \param [in] gamma, input LocalTensor, shape is [C]
+ * \param [in] beta, input LocalTensor, shape is [C]
+ * \param [in] epsilon, small value added to the variance to avoid division by zero
+ * \param [in] tiling, GroupNorm tiling parameters
+ */
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean,
+    const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma,
+    const LocalTensor<T>& beta, const T epsilon, GroupNormTiling& tiling)
+{
+    GroupNormImpl<T, isReuseSource>(output, outputMean, outputVariance, inputX, gamma, beta, epsilon, tiling);
+}
+#pragma end_pipe
+} // namespace AscendC
+#endif
+#endif // LIB_NORMALIZATION_GROUPNORM_H
\ No newline at end of file
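
Reviewer note: a minimal kernel-side call sketch for the two overloads above. CallGroupNormSketch and the 1e-5f epsilon are illustrative only; the tensors are assumed to be staged in UB the way the unit test later in this patch does with TQue/TBuf.

// Hypothetical wrapper, not part of the patch; shows the overload that pops its own stack buffer.
#if __CCE_AICORE__ == 220
#include "lib/normalization/groupnorm.h"

__aicore__ inline void CallGroupNormSketch(const AscendC::LocalTensor<float>& output,
    const AscendC::LocalTensor<float>& outputMean, const AscendC::LocalTensor<float>& outputVariance,
    const AscendC::LocalTensor<float>& inputX, const AscendC::LocalTensor<float>& gamma,
    const AscendC::LocalTensor<float>& beta, GroupNormTiling& tiling)
{
    // Without a sharedTmpBuffer argument, GroupNormImpl calls PopStackBuffer internally.
    AscendC::GroupNorm<float, false>(output, outputMean, outputVariance, inputX, gamma, beta, 1e-5f, tiling);
}
#endif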
diff --git a/lib/normalization/groupnorm_tiling.h b/lib/normalization/groupnorm_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc5e0da8f149729b58d34a29e6c26d41b9640727
--- /dev/null
+++ b/lib/normalization/groupnorm_tiling.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tiling.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_TILING_H
+#define LIB_NORMALIZATION_GROUPNORM_TILING_H
+#include "graph/tensor.h"
+#include "groupnorm_tilingdata.h"
+namespace AscendC {
+/*!
+ * \brief calculate max and min tmp buffer size for GroupNorm interface.
+ * \param [in] srcShape: input shape
+ * \param [in] typeSize: data type size: sizeof(TYPE)
+ * \param [in] isReuseSource: indicate whether to reuse source tensor.
+ * When enable isReuseSource, src tensor will be used as tmp buffer for calculation.
+ * \param [in] groupNum: number of groups to separate the channels into
+ * \param [out] maxValue: max size required for tmp buffer
+ * \param [out] minValue: min size required for tmp buffer
+ * \note the function returns void; for illegal input, maxValue and minValue are set to 0
+ */
+void GetGroupNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+ const uint32_t groupNum, uint32_t& maxValue, uint32_t& minValue);
+
+/*!
+ * \brief calculate tiling params for GroupNorm interface
+ *
+ * \note stackBufferSize should be greater than min tmpSize from GetGroupNormMaxMinTmpSize
+ *
+ * \param [in] srcShape input shape
+ * \param [in] stackBufferSize input stack buffer size in units of bytes, used as tmp buffer size for tiling
+ * \param [in] typeSize data type size: sizeof(TYPE)
+ * \param [in] isReuseSource indicate whether intermediate variables can reuse the input memory
+ * \param [in] groupNum: number of groups to separate the channels into
+ * \param [out] tiling GroupNorm tiling
+ * \note the function returns void; if srcShape is illegal or stackBufferSize is too small,
+ * it returns early without fully populating tiling
+ */
+void GetGroupNormNDTilingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize,
+ const bool isReuseSource, const uint32_t groupNum, optiling::GroupNormTiling& tiling);
+}
+#endif // LIB_NORMALIZATION_GROUPNORM_TILING_H
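
Reviewer note: a hedged host-side usage sketch for the two declarations above. PrepareGroupNormTiling is a hypothetical name, and the ge::Shape construction from a dimension list is assumed; the buffer size handed to GetGroupNormNDTilingInfo must be at least the minimum reported by GetGroupNormMaxMinTmpSize.

// Hypothetical host-side sequence, illustration only.
#include "graph/tensor.h"
#include "lib/normalization/groupnorm_tiling.h"

void PrepareGroupNormTiling(optiling::GroupNormTiling& tiling)
{
    ge::Shape srcShape({2, 16, 8, 8});      // [N, C, H, W], assumed ctor from a dim list
    const uint32_t typeSize = sizeof(float);
    const uint32_t groupNum = 4;
    uint32_t maxSize = 0;
    uint32_t minSize = 0;
    AscendC::GetGroupNormMaxMinTmpSize(srcShape, typeSize, false, groupNum, maxSize, minSize);
    // Use at least minSize bytes of stack buffer when computing the tiling.
    AscendC::GetGroupNormNDTilingInfo(srcShape, minSize, typeSize, false, groupNum, tiling);
}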
diff --git a/lib/normalization/groupnorm_tilingdata.h b/lib/normalization/groupnorm_tilingdata.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f12c3358ac2c787eefe1085afa2f44febbd99f9
--- /dev/null
+++ b/lib/normalization/groupnorm_tilingdata.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tilingdata.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
+#define LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(GroupNormTiling)
+ TILING_DATA_FIELD_DEF(uint32_t, n);
+ TILING_DATA_FIELD_DEF(uint32_t, c);
+ TILING_DATA_FIELD_DEF(uint32_t, hw);
+ TILING_DATA_FIELD_DEF(uint32_t, g);
+ TILING_DATA_FIELD_DEF(uint32_t, d);
+ TILING_DATA_FIELD_DEF(uint32_t, hwAlignSize);
+ TILING_DATA_FIELD_DEF(uint32_t, dhwAlignSize);
+ TILING_DATA_FIELD_DEF(uint32_t, inputXSize);
+ TILING_DATA_FIELD_DEF(uint32_t, meanVarSize);
+ TILING_DATA_FIELD_DEF(uint32_t, numberOfTmpBuf);
+ TILING_DATA_FIELD_DEF(uint32_t, meanTmpTensorPos);
+ TILING_DATA_FIELD_DEF(uint32_t, meanTmpTensorSize);
+ TILING_DATA_FIELD_DEF(uint32_t, varianceTmpTensorPos);
+ TILING_DATA_FIELD_DEF(uint32_t, varianceTmpTensorSize);
+ TILING_DATA_FIELD_DEF(uint32_t, tmpBufSize);
+ TILING_DATA_FIELD_DEF(uint32_t, oneTmpSize);
+ TILING_DATA_FIELD_DEF(uint32_t, firstTmpStartPos);
+ TILING_DATA_FIELD_DEF(uint32_t, secondTmpStartPos);
+ TILING_DATA_FIELD_DEF(uint32_t, thirdTmpStartPos);
+ TILING_DATA_FIELD_DEF(uint32_t, loopRound);
+ TILING_DATA_FIELD_DEF(uint32_t, inputRoundSize);
+ TILING_DATA_FIELD_DEF(uint32_t, inputTailSize);
+ TILING_DATA_FIELD_DEF(uint32_t, inputTailPos);
+ TILING_DATA_FIELD_DEF(uint32_t, meanVarRoundSize);
+ TILING_DATA_FIELD_DEF(uint32_t, meanVarTailSize);
+ TILING_DATA_FIELD_DEF(uint32_t, meanVarTailPos);
+ TILING_DATA_FIELD_DEF(uint32_t, bshCurLength);
+ TILING_DATA_FIELD_DEF(uint32_t, bsCurLength);
+ TILING_DATA_FIELD_DEF(float, factor);
+ TILING_DATA_FIELD_DEF(bool, smallShape);
+END_TILING_DATA_DEF;
+}
+#endif // LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6d2e7694d3ebae87d71c1eefb1cf935f55ce6e28..80d3de2f3926b0a84fc46dd652a930f87a3072b2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -78,6 +78,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES
${ASCENDC_TESTS_DIR}/normalization/batchnorm/test_operator_batchnorm.cpp
${ASCENDC_TESTS_DIR}/normalization/deepnorm/test_operator_deepnorm.cpp
${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernorm.cpp
+    ${ASCENDC_TESTS_DIR}/normalization/groupnorm/test_operator_groupnorm.cpp
# ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgrad.cpp
${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgradbeta.cpp
${ASCENDC_TESTS_DIR}/normalization/rmsnorm/test_operator_rmsnorm.cpp
diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66088fdea6caefc167e6b72252d3a325ab4e437f
--- /dev/null
+++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
@@ -0,0 +1,312 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file test_operator_groupnorm.cpp
+ * \brief
+ */
+
+#include <gtest/gtest.h>
+#define private public
+#define protected public
+#include "kernel_operator.h"
+
+using namespace std;
+using namespace AscendC;
+constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2;
+
+constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64;
+constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8;
+constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255;
+constexpr uint32_t GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION = 8;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8;
+
+__aicore__ inline void GetGroupNormNDTilingInfo(const ShapeInfo& inputShapeInfo, const uint32_t stackBufferSize,
+    const uint32_t typeSize, const bool isReuseSource, const uint32_t groupNum, GroupNormTiling& tiling)
+{
+
+ uint32_t n = inputShapeInfo.shape[0];
+ uint32_t c = inputShapeInfo.shape[1];
+ uint32_t h = inputShapeInfo.shape[2];
+ uint32_t w = inputShapeInfo.shape[3];
+
+ tiling.n = n;
+ tiling.c = c;
+ tiling.hw = h * w;
+ tiling.g = groupNum;
+ tiling.d = c / tiling.g;
+ tiling.hwAlignSize = (typeSize * tiling.hw + ONE_BLK_SIZE - 1) /
+ ONE_BLK_SIZE * ONE_BLK_SIZE / typeSize;
+ tiling.dhwAlignSize = tiling.d * tiling.hwAlignSize;
+
+ tiling.inputXSize = n * c * tiling.hwAlignSize;
+ tiling.meanVarSize = n * tiling.g;
+
+ uint32_t oneBlockNum = ONE_BLK_SIZE / B32_BYTE_SIZE;
+ tiling.meanTmpTensorPos = 0;
+ tiling.meanTmpTensorSize = (tiling.meanVarSize + oneBlockNum - 1) / oneBlockNum * oneBlockNum;
+ tiling.varianceTmpTensorPos = tiling.meanTmpTensorSize;
+ tiling.varianceTmpTensorSize = tiling.meanTmpTensorSize;
+
+
+ uint32_t meanVarTotalSize = tiling.meanTmpTensorSize + tiling.varianceTmpTensorSize;
+ if (typeSize == B32_BYTE_SIZE) {
+ meanVarTotalSize = 0;
+ }
+
+ tiling.numberOfTmpBuf = THREE_OF_STACK_BUFFER;
+ if (isReuseSource && (typeSize == B32_BYTE_SIZE)) {
+ tiling.numberOfTmpBuf = TWO_OF_STACK_BUFFER;
+ }
+
+ tiling.tmpBufSize = stackBufferSize / ONE_BLK_SIZE * ONE_BLK_SIZE / B32_BYTE_SIZE;
+ tiling.oneTmpSize = (tiling.tmpBufSize - meanVarTotalSize) / tiling.numberOfTmpBuf;
+
+    // So that MeanVarTensor can be processed with Add directly instead of GetValue, each iteration must cover a multiple of 8 groups
+ tiling.bsCurLength = tiling.oneTmpSize / (GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION * tiling.d * tiling.hwAlignSize) *
+ GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION;
+
+    // Check whether the smallShape path applies
+ uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE;
+ while ((tiling.dhwAlignSize / (ONE_BLK_SIZE / B32_BYTE_SIZE)) % k != 0) {
+ k--;
+ }
+ tiling.smallShape = (tiling.hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) &&
+ (tiling.hwAlignSize * tiling.d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k);
+
+    // Constraints from the level-0 ReduceSum interface: derive the mask/repeat of the two ReduceSum passes from DHW, and the valid range of DHW/bsCurLength
+    if (tiling.smallShape) {
+        uint32_t mask1{GROUPNORM_MAX_MASK_VAL};
+        if (tiling.dhwAlignSize > GROUPNORM_MAX_MASK_VAL) {
+            while (mask1 != 0 && tiling.dhwAlignSize % mask1 != 0) {
+                mask1 -= GROUPNORM_STEP_MASK_VAL;
+            }
+ } else {
+ mask1 = tiling.dhwAlignSize;
+ }
+ uint32_t max_bsCurLength = (GROUPNORM_MAX_REPEAT_VAL / (tiling.dhwAlignSize / mask1) /
+ GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION) * GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION;
+ if (max_bsCurLength < tiling.bsCurLength) {
+ tiling.bsCurLength = max_bsCurLength;
+ }
+ }
+
+ if (typeSize == GROUPNORM_SIZEOF_HALF && tiling.bsCurLength * tiling.dhwAlignSize < c) {
+ return;
+ }
+
+ tiling.oneTmpSize = tiling.bsCurLength * tiling.d * tiling.hwAlignSize;
+
+ if (tiling.oneTmpSize > tiling.inputXSize) {
+ tiling.bsCurLength = tiling.meanVarSize;
+ tiling.oneTmpSize = tiling.inputXSize;
+ }
+
+ if (tiling.oneTmpSize == 0) {
+ return;
+ }
+
+ tiling.firstTmpStartPos = meanVarTotalSize;
+ tiling.secondTmpStartPos = tiling.firstTmpStartPos + tiling.oneTmpSize;
+ tiling.thirdTmpStartPos = tiling.secondTmpStartPos + tiling.oneTmpSize;
+
+ tiling.loopRound = tiling.inputXSize / tiling.oneTmpSize;
+
+ tiling.inputRoundSize = tiling.oneTmpSize;
+ tiling.inputTailSize = tiling.inputXSize % tiling.oneTmpSize;
+ tiling.inputTailPos = tiling.inputXSize - tiling.inputTailSize;
+
+ tiling.meanVarRoundSize = tiling.inputRoundSize / tiling.dhwAlignSize;
+ tiling.meanVarTailSize = tiling.inputTailSize / tiling.dhwAlignSize;
+ tiling.meanVarTailPos = tiling.meanVarSize - tiling.meanVarTailSize;
+
+ tiling.bshCurLength = tiling.inputRoundSize;
+
+ tiling.factor = 1.0f / (tiling.d * tiling.hw);
+ cout << tiling.n << ", " << tiling.c << ", " << tiling.hw << ", " << tiling.g << ", " << tiling.hwAlignSize << endl;
+ cout << "inputXSize: " << tiling.inputXSize << endl;
+ cout << "meanVarSize: " << tiling.meanVarSize << endl;
+ cout << "numberOfTmpBuf: " << tiling.numberOfTmpBuf << endl;
+ cout << "meanTmpTensorPos: " << tiling.meanTmpTensorPos << endl;
+ cout << "varianceTmpTensorPos: " << tiling.varianceTmpTensorPos << endl;
+ cout << "oneTmpSize: " << tiling.oneTmpSize << endl;
+ cout << "firstTmpStartPos: " << tiling.firstTmpStartPos << endl;
+ cout << "thirdTmpStartPos: " << tiling.thirdTmpStartPos << endl;
+ cout << "bsCurLength: " << tiling.bsCurLength << endl;
+ cout << "bshCurLength: " << tiling.bshCurLength << endl;
+ cout << "factor: " << tiling.factor << endl;
+ cout << "hwAlignSize: " << tiling.hwAlignSize << endl;
+ cout << "smallShape: " << tiling.smallShape << endl;
+
+}
+// __aicore__ inline void MainGroupnormTest(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm, GM_ADDR outputGm,
+// GM_ADDR outputMeanGm, GM_ADDR outputVarianceGm, uint32_t n, uint32_t c, uint32_t h, uint32_t w, uint32_t g)
+template <typename dataType, bool isReuseSource = false>
+__aicore__ inline void MainGroupnormTest(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm, GM_ADDR outputGm,
+ uint32_t n, uint32_t c, uint32_t h, uint32_t w, uint32_t g)
+{
+ dataType epsilon = 0.001;
+ DataFormat dataFormat = DataFormat::ND;
+
+    GlobalTensor<dataType> inputXGlobal;
+    GlobalTensor<dataType> gammGlobal;
+    GlobalTensor<dataType> betaGlobal;
+    GlobalTensor<dataType> outputGlobal;
+    // GlobalTensor<dataType> outputMeanGlobal;
+    // GlobalTensor<dataType> outputVarianceGlobal;
+
+ uint32_t bshLength = n*c*h*w;
+ uint32_t bsLength = g*n;
+
+ inputXGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(inputXGm), bshLength);
+ gammGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(gammGm), c);
+ betaGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(betaGm), c);
+
+ outputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputGm), bshLength);
+ // outputMeanGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputMeanGm), bsLength);
+ // outputVarianceGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputVarianceGm), bsLength);
+
+ TPipe pipe;
+    TQue<QuePosition::VECIN, 1> inQueueX;
+    TQue<QuePosition::VECIN, 1> inQueueGamma;
+    TQue<QuePosition::VECIN, 1> inQueueBeta;
+    TQue<QuePosition::VECOUT, 1> outQueue;
+    // TQue<QuePosition::VECOUT, 1> outQueueMean;
+    // TQue<QuePosition::VECOUT, 1> outQueueVariance;
+    TBuf<TPosition::VECCALC> meanBuffer, varBuffer;
+
+ uint32_t hwAlignSize = (sizeof(dataType) * h * w + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE / sizeof(dataType);
+ pipe.InitBuffer(inQueueX, 1, sizeof(dataType) * n * c * hwAlignSize);
+ pipe.InitBuffer(inQueueGamma, 1, (sizeof(dataType) * c + 31) / 32 * 32);
+ pipe.InitBuffer(inQueueBeta, 1, (sizeof(dataType) * c + 31) / 32 * 32);
+ pipe.InitBuffer(outQueue, 1, sizeof(dataType) * n * c * hwAlignSize);
+ // pipe.InitBuffer(outQueueMean, 1, (sizeof(dataType) * g * n + 31) / 32 * 32);
+ // pipe.InitBuffer(outQueueVariance, 1, (sizeof(dataType) * g * n + 31) / 32 * 32);
+ pipe.InitBuffer(meanBuffer, (sizeof(dataType) * g * n + 31) / 32 * 32);
+ pipe.InitBuffer(varBuffer, (sizeof(dataType) * g * n + 31) / 32 * 32);
+
+    LocalTensor<dataType> inputXLocal = inQueueX.AllocTensor<dataType>();
+    LocalTensor<dataType> gammaLocal = inQueueGamma.AllocTensor<dataType>();
+    LocalTensor<dataType> betaLocal = inQueueBeta.AllocTensor<dataType>();
+    LocalTensor<dataType> outputLocal = outQueue.AllocTensor<dataType>();
+    // LocalTensor<dataType> meanLocal = outQueueMean.AllocTensor<dataType>();
+    // LocalTensor<dataType> varianceLocal = outQueueVariance.AllocTensor<dataType>();
+    LocalTensor<dataType> meanLocal = meanBuffer.Get<dataType>();
+    LocalTensor<dataType> varianceLocal = varBuffer.Get<dataType>();
+
+    DataCopyParams copyParams{static_cast<uint16_t>(n*c), static_cast<uint16_t>(h*w*sizeof(dataType)), 0, 0};
+    DataCopyPadParams padParams{true, 0, static_cast<uint8_t>(hwAlignSize - h * w), 0};
+ DataCopyPad(inputXLocal, inputXGlobal, copyParams, padParams);
+ // DataCopy(inputXLocal, inputXGlobal, bshLength);
+    DataCopyParams copyParamsGamma{1, static_cast<uint16_t>(c*sizeof(dataType)), 0, 0};
+ DataCopyPadParams padParamsGamma{false, 0, 0, 0};
+ DataCopyPad(gammaLocal, gammGlobal, copyParamsGamma, padParamsGamma);
+ DataCopyPad(betaLocal, betaGlobal, copyParamsGamma, padParamsGamma);
+
+ // DataCopy(gammaLocal, gammGlobal, c);
+ // DataCopy(betaLocal, betaGlobal, c);
+    PipeBarrier<PIPE_ALL>();
+
+ uint32_t stackBufferSize = 0;
+ {
+        LocalTensor<uint8_t> stackBuffer;
+        bool ans = PopStackBuffer(stackBuffer);
+ stackBufferSize = stackBuffer.GetSize();
+ }
+
+ GroupNormTiling groupNormTiling;
+ uint32_t inputShape[4] = {n, c, h, w};
+ ShapeInfo shapeInfo{ (uint8_t)4, inputShape, (uint8_t)4, inputShape, dataFormat };
+
+    GetGroupNormNDTilingInfo(shapeInfo, stackBufferSize, sizeof(dataType), isReuseSource, g, groupNormTiling);
+
+ GroupNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal,
+ (dataType)epsilon, groupNormTiling);
+    PipeBarrier<PIPE_ALL>();
+
+ // DataCopy(outputGlobal, outputLocal, bshLength);
+ DataCopyPad(outputGlobal, outputLocal, copyParams);
+ // DataCopy(outputMeanGlobal, meanLocal, bsLength);
+ // DataCopy(outputVarianceGlobal, varianceLocal, bsLength);
+
+ inQueueX.FreeTensor(inputXLocal);
+ inQueueGamma.FreeTensor(gammaLocal);
+ inQueueBeta.FreeTensor(betaLocal);
+ outQueue.FreeTensor(outputLocal);
+ // outQueueMean.FreeTensor(meanLocal);
+ // outQueueVariance.FreeTensor(varianceLocal);
+    PipeBarrier<PIPE_ALL>();
+}
+
+struct groupnormTestParams {
+ uint32_t n;
+ uint32_t c;
+ uint32_t h;
+ uint32_t w;
+ uint32_t g;
+ uint32_t typeSize;
+ void (*cal_func)(uint8_t*, uint8_t*, uint8_t*, uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);
+};
+
+class groupnormTestSuite : public testing::Test, public testing::WithParamInterface<groupnormTestParams> {
+protected:
+ static void SetUpTestCase()
+ {
+ std::cout << "groupnormTestSuite SetUpTestCase" << std::endl;
+ }
+ static void TearDownTestCase()
+ {
+ std::cout << "groupnormTestSuite TearDownTestCase" << std::endl;
+ }
+ virtual void SetUp() {}
+ virtual void TearDown() {}
+};
+
+INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_groupnorm, groupnormTestSuite,
+ ::testing::Values(
+ groupnormTestParams { 2, 16, 8, 8, 4, sizeof(float), MainGroupnormTest