diff --git a/cmake/kernel_headers.cmake b/cmake/kernel_headers.cmake
index 5c5a256621cfdf4b7299e6d9d51b736ae82454d9..376dc2ba07493f721de1273b7ced06d4bd22223d 100644
--- a/cmake/kernel_headers.cmake
+++ b/cmake/kernel_headers.cmake
@@ -82,7 +82,7 @@ file(CREATE_LINK ../normalization/kernel_operator_layernormgradbeta_intf.h
 file(CREATE_LINK ../normalization/layernormgradbeta.h
     ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/layernormgrad/layernormgradbeta.h SYMBOLIC)
-# layernorm
+# matmul
 file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matrix)
 file(CREATE_LINK ../matmul ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matrix/matmul SYMBOLIC)
 file(CREATE_LINK matmul/matmul_intf.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/matmul_intf.h SYMBOLIC)
diff --git a/docs/README.md b/docs/README.md
index 9c5a21c79c6adc3df83835375ae99fe22548eed8..aaa4830cc5c3c4ae9a359122783b22b5de41b216 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -22,7 +22,7 @@
   Element-wise quantization, e.g. quantizing half/float data types to int8_t.
-  Data normalization
+  Data normalization
   BatchNorm
   For the samples in each batch, normalizes every input feature along the batch dimension.
@@ -46,6 +46,10 @@
   RmsNorm
   Implements RmsNorm normalization for input data of shape [B,S,H].
+
+  GroupNorm
+  Splits the input data into groups along the channel dimension and normalizes each group.
+
   Activation functions
   AdjustSoftMaxRes
diff --git a/impl/normalization/groupnorm/groupnorm_common_impl.h b/impl/normalization/groupnorm/groupnorm_common_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8dd23e0c1c63b684f34bacee87d91201e27dc88
--- /dev/null
+++ b/impl/normalization/groupnorm/groupnorm_common_impl.h
@@ -0,0 +1,467 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_common_impl.h
+ * \brief
+ */
+
+#ifndef IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H
+#define IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H
+
+#include "kernel_tensor.h"
+#include "kernel_pop_stack_buffer.h"
+#include "kernel_tiling/kernel_tiling.h"
+
+namespace AscendC {
+namespace {
+constexpr uint32_t GROUPNORM_MASK_MAX_VAL = 64;
+constexpr uint32_t GROUPNORM_MASK_SMALLEST_VAL = 8;
+constexpr uint32_t GROUPNORM_MASK_STEP_VAL = 8;
+constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 8;
+} // namespace
+
+template <typename T> struct GroupNormParams
+{
+    __aicore__ GroupNormParams(){};
+    LocalTensor<T> tempTensorA;
+    LocalTensor<T> tempTensorB;
+    LocalTensor<T> tempTensorC;
+    LocalTensor<T> meanTmpTensor;
+    LocalTensor<T> varianceTmpTensor;
+};
+
+__aicore__ inline uint32_t GetGroupNormWholeReduceMask1(const GroupNormTiling& tiling)
+{
+    uint32_t mask1{0};
+    if (tiling.dhwAlignSize > GROUPNORM_MASK_MAX_VAL) {
+        mask1 = GROUPNORM_MASK_MAX_VAL;
+        while (mask1 != 0 && tiling.dhwAlignSize % mask1 != 0) {
+            mask1 -= GROUPNORM_MASK_STEP_VAL;
+        }
+        return mask1;
+    }
+    return tiling.dhwAlignSize;
+}
+
+__aicore__ inline void GetGroupNormOutputMean(const LocalTensor<float>& x_in,
+    const LocalTensor<float>& tmp, const LocalTensor<float>& mean,
+    const GroupNormTiling& tiling)
+{
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        ReduceSum(mean[i], x_in[buffIndex], tmp[buffIndex], tiling.dhwAlignSize);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Muls(mean, mean, tiling.factor, tiling.bsCurLength);
+
+    // mean will be read back through GetValue() as a scalar value
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+}
+
+__aicore__ inline void GetGroupNormOutputVar(const LocalTensor<float>& x_in,
+    const LocalTensor<float>& tmp1, const LocalTensor<float>& tmp2,
+    const LocalTensor<float>& mean, const LocalTensor<float>& var, const GroupNormTiling& tiling)
+{
+    for (uint32_t i = 0; i < tiling.d * tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.hwAlignSize;
+        Adds(tmp1[buffIndex], x_in[buffIndex], -1.0f * mean.GetValue(i / tiling.d), tiling.hw);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tmp2, tmp1, tmp1, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        ReduceSum(var[i], tmp2[buffIndex], tmp2[buffIndex], tiling.dhwAlignSize);
+    }
+    PipeBarrier<PIPE_V>();
+
+    Muls(var, var, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GetGroupNormOutputPre(const LocalTensor<float>& inout,
+    const LocalTensor<float>& tmp, const LocalTensor<float>& variance,
+    const GroupNormTiling& tiling, const float epsilon)
+{
+    Adds(tmp, variance, epsilon, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    Ln(tmp, tmp, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    // Multiply by -0.5f to convert the logarithmic result to the logarithm of the reciprocal of the standard deviation
+    Muls(tmp, tmp, -0.5f, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    Exp(tmp, tmp, tiling.bsCurLength);
+
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    // pre norm
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Muls(inout[buffIndex], inout[buffIndex], tmp.GetValue(i), tiling.dhwAlignSize);
+    }
+
+    // tmp will be written later
+    auto eventIdSToV = GetTPipePtr()->FetchEventID(HardEvent::S_V);
+    SetFlag<HardEvent::S_V>(eventIdSToV);
+    WaitFlag<HardEvent::S_V>(eventIdSToV);
+
+    PipeBarrier<PIPE_V>();
+}
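+
+// Note: the Adds -> Ln -> Muls(-0.5f) -> Exp chain above evaluates exp(-0.5 * ln(var + eps)),
+// which equals (var + eps)^(-1/2), the reciprocal of the standard deviation. For example,
+// var + eps = 4.0f yields exp(-0.5 * ln(4.0)) = exp(-0.6931...) = 0.5 = 1 / sqrt(4.0).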
+
+__aicore__ inline void GetGroupNormOutput(const LocalTensor<float>& inout,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const GroupNormTiling& tiling, const int32_t loopCount)
+{
+    size_t channelIndex = loopCount * tiling.meanVarRoundSize * tiling.d;
+    for (uint32_t channel_offset = 0; channel_offset < tiling.bsCurLength * tiling.d; ++channel_offset) {
+        Muls(inout[channel_offset * tiling.hwAlignSize], inout[channel_offset * tiling.hwAlignSize],
+            gamma.GetValue(channelIndex % tiling.c), tiling.hw);
+        channelIndex += 1;
+    }
+    PipeBarrier<PIPE_V>();
+
+    channelIndex = loopCount * tiling.meanVarRoundSize * tiling.d;
+    for (uint32_t channel_offset = 0; channel_offset < tiling.bsCurLength * tiling.d; ++channel_offset) {
+        Adds(inout[channel_offset * tiling.hwAlignSize], inout[channel_offset * tiling.hwAlignSize],
+            beta.GetValue(channelIndex % tiling.c), tiling.hw);
+        channelIndex += 1;
+    }
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GroupNormExe(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& gamma, const LocalTensor<half>& beta,
+    const LocalTensor<half>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const half epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(tempTensorA, 0.0f, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorB, inputX, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputMean(tempTensorB, tempTensorC, outputMean, tiling);
+
+    GetGroupNormOutputVar(tempTensorB, tempTensorB, tempTensorC, outputMean, outputVariance, tiling);
+
+    GetGroupNormOutputPre(tempTensorB, tempTensorA, outputVariance, tiling, static_cast<float>(epsilon));
+
+    Cast(tempTensorA, gamma, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorC, beta, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutput(tempTensorB, tempTensorA, tempTensorC, tiling, loopCount);
+
+    Cast(output, tempTensorB, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+}
+
+__aicore__ inline void GroupNormExe(const LocalTensor<float>& inputX,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const LocalTensor<float>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const float epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+
+    GetGroupNormOutputMean(inputX, output, outputMean, tiling);
+
+    Duplicate(output, 0.0f, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputVar(inputX, output, tempTensorC, outputMean, outputVariance, tiling);
+
+    GetGroupNormOutputPre(output, tempTensorA, outputVariance, tiling, epsilon);
+
+    GetGroupNormOutput(output, gamma, beta, tiling, loopCount);
+}
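+
+// The float path above uses the output tensor itself as scratch for the centered input, and
+// with isReuseSource the tiling also reuses inputX as tempTensorA (numberOfTmpBuf drops from
+// 3 to 2); the half path cannot reuse its operands because the math is carried out in float.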
+
+__aicore__ inline void GroupNormExeSmallShape(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& gamma, const LocalTensor<half>& beta,
+    const LocalTensor<half>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const half epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(tempTensorA, 0.0f, tiling.inputRoundSize * tiling.numberOfTmpBuf);
+    PipeBarrier<PIPE_V>();
+
+    Cast(tempTensorB, inputX, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    uint32_t mask1 = GetGroupNormWholeReduceMask1(tiling);
+    ASCENDC_ASSERT((mask1 > 0), { KERNEL_LOG(KERNEL_ERROR, "mask1 must > 0!"); });
+
+    uint32_t repeat1 = tiling.dhwAlignSize / mask1 * tiling.meanVarRoundSize;
+    uint32_t mask2 = tiling.dhwAlignSize / mask1 * GROUPNORM_MASK_SMALLEST_VAL;
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorC, tempTensorB, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputMean, tempTensorC, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputMean, outputMean, tiling.factor, tiling.bsCurLength);
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Adds(tempTensorB[buffIndex], tempTensorB[buffIndex], -1.0f * outputMean.GetValue(i), tiling.hw, tiling.d,
+            {1, 1, static_cast<uint8_t>(tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE),
+            static_cast<uint8_t>(tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE)});
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tempTensorC, tempTensorB, tempTensorB, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorA, tempTensorC, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputVariance, tempTensorA, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputVariance, outputVariance, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutputPre(tempTensorB, tempTensorA, outputVariance, tiling, static_cast<float>(epsilon));
+
+    Cast(tempTensorA, gamma, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+    Cast(tempTensorC, beta, RoundMode::CAST_NONE, tiling.c);
+    PipeBarrier<PIPE_V>();
+
+    GetGroupNormOutput(tempTensorB, tempTensorA, tempTensorC, tiling, loopCount);
+
+    Cast(output, tempTensorB, RoundMode::CAST_NONE, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+}
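+
+// Worked example for the two-stage reduction above: with dhwAlignSize = 256 floats per group,
+// mask1 = 64, so each repeat of the first WholeReduceSum folds one 64-element chunk into a
+// partial sum (256 / 64 = 4 partials per group); mask2 = 4 * GROUPNORM_MASK_SMALLEST_VAL = 32
+// then lets the second WholeReduceSum collapse those partials into one sum per group, which
+// the Muls with tiling.factor turns into the mean.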
+
+__aicore__ inline void GroupNormExeSmallShape(const LocalTensor<float>& inputX,
+    const LocalTensor<float>& gamma, const LocalTensor<float>& beta,
+    const LocalTensor<float>& output, const LocalTensor<float>& outputMean, const LocalTensor<float>& outputVariance,
+    const float epsilon, const GroupNormTiling& tiling, const GroupNormParams<float>& params, const int32_t loopCount)
+{
+    LocalTensor<float> tempTensorA = params.tempTensorA;
+    LocalTensor<float> tempTensorB = params.tempTensorB;
+    LocalTensor<float> tempTensorC = params.tempTensorC;
+    Duplicate(output, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+    Duplicate(tempTensorC, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+    uint32_t mask1 = GetGroupNormWholeReduceMask1(tiling);
+    ASCENDC_ASSERT((mask1 > 0), { KERNEL_LOG(KERNEL_ERROR, "mask1 must > 0!"); });
+
+    uint32_t repeat1 = tiling.dhwAlignSize / mask1 * tiling.meanVarRoundSize;
+    uint32_t mask2 = tiling.dhwAlignSize / mask1 * GROUPNORM_MASK_SMALLEST_VAL;
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorC, inputX, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputMean, tempTensorC, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputMean, outputMean, tiling.factor, tiling.bsCurLength);
+    auto eventIdVToS = GetTPipePtr()->FetchEventID(HardEvent::V_S);
+    SetFlag<HardEvent::V_S>(eventIdVToS);
+    WaitFlag<HardEvent::V_S>(eventIdVToS);
+
+    auto repeatStride = tiling.hwAlignSize / GROUPNORM_ONE_BLK_SIZE;
+    for (uint32_t i = 0; i < tiling.bsCurLength; ++i) {
+        uint32_t buffIndex = i * tiling.dhwAlignSize;
+        Adds(output[buffIndex], inputX[buffIndex], -1.0f * outputMean.GetValue(i), tiling.hw, tiling.d,
+            {1, 1, static_cast<uint8_t>(repeatStride), static_cast<uint8_t>(repeatStride)});
+    }
+    PipeBarrier<PIPE_V>();
+
+    Mul(tempTensorC, output, output, tiling.bshCurLength);
+    PipeBarrier<PIPE_V>();
+
+    Duplicate(tempTensorA, 0.0f, tiling.inputRoundSize);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(tempTensorA, tempTensorC, mask1, repeat1, GROUPNORM_MASK_SMALLEST_VAL, DEFAULT_BLK_STRIDE,
+        mask1 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    WholeReduceSum(outputVariance, tempTensorA, mask2, tiling.bsCurLength, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE,
+        mask2 / GROUPNORM_MASK_SMALLEST_VAL);
+    PipeBarrier<PIPE_V>();
+
+    Muls(outputVariance, outputVariance, tiling.factor, tiling.bsCurLength);
+    PipeBarrier<PIPE_V>();
+    GetGroupNormOutputPre(output, tempTensorA, outputVariance, tiling, epsilon);
+
+    GetGroupNormOutput(output, gamma, beta, tiling, loopCount);
+}
+
+template <bool isReuseSource = false>
+__aicore__ inline void GetGroupNormNDTensorInfo(const LocalTensor<half>& inputX,
+    const LocalTensor<half>& outputMean, const LocalTensor<half>& outputVariance,
+    const LocalTensor<float>& stackBuffer, const GroupNormTiling& tiling, GroupNormParams<float>& params)
+{
+    params.tempTensorA = stackBuffer[tiling.firstTmpStartPos];
+    params.tempTensorB = stackBuffer[tiling.secondTmpStartPos];
+    params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos];
+    params.meanTmpTensor = stackBuffer[tiling.meanTmpTensorPos];
+    params.varianceTmpTensor = stackBuffer[tiling.varianceTmpTensorPos];
+
+    ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), {
+        KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)",
+            tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize);
+    });
+    ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), {
+        KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tmpBufSize is (%d)",
+            stackBuffer.GetSize(), tiling.tmpBufSize);
+    });
+}
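+
+// Stack buffer layout used by the half path above: [mean | variance | tmpA | tmpB | tmpC],
+// i.e. the float mean/variance intermediates sit at meanTmpTensorPos/varianceTmpTensorPos at
+// the front of the buffer, and the three work buffers start at first/second/thirdTmpStartPos.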
"stackBuffer.GetSize is (%d) should >= tmpBufSize is (%d)", + stackBuffer.GetSize(), tiling.tmpBufSize); + }); +} + +__aicore__ inline void GetOutputMeanVariance(const LocalTensor& outputMean, + const LocalTensor& outputVariance, const GroupNormTiling& tiling, const GroupNormParams& params) +{ + Cast(outputMean, params.meanTmpTensor, RoundMode::CAST_NONE, tiling.n * tiling.g); + Cast(outputVariance, params.varianceTmpTensor, RoundMode::CAST_NONE, tiling.n * tiling.g); +} + +template +__aicore__ inline void GroupNormNDCommon(const LocalTensor& inputX, + const LocalTensor& gamma, const LocalTensor& beta, + const LocalTensor& output, const LocalTensor& outputMean, const LocalTensor& outputVariance, + const T epsilon, GroupNormTiling& tiling, const GroupNormParams& params) +{ + uint32_t inputOffset = 0; + uint32_t mvOffset = 0; + + for (uint32_t index = 0; index < tiling.loopRound; index++) { + if (tiling.smallShape) { + GroupNormExeSmallShape(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, index); + } else { + GroupNormExe(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, index); + } + + inputOffset += tiling.inputRoundSize; + mvOffset += tiling.meanVarRoundSize; + } + + if (tiling.inputTailSize > 0) { + tiling.bshCurLength = tiling.inputTailSize; + tiling.bsCurLength = tiling.meanVarTailSize; + + inputOffset = tiling.inputTailPos; + mvOffset = tiling.meanVarTailPos; + + if (tiling.smallShape) { + GroupNormExeSmallShape(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, tiling.loopRound); + } else { + GroupNormExe(inputX[inputOffset], gamma, beta, output[inputOffset], + params.meanTmpTensor[mvOffset], + params.varianceTmpTensor[mvOffset], epsilon, tiling, params, tiling.loopRound); + } + + // revert to normal round size from tail size, for the next iteration calculation + tiling.bshCurLength = tiling.inputRoundSize; + tiling.bsCurLength = tiling.meanVarRoundSize; + } + + if constexpr (sizeof(T) == sizeof(half)) { + GetOutputMeanVariance(outputMean, outputVariance, tiling, params); + } +} + +template +__aicore__ inline void GroupNormImpl(const LocalTensor& output, + const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputX, const LocalTensor& gamma, const LocalTensor& beta, + const LocalTensor& sharedTmpBuffer, const T epsilon, GroupNormTiling& tiling) +{ + ASCENDC_ASSERT((tiling.oneTmpSize > 0), { KERNEL_LOG(KERNEL_ERROR, "tiling.oneTmpSize must > 0!"); }); + + if ASCEND_IS_AIC { + return; + } + LocalTensor stackBuffer = sharedTmpBuffer.ReinterpretCast(); + ASCENDC_ASSERT((stackBuffer.GetSize() > 0),{ KERNEL_LOG(KERNEL_ERROR, "sharedTmpBuffer Size must > 0!"); }); + + GroupNormParams params; + GetGroupNormNDTensorInfo(inputX, outputMean, outputVariance, stackBuffer, tiling, params); + + GroupNormNDCommon(inputX, gamma, beta, output, outputMean, outputVariance, epsilon, tiling, params); +} + +template +__aicore__ inline void GroupNormImpl(const LocalTensor& output, + const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputX, const LocalTensor& gamma, const LocalTensor& beta, + const T epsilon, GroupNormTiling& tiling) +{ + LocalTensor sharedTmpBuffer; + bool ans = PopStackBuffer(sharedTmpBuffer); + ASCENDC_ASSERT((ans), { 
KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); + + GroupNormImpl(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon, tiling); +} + +} // namespace AscendC +#endif // IMPL_NORMALIZATION_GROUPNORM_GROUPNORM_COMMON_IMPL_H \ No newline at end of file diff --git a/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..369df14cbb8295180504bec61b500aa4b0265ccc --- /dev/null +++ b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp @@ -0,0 +1,204 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file groupnorm_tiling_impl.cpp + * \brief + */ + +#include "lib/normalization/groupnorm_tiling.h" +#include "impl/host_log.h" + +namespace optiling { + REGISTER_TILING_DATA_CLASS(GroupNormTilingOpApi, GroupNormTiling); +} // namespace optiling +namespace AscendC { + namespace { + constexpr uint32_t GROUPNORM_SIZEOF_FLOAT = 4; + constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2; + constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 32; + constexpr uint32_t GROUPNORM_THREE_TIMES = 3; + constexpr uint32_t GROUPNORM_TWO_TIMES = 2; + constexpr uint32_t GROUPNORM_ONE_NUMBER = 1; + constexpr uint32_t GROUPNORM_ZERO_NUMBER = 0; + constexpr float GROUPNORM_ONE_FLOAT_VALUE = 1.0f; + + constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64; + constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8; + constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255; + constexpr uint32_t GROUPNORM_REDUCESUM1_DST_REPEAT_STRIDE = 8; + constexpr uint32_t GROUPNORM_MIN_BSCURLENGHT_IN_ITERATION = 8; + constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64; + constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8; + + uint32_t GetGroupNormTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource, + uint32_t groupNum, const bool isMaxValue) + { + ASCENDC_HOST_ASSERT(typeSize > 0, return 0, "typeSize must be greater than 0."); + ASCENDC_HOST_ASSERT(groupNum > 0, return 0, "groupNum must be greater than 0."); + + std::vector shapeDims = srcShape.GetDims(); + const uint32_t n = static_cast(shapeDims[0]); + const uint32_t c = static_cast(shapeDims[1]); + const uint32_t h = static_cast(shapeDims[2]); + const uint32_t w = static_cast(shapeDims[3]); + ASCENDC_HOST_ASSERT(c != 0, return 0, "the value of c should not be zero!"); + uint32_t mvTmpLen = n * groupNum * sizeof(float); + uint32_t hwLen = h * w * typeSize; + uint32_t dhwLen = {0}; + + mvTmpLen = (mvTmpLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE * + GROUPNORM_ONE_BLK_SIZE; + + if (isMaxValue) { + dhwLen = n * c * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE * + GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float)); + } else { + dhwLen = c / groupNum * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE * + GROUPNORM_ONE_BLK_SIZE / typeSize * 
diff --git a/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..369df14cbb8295180504bec61b500aa4b0265ccc
--- /dev/null
+++ b/impl/normalization/groupnorm/groupnorm_tiling_impl.cpp
@@ -0,0 +1,204 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tiling_impl.cpp
+ * \brief
+ */
+
+#include "lib/normalization/groupnorm_tiling.h"
+#include "impl/host_log.h"
+
+namespace optiling {
+REGISTER_TILING_DATA_CLASS(GroupNormTilingOpApi, GroupNormTiling);
+} // namespace optiling
+namespace AscendC {
+namespace {
+constexpr uint32_t GROUPNORM_SIZEOF_FLOAT = 4;
+constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2;
+constexpr uint32_t GROUPNORM_ONE_BLK_SIZE = 32;
+constexpr uint32_t GROUPNORM_THREE_TIMES = 3;
+constexpr uint32_t GROUPNORM_TWO_TIMES = 2;
+constexpr uint32_t GROUPNORM_ONE_NUMBER = 1;
+constexpr uint32_t GROUPNORM_ZERO_NUMBER = 0;
+constexpr float GROUPNORM_ONE_FLOAT_VALUE = 1.0f;
+
+constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64;
+constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8;
+constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255;
+constexpr uint32_t GROUPNORM_REDUCESUM1_DST_REPEAT_STRIDE = 8;
+constexpr uint32_t GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION = 8;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8;
+
+uint32_t GetGroupNormTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+    uint32_t groupNum, const bool isMaxValue)
+{
+    ASCENDC_HOST_ASSERT(typeSize > 0, return 0, "typeSize must be greater than 0.");
+    ASCENDC_HOST_ASSERT(groupNum > 0, return 0, "groupNum must be greater than 0.");
+
+    std::vector<int64_t> shapeDims = srcShape.GetDims();
+    const uint32_t n = static_cast<uint32_t>(shapeDims[0]);
+    const uint32_t c = static_cast<uint32_t>(shapeDims[1]);
+    const uint32_t h = static_cast<uint32_t>(shapeDims[2]);
+    const uint32_t w = static_cast<uint32_t>(shapeDims[3]);
+    ASCENDC_HOST_ASSERT(c != 0, return 0, "the value of c should not be zero!");
+    uint32_t mvTmpLen = n * groupNum * sizeof(float);
+    uint32_t hwLen = h * w * typeSize;
+    uint32_t dhwLen = 0;
+
+    mvTmpLen = (mvTmpLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+        GROUPNORM_ONE_BLK_SIZE;
+
+    if (isMaxValue) {
+        dhwLen = n * c * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+            GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float));
+    } else {
+        dhwLen = c / groupNum * ((hwLen + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) / GROUPNORM_ONE_BLK_SIZE *
+            GROUPNORM_ONE_BLK_SIZE / typeSize * sizeof(float));
+    }
+
+    if (isReuseSource && (typeSize == GROUPNORM_SIZEOF_FLOAT)) {
+        return GROUPNORM_TWO_TIMES * dhwLen + GROUPNORM_TWO_TIMES * mvTmpLen;
+    }
+    return GROUPNORM_THREE_TIMES * dhwLen + GROUPNORM_TWO_TIMES * mvTmpLen;
+}
+} // namespace
+
+void GetGroupNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+    const uint32_t groupNum, uint32_t& maxValue, uint32_t& minValue)
+{
+    maxValue = GetGroupNormTmpSize(srcShape, typeSize, isReuseSource, groupNum, true);
+    minValue = GetGroupNormTmpSize(srcShape, typeSize, isReuseSource, groupNum, false);
+}
+
+void GetGroupNormNDTilingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize,
+    const bool isReuseSource, const uint32_t groupNum, optiling::GroupNormTiling& tiling)
+{
+    ASCENDC_HOST_ASSERT(typeSize > 0, return, "typeSize must be greater than 0.");
+    ASCENDC_HOST_ASSERT(groupNum > 0, return, "groupNum must be greater than 0.");
+
+    std::vector<int64_t> shapeDims = srcShape.GetDims();
+    const uint32_t n = static_cast<uint32_t>(shapeDims[0]);
+    const uint32_t c = static_cast<uint32_t>(shapeDims[1]);
+    const uint32_t h = static_cast<uint32_t>(shapeDims[2]);
+    const uint32_t w = static_cast<uint32_t>(shapeDims[3]);
+    const uint32_t g = groupNum;
+    const uint32_t d = c / groupNum;
+    ASCENDC_HOST_ASSERT(c != 0, return, "the value of c should not be zero!");
+    // align HW to 32 bytes
+    const uint32_t hwAlignSize = (typeSize * h * w + GROUPNORM_ONE_BLK_SIZE - GROUPNORM_ONE_NUMBER) /
+        GROUPNORM_ONE_BLK_SIZE * GROUPNORM_ONE_BLK_SIZE / typeSize;
+
+    const uint32_t dhwAlignSize = d * hwAlignSize;
+
+    const uint32_t inputXSize = n * c * hwAlignSize;
+    const uint32_t meanVarSize = n * g;
+
+    const uint32_t oneBlockNum = GROUPNORM_ONE_BLK_SIZE / GROUPNORM_SIZEOF_FLOAT;
+    const uint32_t meanTmpTensorSize = (meanVarSize + oneBlockNum - GROUPNORM_ONE_NUMBER) / oneBlockNum * oneBlockNum;
+
+    uint32_t meanVarTotalSize = 2 * meanTmpTensorSize;
+    if (typeSize == GROUPNORM_SIZEOF_FLOAT) {
+        meanVarTotalSize = GROUPNORM_ZERO_NUMBER;
+    }
+
+    uint32_t numberOfTmpBuf = GROUPNORM_THREE_TIMES;
+    if (isReuseSource && (typeSize == GROUPNORM_SIZEOF_FLOAT)) {
+        numberOfTmpBuf = GROUPNORM_TWO_TIMES;
+    }
+
+    const uint32_t tmpBufSize = stackBufferSize / GROUPNORM_ONE_BLK_SIZE * GROUPNORM_ONE_BLK_SIZE /
+        GROUPNORM_SIZEOF_FLOAT;
+    uint32_t oneTmpSize = (tmpBufSize - meanVarTotalSize) / numberOfTmpBuf;
+
+    // To let the mean/variance tensors be consumed with Add directly (no GetValue), each
+    // iteration must process a multiple of 8 groups.
+    uint32_t bsCurLength = oneTmpSize / (GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION * d * hwAlignSize) *
+        GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+
+    // check whether the small-shape path applies
+    uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE;
+    while ((dhwAlignSize / (GROUPNORM_ONE_BLK_SIZE / GROUPNORM_SIZEOF_FLOAT)) % k != 0) {
+        k--;
+    }
+    const bool smallShape = (hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) &&
+        (hwAlignSize * d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k);
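+    // e.g. H = W = 8 with float gives hwAlignSize = 64 <= 64, and with d = 4 channels per
+    // group dhwAlignSize = 256, so k stays 8 and 256 <= 64 * 8 holds: the shape is "small".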
+
+    // Constraints from the level-0 ReduceSum interface: derive the mask/repeat of the two
+    // ReduceSum calls from DHW, together with the valid range of DHW/bsCurLength.
+    if (smallShape) {
+        uint32_t mask1{GROUPNORM_MAX_MASK_VAL};
+        if (dhwAlignSize > GROUPNORM_MAX_MASK_VAL) {
+            while (mask1 != 0 && dhwAlignSize % mask1 != 0) {
+                mask1 -= GROUPNORM_STEP_MASK_VAL;
+            }
+        } else {
+            mask1 = dhwAlignSize;
+        }
+        ASCENDC_HOST_ASSERT(mask1 > 0, return, "mask1 must be greater than 0.");
+        const uint32_t maxBsCurLength = (GROUPNORM_MAX_REPEAT_VAL / (dhwAlignSize / mask1) /
+            GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION) * GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+        if (maxBsCurLength < bsCurLength) {
+            bsCurLength = maxBsCurLength;
+        }
+    }
+
+    // For half, gamma/beta are cast into a tmp buffer of oneTmpSize floats, so the buffer
+    // must be able to hold all C channel values.
+    if (typeSize == GROUPNORM_SIZEOF_HALF && bsCurLength * dhwAlignSize < c) {
+        return;
+    }
+
+    oneTmpSize = bsCurLength * d * hwAlignSize;
+
+    if (oneTmpSize > inputXSize) {
+        bsCurLength = meanVarSize;
+        oneTmpSize = inputXSize;
+    }
+
+    ASCENDC_HOST_ASSERT((oneTmpSize != GROUPNORM_ZERO_NUMBER), return, "the oneTmpSize should not be zero!");
+    if (oneTmpSize == GROUPNORM_ZERO_NUMBER) {
+        return;
+    }
+
+    const uint32_t inputRoundSize = oneTmpSize;
+    const uint32_t inputTailSize = inputXSize % oneTmpSize;
+
+    const uint32_t meanVarRoundSize = inputRoundSize / dhwAlignSize;
+    const uint32_t meanVarTailSize = inputTailSize / dhwAlignSize;
+
+    tiling.set_n(n);
+    tiling.set_c(c);
+    tiling.set_hw(h * w);
+    tiling.set_g(g);
+    tiling.set_d(d);
+    tiling.set_hwAlignSize(hwAlignSize);
+    tiling.set_dhwAlignSize(dhwAlignSize);
+    tiling.set_inputXSize(inputXSize);
+    tiling.set_meanVarSize(meanVarSize);
+    tiling.set_numberOfTmpBuf(numberOfTmpBuf);
+    tiling.set_meanTmpTensorPos(GROUPNORM_ZERO_NUMBER);
+    tiling.set_meanTmpTensorSize(meanTmpTensorSize);
+    tiling.set_varianceTmpTensorPos(meanTmpTensorSize);
+    tiling.set_varianceTmpTensorSize(meanTmpTensorSize);
+    tiling.set_tmpBufSize(tmpBufSize);
+    tiling.set_oneTmpSize(oneTmpSize);
+    tiling.set_firstTmpStartPos(meanVarTotalSize);
+    tiling.set_secondTmpStartPos(meanVarTotalSize + oneTmpSize);
+    tiling.set_thirdTmpStartPos(meanVarTotalSize + GROUPNORM_TWO_TIMES * oneTmpSize);
+    tiling.set_loopRound(inputXSize / oneTmpSize);
+    tiling.set_inputRoundSize(inputRoundSize);
+    tiling.set_inputTailSize(inputTailSize);
+    tiling.set_inputTailPos(inputXSize - inputTailSize);
+    tiling.set_meanVarRoundSize(meanVarRoundSize);
+    tiling.set_meanVarTailSize(meanVarTailSize);
+    tiling.set_meanVarTailPos(meanVarSize - meanVarTailSize);
+    tiling.set_bshCurLength(inputRoundSize);
+    tiling.set_bsCurLength(bsCurLength);
+    tiling.set_factor(GROUPNORM_ONE_FLOAT_VALUE / (d * h * w));
+    tiling.set_smallShape(smallShape);
+}
+} // namespace AscendC
\ No newline at end of file
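A hedged host-side sketch of how the two entry points combine (shape and group count mirror the unit test at the bottom of this patch; `PrepareGroupNormTiling` is a hypothetical helper):

```cpp
#include "lib/normalization/groupnorm_tiling.h"

void PrepareGroupNormTiling(optiling::GroupNormTiling& tiling)
{
    ge::Shape srcShape({2, 16, 8, 8});  // [N, C, H, W]
    const uint32_t typeSize = sizeof(float);
    const uint32_t groupNum = 4;

    uint32_t maxValue = 0;
    uint32_t minValue = 0;
    AscendC::GetGroupNormMaxMinTmpSize(srcShape, typeSize, false, groupNum, maxValue, minValue);

    // Any stack buffer of at least minValue bytes works; here we assume the
    // caller reserves maxValue bytes of temporary space.
    AscendC::GetGroupNormNDTilingInfo(srcShape, maxValue, typeSize, false, groupNum, tiling);
}
```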
diff --git a/lib/normalization/groupnorm.h b/lib/normalization/groupnorm.h
new file mode 100644
index 0000000000000000000000000000000000000000..a80458c9e7c3b94bb698e985b2db18f9bc779469
--- /dev/null
+++ b/lib/normalization/groupnorm.h
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_H
+#define LIB_NORMALIZATION_GROUPNORM_H
+#if __CCE_AICORE__ == 220
+
+#include "kernel_tensor.h"
+#include "../../impl/normalization/groupnorm/groupnorm_common_impl.h"
+#include "kernel_tiling/kernel_tiling.h"
+namespace AscendC {
+#pragma begin_pipe(V)
+/*!
+ * \brief Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization.
+ *
+ * \note supported data types: half and float
+ *
+ * \param [out] output, output LocalTensor, shape is [n, C, H, W]
+ * \param [out] outputMean, output LocalTensor, shape is [n, groupNum]
+ * \param [out] outputVariance, output LocalTensor, shape is [n, groupNum]
+ * \param [in] inputX, input LocalTensor, shape is [n, C, H, W]
+ * \param [in] gamma, input LocalTensor, shape is [C]
+ * \param [in] beta, input LocalTensor, shape is [C]
+ * \param [in] sharedTmpBuffer, input local temporary Tensor
+ * \param [in] epsilon, small constant added to the variance for numerical stability
+ * \param [in] tiling, GroupNorm tiling
+ */
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean,
+    const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma,
+    const LocalTensor<T>& beta, const LocalTensor<uint8_t>& sharedTmpBuffer, const T epsilon, GroupNormTiling& tiling)
+{
+    GroupNormImpl<T, isReuseSource>(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon,
+        tiling);
+}
+
+/*!
+ * \brief Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization.
+ *
+ * \note supported data types: half and float
+ *
+ * \param [out] output, output LocalTensor, shape is [n, C, H, W]
+ * \param [out] outputMean, output LocalTensor, shape is [n, groupNum]
+ * \param [out] outputVariance, output LocalTensor, shape is [n, groupNum]
+ * \param [in] inputX, input LocalTensor, shape is [n, C, H, W]
+ * \param [in] gamma, input LocalTensor, shape is [C]
+ * \param [in] beta, input LocalTensor, shape is [C]
+ * \param [in] epsilon, small constant added to the variance for numerical stability
+ * \param [in] tiling, GroupNorm tiling
+ */
+template <typename T, bool isReuseSource = false>
+__aicore__ inline void GroupNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean,
+    const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma,
+    const LocalTensor<T>& beta, const T epsilon, GroupNormTiling& tiling)
+{
+    GroupNormImpl<T, isReuseSource>(output, outputMean, outputVariance, inputX, gamma, beta, epsilon, tiling);
+}
+#pragma end_pipe
+} // namespace AscendC
+#endif
+#endif // LIB_NORMALIZATION_GROUPNORM_H
\ No newline at end of file
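For orientation, a minimal kernel-side sketch of calling the second overload (the wrapper name and the assumption that all operands already reside in the unified buffer are ours; see the unit test below for a complete setup):

```cpp
#include "kernel_operator.h"
#include "lib/normalization/groupnorm.h"

// Hypothetical wrapper: y, mean, var, x, gamma, beta are LocalTensor<half> views that a real
// kernel would first fill via DataCopy/DataCopyPad; the tiling comes from the host side.
__aicore__ inline void CallGroupNorm(const AscendC::LocalTensor<half>& y,
    const AscendC::LocalTensor<half>& mean, const AscendC::LocalTensor<half>& var,
    const AscendC::LocalTensor<half>& x, const AscendC::LocalTensor<half>& gamma,
    const AscendC::LocalTensor<half>& beta, GroupNormTiling& tiling)
{
    const half epsilon = 0.001;
    // Without an explicit sharedTmpBuffer this overload pops its scratch
    // space from the framework's stack buffer internally.
    AscendC::GroupNorm<half, false>(y, mean, var, x, gamma, beta, epsilon, tiling);
}
```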
diff --git a/lib/normalization/groupnorm_tiling.h b/lib/normalization/groupnorm_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc5e0da8f149729b58d34a29e6c26d41b9640727
--- /dev/null
+++ b/lib/normalization/groupnorm_tiling.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tiling.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_TILING_H
+#define LIB_NORMALIZATION_GROUPNORM_TILING_H
+#include "graph/tensor.h"
+#include "groupnorm_tilingdata.h"
+namespace AscendC {
+/*!
+ * \brief Calculates the max and min tmp buffer sizes for the GroupNorm interface.
+ * \param [in] srcShape: input shape
+ * \param [in] typeSize: data type size: sizeof(TYPE)
+ * \param [in] isReuseSource: indicates whether to reuse the source tensor.
+ *             When isReuseSource is enabled, the src tensor is used as a tmp buffer for the calculation.
+ * \param [in] groupNum: number of groups to separate the channels into
+ * \param [out] maxValue: max size required for the tmp buffer
+ * \param [out] minValue: min size required for the tmp buffer
+ */
+void GetGroupNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource,
+    const uint32_t groupNum, uint32_t& maxValue, uint32_t& minValue);
+
+/*!
+ * \brief Calculates the tiling params for the GroupNorm interface.
+ *
+ * \note stackBufferSize should be greater than the min tmp size reported by GetGroupNormMaxMinTmpSize;
+ *       if the src shape is illegal or stackBufferSize is not big enough, the tiling is left unfilled.
+ *
+ * \param [in] srcShape input shape
+ * \param [in] stackBufferSize input stack buffer size in units of bytes, used as the tmp buffer size for tiling
+ * \param [in] typeSize data type size: sizeof(TYPE)
+ * \param [in] isReuseSource indicates whether intermediate variables can reuse the input memory
+ * \param [in] groupNum: number of groups to separate the channels into
+ * \param [out] tiling GroupNorm tiling
+ */
+void GetGroupNormNDTilingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize,
+    const bool isReuseSource, const uint32_t groupNum, optiling::GroupNormTiling& tiling);
+} // namespace AscendC
+#endif // LIB_NORMALIZATION_GROUPNORM_TILING_H
diff --git a/lib/normalization/groupnorm_tilingdata.h b/lib/normalization/groupnorm_tilingdata.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f12c3358ac2c787eefe1085afa2f44febbd99f9
--- /dev/null
+++ b/lib/normalization/groupnorm_tilingdata.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file groupnorm_tilingdata.h
+ * \brief
+ */
+
+#ifndef LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
+#define LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(GroupNormTiling)
+    TILING_DATA_FIELD_DEF(uint32_t, n);
+    TILING_DATA_FIELD_DEF(uint32_t, c);
+    TILING_DATA_FIELD_DEF(uint32_t, hw);
+    TILING_DATA_FIELD_DEF(uint32_t, g);
+    TILING_DATA_FIELD_DEF(uint32_t, d);
+    TILING_DATA_FIELD_DEF(uint32_t, hwAlignSize);
+    TILING_DATA_FIELD_DEF(uint32_t, dhwAlignSize);
+    TILING_DATA_FIELD_DEF(uint32_t, inputXSize);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarSize);
+    TILING_DATA_FIELD_DEF(uint32_t, numberOfTmpBuf);
+    TILING_DATA_FIELD_DEF(uint32_t, meanTmpTensorPos);
+    TILING_DATA_FIELD_DEF(uint32_t, meanTmpTensorSize);
+    TILING_DATA_FIELD_DEF(uint32_t, varianceTmpTensorPos);
+    TILING_DATA_FIELD_DEF(uint32_t, varianceTmpTensorSize);
+    TILING_DATA_FIELD_DEF(uint32_t, tmpBufSize);
+    TILING_DATA_FIELD_DEF(uint32_t, oneTmpSize);
+    TILING_DATA_FIELD_DEF(uint32_t, firstTmpStartPos);
+    TILING_DATA_FIELD_DEF(uint32_t, secondTmpStartPos);
+    TILING_DATA_FIELD_DEF(uint32_t, thirdTmpStartPos);
+    TILING_DATA_FIELD_DEF(uint32_t, loopRound);
+    TILING_DATA_FIELD_DEF(uint32_t, inputRoundSize);
+    TILING_DATA_FIELD_DEF(uint32_t, inputTailSize);
+    TILING_DATA_FIELD_DEF(uint32_t, inputTailPos);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarRoundSize);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarTailSize);
+    TILING_DATA_FIELD_DEF(uint32_t, meanVarTailPos);
+    TILING_DATA_FIELD_DEF(uint32_t, bshCurLength);
+    TILING_DATA_FIELD_DEF(uint32_t, bsCurLength);
+    TILING_DATA_FIELD_DEF(float, factor);
+    TILING_DATA_FIELD_DEF(bool, smallShape);
+END_TILING_DATA_DEF;
+} // namespace optiling
+#endif // LIB_NORMALIZATION_GROUPNORM_TILINGDATA_H
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6d2e7694d3ebae87d71c1eefb1cf935f55ce6e28..80d3de2f3926b0a84fc46dd652a930f87a3072b2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -78,6 +78,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES
     ${ASCENDC_TESTS_DIR}/normalization/batchnorm/test_operator_batchnorm.cpp
     ${ASCENDC_TESTS_DIR}/normalization/deepnorm/test_operator_deepnorm.cpp
     ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernorm.cpp
+    ${ASCENDC_TESTS_DIR}/normalization/groupnorm/test_operator_groupnorm.cpp
     # ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgrad.cpp
     ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgradbeta.cpp
     ${ASCENDC_TESTS_DIR}/normalization/rmsnorm/test_operator_rmsnorm.cpp
diff --git a/tests/normalization/groupnorm/test_operator_groupnorm.cpp b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66088fdea6caefc167e6b72252d3a325ab4e437f
--- /dev/null
+++ b/tests/normalization/groupnorm/test_operator_groupnorm.cpp
@@ -0,0 +1,312 @@
+/**
+ * Copyright (c) 2024 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file test_operator_groupnorm.cpp
+ * \brief
+ */
+
+#include <gtest/gtest.h>
+#define private public
+#define protected public
+#include "kernel_operator.h"
+
+using namespace std;
+using namespace AscendC;
+constexpr uint32_t GROUPNORM_SIZEOF_HALF = 2;
+
+constexpr uint32_t GROUPNORM_MAX_MASK_VAL = 64;
+constexpr uint32_t GROUPNORM_STEP_MASK_VAL = 8;
+constexpr uint32_t GROUPNORM_MAX_REPEAT_VAL = 255;
+constexpr uint32_t GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION = 8;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_FLOAT_NUM = 64;
+constexpr uint32_t GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE = 8;
+
+__aicore__ inline void GetGroupNormNDTilingInfo(const ShapeInfo& inputShapeInfo, const uint32_t stackBufferSize,
+    const uint32_t typeSize, const bool isReuseSource, const uint32_t groupNum, GroupNormTiling& tiling)
+{
+    uint32_t n = inputShapeInfo.shape[0];
+    uint32_t c = inputShapeInfo.shape[1];
+    uint32_t h = inputShapeInfo.shape[2];
+    uint32_t w = inputShapeInfo.shape[3];
+
+    tiling.n = n;
+    tiling.c = c;
+    tiling.hw = h * w;
+    tiling.g = groupNum;
+    tiling.d = c / tiling.g;
+    tiling.hwAlignSize = (typeSize * tiling.hw + ONE_BLK_SIZE - 1) /
+        ONE_BLK_SIZE * ONE_BLK_SIZE / typeSize;
+    tiling.dhwAlignSize = tiling.d * tiling.hwAlignSize;
+
+    tiling.inputXSize = n * c * tiling.hwAlignSize;
+    tiling.meanVarSize = n * tiling.g;
+
+    uint32_t oneBlockNum = ONE_BLK_SIZE / B32_BYTE_SIZE;
+    tiling.meanTmpTensorPos = 0;
+    tiling.meanTmpTensorSize = (tiling.meanVarSize + oneBlockNum - 1) / oneBlockNum * oneBlockNum;
+    tiling.varianceTmpTensorPos = tiling.meanTmpTensorSize;
+    tiling.varianceTmpTensorSize = tiling.meanTmpTensorSize;
+
+    uint32_t meanVarTotalSize = tiling.meanTmpTensorSize + tiling.varianceTmpTensorSize;
+    if (typeSize == B32_BYTE_SIZE) {
+        meanVarTotalSize = 0;
+    }
+
+    tiling.numberOfTmpBuf = THREE_OF_STACK_BUFFER;
+    if (isReuseSource && (typeSize == B32_BYTE_SIZE)) {
+        tiling.numberOfTmpBuf = TWO_OF_STACK_BUFFER;
+    }
+
+    tiling.tmpBufSize = stackBufferSize / ONE_BLK_SIZE * ONE_BLK_SIZE / B32_BYTE_SIZE;
+    tiling.oneTmpSize = (tiling.tmpBufSize - meanVarTotalSize) / tiling.numberOfTmpBuf;
+
+    // To let the mean/variance tensors be consumed with Add directly (no GetValue), each
+    // iteration must process a multiple of 8 groups.
+    tiling.bsCurLength = tiling.oneTmpSize / (GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION * tiling.d * tiling.hwAlignSize) *
+        GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+
+    // check whether the small-shape path applies
+    uint32_t k = GROUPNORM_REDUCESUM_MAX_REPEAT_SMALLSHAPE;
+    while ((tiling.dhwAlignSize / (ONE_BLK_SIZE / B32_BYTE_SIZE)) % k != 0) {
+        k--;
+    }
+    tiling.smallShape = (tiling.hwAlignSize <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM) &&
+        (tiling.hwAlignSize * tiling.d <= GROUPNORM_REDUCESUM_MAX_FLOAT_NUM * k);
+
+    // Constraints from the level-0 ReduceSum interface: derive the mask/repeat of the two
+    // ReduceSum calls from DHW, together with the valid range of DHW/bsCurLength.
+    if (tiling.smallShape) {
+        uint32_t mask1{GROUPNORM_MAX_MASK_VAL};
+        if (tiling.dhwAlignSize > GROUPNORM_MAX_MASK_VAL) {
+            while (mask1 != 0 && tiling.dhwAlignSize % mask1 != 0) {
+                mask1 -= GROUPNORM_STEP_MASK_VAL;
+            }
+        } else {
+            mask1 = tiling.dhwAlignSize;
+        }
+        uint32_t max_bsCurLength = (GROUPNORM_MAX_REPEAT_VAL / (tiling.dhwAlignSize / mask1) /
+            GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION) * GROUPNORM_MIN_BSCURLENGTH_IN_ITERATION;
+        if (max_bsCurLength < tiling.bsCurLength) {
+            tiling.bsCurLength = max_bsCurLength;
+        }
+    }
+
+    if (typeSize == GROUPNORM_SIZEOF_HALF && tiling.bsCurLength * tiling.dhwAlignSize < c) {
+        return;
+    }
+
+    tiling.oneTmpSize = tiling.bsCurLength * tiling.d * tiling.hwAlignSize;
+
+    if (tiling.oneTmpSize > tiling.inputXSize) {
+        tiling.bsCurLength = tiling.meanVarSize;
+        tiling.oneTmpSize = tiling.inputXSize;
+    }
+
+    if (tiling.oneTmpSize == 0) {
+        return;
+    }
+
+    tiling.firstTmpStartPos = meanVarTotalSize;
+    tiling.secondTmpStartPos = tiling.firstTmpStartPos + tiling.oneTmpSize;
+    tiling.thirdTmpStartPos = tiling.secondTmpStartPos + tiling.oneTmpSize;
+
+    tiling.loopRound = tiling.inputXSize / tiling.oneTmpSize;
+
+    tiling.inputRoundSize = tiling.oneTmpSize;
+    tiling.inputTailSize = tiling.inputXSize % tiling.oneTmpSize;
+    tiling.inputTailPos = tiling.inputXSize - tiling.inputTailSize;
+
+    tiling.meanVarRoundSize = tiling.inputRoundSize / tiling.dhwAlignSize;
+    tiling.meanVarTailSize = tiling.inputTailSize / tiling.dhwAlignSize;
+    tiling.meanVarTailPos = tiling.meanVarSize - tiling.meanVarTailSize;
+
+    tiling.bshCurLength = tiling.inputRoundSize;
+
+    tiling.factor = 1.0f / (tiling.d * tiling.hw);
+    cout << tiling.n << ", " << tiling.c << ", " << tiling.hw << ", " << tiling.g << ", " << tiling.hwAlignSize << endl;
+    cout << "inputXSize: " << tiling.inputXSize << endl;
+    cout << "meanVarSize: " << tiling.meanVarSize << endl;
+    cout << "numberOfTmpBuf: " << tiling.numberOfTmpBuf << endl;
+    cout << "meanTmpTensorPos: " << tiling.meanTmpTensorPos << endl;
+    cout << "varianceTmpTensorPos: " << tiling.varianceTmpTensorPos << endl;
+    cout << "oneTmpSize: " << tiling.oneTmpSize << endl;
+    cout << "firstTmpStartPos: " << tiling.firstTmpStartPos << endl;
+    cout << "thirdTmpStartPos: " << tiling.thirdTmpStartPos << endl;
+    cout << "bsCurLength: " << tiling.bsCurLength << endl;
+    cout << "bshCurLength: " << tiling.bshCurLength << endl;
+    cout << "factor: " << tiling.factor << endl;
+    cout << "hwAlignSize: " << tiling.hwAlignSize << endl;
+    cout << "smallShape: " << tiling.smallShape << endl;
+}
+
+// __aicore__ inline void MainGroupnormTest(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm, GM_ADDR outputGm,
+//     GM_ADDR outputMeanGm, GM_ADDR outputVarianceGm, uint32_t n, uint32_t c, uint32_t h, uint32_t w, uint32_t g)
+template <typename dataType, bool isReuseSource = false>
+__aicore__ inline void MainGroupnormTest(GM_ADDR inputXGm, GM_ADDR gammGm, GM_ADDR betaGm, GM_ADDR outputGm,
+    uint32_t n, uint32_t c, uint32_t h, uint32_t w, uint32_t g)
+{
+    dataType epsilon = 0.001;
+    DataFormat dataFormat = DataFormat::ND;
+
+    GlobalTensor<dataType> inputXGlobal;
+    GlobalTensor<dataType> gammGlobal;
+    GlobalTensor<dataType> betaGlobal;
+    GlobalTensor<dataType> outputGlobal;
+    // GlobalTensor<dataType> outputMeanGlobal;
+    // GlobalTensor<dataType> outputVarianceGlobal;
+
+    uint32_t bshLength = n * c * h * w;
+    uint32_t bsLength = g * n;
+
+    inputXGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(inputXGm), bshLength);
+    gammGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(gammGm), c);
+    betaGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(betaGm), c);
+
+    outputGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputGm), bshLength);
+    // outputMeanGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputMeanGm), bsLength);
+    // outputVarianceGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ dataType*>(outputVarianceGm), bsLength);
+
+    TPipe pipe;
+    TQue<TPosition::VECIN, 1> inQueueX;
+    TQue<TPosition::VECIN, 1> inQueueGamma;
+    TQue<TPosition::VECIN, 1> inQueueBeta;
+    TQue<TPosition::VECOUT, 1> outQueue;
+    // TQue<TPosition::VECOUT, 1> outQueueMean;
+    // TQue<TPosition::VECOUT, 1> outQueueVariance;
+    TBuf<TPosition::VECCALC> meanBuffer, varBuffer;
+
+    uint32_t hwAlignSize = (sizeof(dataType) * h * w + ONE_BLK_SIZE - 1) / ONE_BLK_SIZE * ONE_BLK_SIZE /
+        sizeof(dataType);
+    pipe.InitBuffer(inQueueX, 1, sizeof(dataType) * n * c * hwAlignSize);
+    pipe.InitBuffer(inQueueGamma, 1, (sizeof(dataType) * c + 31) / 32 * 32);
+    pipe.InitBuffer(inQueueBeta, 1, (sizeof(dataType) * c + 31) / 32 * 32);
+    pipe.InitBuffer(outQueue, 1, sizeof(dataType) * n * c * hwAlignSize);
+    // pipe.InitBuffer(outQueueMean, 1, (sizeof(dataType) * g * n + 31) / 32 * 32);
+    // pipe.InitBuffer(outQueueVariance, 1, (sizeof(dataType) * g * n + 31) / 32 * 32);
+    pipe.InitBuffer(meanBuffer, (sizeof(dataType) * g * n + 31) / 32 * 32);
+    pipe.InitBuffer(varBuffer, (sizeof(dataType) * g * n + 31) / 32 * 32);
+
+    LocalTensor<dataType> inputXLocal = inQueueX.AllocTensor<dataType>();
+    LocalTensor<dataType> gammaLocal = inQueueGamma.AllocTensor<dataType>();
+    LocalTensor<dataType> betaLocal = inQueueBeta.AllocTensor<dataType>();
+    LocalTensor<dataType> outputLocal = outQueue.AllocTensor<dataType>();
+    // LocalTensor<dataType> meanLocal = outQueueMean.AllocTensor<dataType>();
+    // LocalTensor<dataType> varianceLocal = outQueueVariance.AllocTensor<dataType>();
+    LocalTensor<dataType> meanLocal = meanBuffer.Get<dataType>();
+    LocalTensor<dataType> varianceLocal = varBuffer.Get<dataType>();
+
+    DataCopyParams copyParams{static_cast<uint16_t>(n * c), static_cast<uint16_t>(h * w * sizeof(dataType)), 0, 0};
+    DataCopyPadParams padParams{true, 0, static_cast<uint8_t>(hwAlignSize - h * w), 0};
+    DataCopyPad(inputXLocal, inputXGlobal, copyParams, padParams);
+    // DataCopy(inputXLocal, inputXGlobal, bshLength);
+    DataCopyParams copyParamsGamma{1, static_cast<uint16_t>(c * sizeof(dataType)), 0, 0};
+    DataCopyPadParams padParamsGamma{false, 0, 0, 0};
+    DataCopyPad(gammaLocal, gammGlobal, copyParamsGamma, padParamsGamma);
+    DataCopyPad(betaLocal, betaGlobal, copyParamsGamma, padParamsGamma);
+
+    // DataCopy(gammaLocal, gammGlobal, c);
+    // DataCopy(betaLocal, betaGlobal, c);
+    PipeBarrier<PIPE_ALL>();
+
+    uint32_t stackBufferSize = 0;
+    {
+        LocalTensor<uint8_t> stackBuffer;
+        bool ans = PopStackBuffer<uint8_t, TPosition::LCM>(stackBuffer);
+        stackBufferSize = stackBuffer.GetSize();
+    }
+
+    GroupNormTiling groupNormTiling;
+    uint32_t inputShape[4] = {n, c, h, w};
+    ShapeInfo shapeInfo{ (uint8_t)4, inputShape, (uint8_t)4, inputShape, dataFormat };
+
+    GetGroupNormNDTilingInfo(shapeInfo, stackBufferSize, sizeof(dataType), isReuseSource, g, groupNormTiling);
+
+    GroupNorm<dataType, isReuseSource>(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal,
+        (dataType)epsilon, groupNormTiling);
+    PipeBarrier<PIPE_ALL>();
+
+    // DataCopy(outputGlobal, outputLocal, bshLength);
+    DataCopyPad(outputGlobal, outputLocal, copyParams);
+    // DataCopy(outputMeanGlobal, meanLocal, bsLength);
+    // DataCopy(outputVarianceGlobal, varianceLocal, bsLength);
+
+    inQueueX.FreeTensor(inputXLocal);
+    inQueueGamma.FreeTensor(gammaLocal);
+    inQueueBeta.FreeTensor(betaLocal);
+    outQueue.FreeTensor(outputLocal);
+    // outQueueMean.FreeTensor(meanLocal);
+    // outQueueVariance.FreeTensor(varianceLocal);
+    PipeBarrier<PIPE_ALL>();
+}
+
+struct groupnormTestParams {
+    uint32_t n;
+    uint32_t c;
+    uint32_t h;
+    uint32_t w;
+    uint32_t g;
+    uint32_t typeSize;
+    void (*cal_func)(uint8_t*, uint8_t*, uint8_t*, uint8_t*, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);
+};
+
+class groupnormTestSuite : public testing::Test, public testing::WithParamInterface<groupnormTestParams> {
+protected:
+    static void SetUpTestCase()
+    {
+        std::cout << "groupnormTestSuite SetUpTestCase" << std::endl;
+    }
+    static void TearDownTestCase()
+    {
+        std::cout << "groupnormTestSuite TearDownTestCase" << std::endl;
+    }
+    virtual void SetUp() {}
+    virtual void TearDown() {}
+};
+
+INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_groupnorm, groupnormTestSuite,
+    ::testing::Values(
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(float), MainGroupnormTest<float> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(half), MainGroupnormTest<half> },
+        groupnormTestParams { 2, 16, 9, 9, 4, sizeof(float), MainGroupnormTest<float> },
+        groupnormTestParams { 2, 16, 9, 9, 4, sizeof(half), MainGroupnormTest<half> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(float), MainGroupnormTest<float, true> },
+        groupnormTestParams { 2, 16, 9, 9, 4, sizeof(float), MainGroupnormTest<float, true> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(half), MainGroupnormTest<half, true> },
+        groupnormTestParams { 2, 16, 8, 8, 4, sizeof(half), MainGroupnormTest<half, true> }
+    ));
+
+TEST_P(groupnormTestSuite, GroupnormTestCase)
+{
+    auto param = GetParam();
+
+    uint32_t n = param.n;
+    uint32_t c = param.c;
+    uint32_t h = param.h;
+    uint32_t w = param.w;
+    uint32_t g = param.g;
+    uint32_t typeSize = param.typeSize;
+
+    uint32_t bshLength = n * c * h * w;
+    uint32_t bsLength = n * c / g;
+
+    uint8_t inputXGm[bshLength * typeSize] { 0x00 };
+    uint8_t gammGm[c * typeSize] { 0x00 };
+    uint8_t betaGm[c * typeSize] { 0x00 };
+
+    uint8_t outputGm[bshLength * typeSize] {0x00};
+    // uint8_t outputMeanGm[bsLength * typeSize] {0x00};
+    // uint8_t outputVarianceGm[bsLength * typeSize] {0x00};
+
+    param.cal_func(inputXGm, gammGm, betaGm, outputGm, n, c, h, w, g);
+
+    for (uint32_t i = 0; i < bshLength * typeSize; i++) {
+        EXPECT_EQ(outputGm[i], 0x00);
+    }
+}
\ No newline at end of file
diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp
index ec41be3a6f16c1445dd5f234d25ca81221eccb0e..1013bdcac23b29ad29716d3d8e4321a551392789 100644
--- a/tests/tiling/test_tiling.cpp
+++ b/tests/tiling/test_tiling.cpp
@@ -1396,6 +1396,28 @@ TEST_F(TestTiling, TestLayernormTiling)
     EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float));
 }
 
+TEST_F(TestTiling, TestGroupnormTiling)
+{
+    const uint32_t stackBufferSize = 100 * 1024;
+    const uint32_t typeSize = 4;
+    const uint32_t groupNum = 4;
+
+    std::vector<int64_t> shapeDims = { 16, 16, 8, 8 };
+    auto groupnormShape = ge::Shape(shapeDims);
+    const bool isReuseSource = false;
+    optiling::GroupNormTiling tilling;
+
+    uint32_t minValue = 0;
+    uint32_t maxValue = 0;
+
+    AscendC::GetGroupNormMaxMinTmpSize(groupnormShape, typeSize, isReuseSource, groupNum, maxValue, minValue);
+    EXPECT_EQ(maxValue, 3 * (16 * 16 * 8 * 8) * typeSize + 2 * groupNum * 16 * typeSize);
+    EXPECT_EQ(minValue, 3 * (16 / 4 * 8 * 8) * typeSize + 2 * groupNum * 16 * typeSize);
+
+    AscendC::GetGroupNormNDTilingInfo(groupnormShape, stackBufferSize, typeSize, isReuseSource, groupNum, tilling);
+    EXPECT_EQ(tilling.get_tmpBufSize(), stackBufferSize / sizeof(float));
+}
+
 TEST_F(TestTiling, TestRmsnormTiling)
 {
     constexpr uint32_t bLength = 4;