diff --git a/cmake/kernel_headers.cmake b/cmake/kernel_headers.cmake index 376dc2ba07493f721de1273b7ced06d4bd22223d..195863d2b194fbc88002b2a8b5e07634b7638856 100644 --- a/cmake/kernel_headers.cmake +++ b/cmake/kernel_headers.cmake @@ -196,3 +196,7 @@ file(CREATE_LINK ../activation/geglu_tiling_intf.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling_intf.h SYMBOLIC) file(CREATE_LINK ../activation/geglu_tiling.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling.h SYMBOLIC) +#initglobalmemory +file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/init_global_memory) +file(CREATE_LINK ../utils/init_global_memory.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/init_global_memory/init_global_memory.h SYMBOLIC) diff --git a/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h b/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h index 11ce1407789a550c5dbf9e3eeaaffe493fd8a3bb..aaf66a76acd7e2b8db4a3b8a6d68263300194fdb 100644 --- a/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h +++ b/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h @@ -29,10 +29,10 @@ class BasicBlockMatmulKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template __aicore__ inline void Process(AscendC::TPipe* pipe); - matmul::Matmul, - matmul::MatmulType, - matmul::MatmulType, - matmul::MatmulType, MM_CFG> matmulObj; + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, MM_CFG> matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, BasicBlockMatrixOffset& matrixOffset, diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index 
065f151cbf3218348ab4d7dcc40ec59f5a3aada0..8b98232b0fe9107d7ce8e469e2b91fbfd1114d5b 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -22,7 +22,7 @@ class BatchMatmulKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); - matmul::Matmul matmulObj; + AscendC::Matmul matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index 94206b3efc982492ca4661f50a8f7e299de3197e..6efea394e89347ac1870681718e1e72efb29776d 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -31,10 +31,10 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, { TCubeTiling tiling; CopyTiling(&tiling, tilingGm); - typedef matmul::MatmulType A_TYPE; - typedef matmul::MatmulType B_TYPE; - typedef matmul::MatmulType C_TYPE; - typedef matmul::MatmulType BIAS_TYPE; + typedef AscendC::MatmulType A_TYPE; + typedef AscendC::MatmulType B_TYPE; + typedef AscendC::MatmulType C_TYPE; + typedef AscendC::MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; AscendC::TPipe pipe; tiling.shareMode = 0; // 0, share mode diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index 
c55caba16e0cc356523fc44b7dc8c61a0f27c201..056cd2f5b70b90060169ba6e04d5711b6f870a22 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -17,10 +17,10 @@ constexpr int32_t FULL_L0C_SIZE = 128 * 1024; extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - typedef matmul::MatmulType A_TYPE; - typedef matmul::MatmulType B_TYPE; - typedef matmul::MatmulType C_TYPE; - typedef matmul::MatmulType BIAS_TYPE; + typedef AscendC::MatmulType A_TYPE; + typedef AscendC::MatmulType B_TYPE; + typedef AscendC::MatmulType C_TYPE; + typedef AscendC::MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; AscendC::TPipe pipe; tilingData.cubeTilingData.shareMode = 0; // 0, share mode diff --git a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h index 6f85560549f5a1a7274538d03d0d32ba95aae833..a182606025c370d650ef8ff45d4a75c13a425622 100644 --- a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h +++ b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h @@ -20,10 +20,10 @@ class MatmulKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template __aicore__ inline void Process(AscendC::TPipe* pipe); - matmul::Matmul, - matmul::MatmulType, - matmul::MatmulType, - matmul::MatmulType, CFG_MDL> matmulObj; + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, diff --git a/examples/normalization/welford_update/README.md b/examples/normalization/welford_update/README.md new file mode 
100644 index 0000000000000000000000000000000000000000..eaa60bc71fe98647905e2eb6b5a7a3e15e1983a8 --- /dev/null +++ b/examples/normalization/welford_update/README.md @@ -0,0 +1,70 @@ + + +## 概述 + +本样例介绍了调用WelfordUpdate高阶API实现welford_update单算子,并按照不同的算子调用方式分别给出了对应的端到端实现。 + +- 直调:使用核函数直调WelfordUpdate自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。 + +- 框架调用:使用框架调用welford_update自定义算子。 + + 按照工程创建->算子实现->编译部署>算子调用的流程完成算子开发。整个过程都依赖于算子工程:基于工程代码框架完成算子核函数的开发和Tiling实现,通过工程编译脚本完成算子的编译部署,继而实现单算子调用或第三方框架中的算子调用。 + +本样例中包含如下调用方式: + +| 调用方式 | 目录 | **描述** | +| --------- | ------------------------------------------------------------ | ---------------------------------------------------------- | +| 直调 | [kernel_launch_method_by_direct](./kernel_launch_method_by_direct) | host侧的核函数调用程序,包含CPU侧、NPU侧、仿真侧三种运行验证方法。 | +| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用welford_update算子。 | + +## 样例支持的产品型号为: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 +- Atlas推理系列产品AI Core +- Atlas A3 训练系列产品 + +## 目录结构 + +| 目录 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [kernel_launch_method_by_direct](./kernel_launch_method_by_direct) | 通过kernel直调的方式调用自定义算子工程样例目录 | +| [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用自定义算子工程样例目录 | +| [host_tiling](./host_tiling) | 本样例tiling代码实现 | +| [kernel_impl](./kernel_impl) | 本样例kernel侧代码实现 | + +## 算子描述 + +welford_update单算子,对输入tensor做WelfordUpdate计算。Welford是一种在线计算均值和方差的方法。一方面,它可以在不存储所有样本的情况下,逐步计算所有样本的均值和方差,更适合处理海量数据;另一方面,它只需要对数据进行一次遍历,能减少访存次数,提高计算性能。WelfordUpdate接口为Welford算法的前处理。 + +welford_update算子规格: + + + + + + + + + + + + + + + +
<tr><td align="center">算子类型(OpType)</td><td colspan="4" align="center">WelfordUpdateCustom</td></tr>
<tr><td rowspan="4" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
<tr><td align="center">srcGm</td><td align="center">1*64</td><td align="center">half</td><td align="center">ND</td></tr>
<tr><td align="center">inMeanGm</td><td align="center">1*64</td><td align="center">half</td><td align="center">ND</td></tr>
<tr><td align="center">inVarGm</td><td align="center">1*64</td><td align="center">half</td><td align="center">ND</td></tr>
<tr><td rowspan="2" align="center">算子输出</td><td align="center">outMeanGm</td><td align="center">1*64</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">outVarGm</td><td align="center">1*64</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">核函数名</td><td colspan="4" align="center">welford_update_custom</td></tr>
+ +## 算子实现介绍 + +本样例实现了welford_update算子。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用WelfordUpdate高阶API接口完成welford_update计算,得到最终结果,再搬出到外部存储上。 + + welford_update算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行welford_update计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + welford_update算子的tiling实现流程如下:首先获取welford_update接口能完成计算所需最大/最小临时空间大小,根据该范围结合实际的内存使用情况设置合适的空间大小,然后根据输入长度dataLength确定所需tiling参数。 \ No newline at end of file diff --git a/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..ef4335a3f3c62e6f66b142877586f84db95a22fc --- /dev/null +++ b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#define EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(WelfordUpdateCustomTilingData) + TILING_DATA_FIELD_DEF(bool, inplace); + TILING_DATA_FIELD_DEF(uint32_t, nLength); + TILING_DATA_FIELD_DEF(uint32_t, rLength); + TILING_DATA_FIELD_DEF(uint32_t, abComputeLength); + TILING_DATA_FIELD_DEF(float, nRec); + TILING_DATA_FIELD_DEF(uint32_t, tmpLocalSize); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(WelfordUpdateCustom, WelfordUpdateCustomTilingData) +} // namespace optiling + +constexpr bool ISREUSESOURCE = false; +constexpr bool ISINPLACE = true; + +void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, + float nRec, optiling::WelfordUpdateCustomTilingData &tiling) +{ + std::vector shapeVec = {nLength, rLength}; + ge::Shape srcShape(shapeVec); + uint32_t maxsize = 0; + uint32_t minsize = 0; + uint32_t dtypesizeT = 2; // half类型 + uint32_t dtypesizeU = 4; // float类型 + + tiling.set_inplace(inplace); + tiling.set_nLength(nLength); + tiling.set_rLength(rLength); + tiling.set_abComputeLength(abComputeLength); + tiling.set_nRec(nRec); + + AscendC::GetWelfordUpdateMaxMinTmpSize(srcShape, dtypesizeT, dtypesizeU, ISREUSESOURCE, ISINPLACE, maxsize, + minsize); + tiling.set_tmpLocalSize(minsize); +} + +#endif // EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H diff --git a/examples/normalization/welford_update/kernel_impl/welford_update_custom.h b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..19028d91e778fad692d03a58a99741e8417275c3 --- /dev/null +++ b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h @@ -0,0 +1,154 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the + * "License"). Please refer to the License for details. You may not use this + * file except in compliance with the License. THIS SOFTWARE IS PROVIDED ON AN + * "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS + * FOR A PARTICULAR PURPOSE. See LICENSE in the root of the software repository + * for the full text of the License. + */ + +#ifndef EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_H +#define EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_H +#include "kernel_operator.h" + +namespace MyCustomKernel { +struct VecTiling { + bool inplace; + uint32_t nLength; + uint32_t rLength; + uint32_t abComputeLength; + float nRec; + uint32_t tmpLocalSize; +}; + +constexpr AscendC::WelfordUpdateConfig WELFORD_UPDATE_ENABLE_INPLACE_CFG = {true}; +constexpr AscendC::WelfordUpdateConfig WELFORD_UPDATE_UNENABLE_INPLACE_CFG = {false}; +constexpr uint8_t LOCAL_BYTES = 32; + +template +class KernelWelfordUpdate { +public: + __aicore__ inline KernelWelfordUpdate() {} + __aicore__ inline void Init(GM_ADDR inputX_gm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR outputMean_gm, + GM_ADDR outputVar_gm, VecTiling tilingData) { + nLength = tilingData.nLength; + rLength = tilingData.rLength; + abComputeLength = tilingData.abComputeLength; + nRec = tilingData.nRec; + bshLength = tilingData.nLength * tilingData.rLength; + inplace = tilingData.inplace; + tmpLocalBytes = tilingData.tmpLocalSize; + + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputX_gm), bshLength); + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMean_gm), bshLength); + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVar_gm), bshLength); + + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMean_gm), bshLength); + 
outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVar_gm), bshLength); + + pipe.InitBuffer(inQueueX, 1, sizeof(T) * bshLength); + pipe.InitBuffer(inQueueMean, 1, sizeof(U) * bshLength); + pipe.InitBuffer(inQueueVar, 1, sizeof(U) * bshLength); + pipe.InitBuffer(outQueueMean, 1, sizeof(U) * bshLength); + pipe.InitBuffer(outQueueVar, 1, sizeof(U) * bshLength); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor inputXLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor inMeanLocal = inQueueMean.AllocTensor(); + AscendC::LocalTensor inVarLocal = inQueueVar.AllocTensor(); + + AscendC::DataCopy(inputXLocal, inputX_global, bshLength); + AscendC::DataCopy(inMeanLocal, inputMean_global, bshLength); + AscendC::DataCopy(inVarLocal, inputVar_global, bshLength); + + inQueueX.EnQue(inputXLocal); + inQueueMean.EnQue(inMeanLocal); + inQueueVar.EnQue(inVarLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor inputXLocal = inQueueX.DeQue(); + AscendC::LocalTensor inMeanLocal = inQueueMean.DeQue(); + AscendC::LocalTensor inVarLocal = inQueueVar.DeQue(); + + AscendC::LocalTensor outMeanLocal = outQueueMean.AllocTensor(); + AscendC::LocalTensor outVarLocal = outQueueVar.AllocTensor(); + + struct AscendC::WelfordUpdateParam para = {nLength, rLength, abComputeLength, nRec}; + if (!tmpLocal) { + if (inplace) { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, inVarLocal, inputXLocal, para); + } else { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, inVarLocal, inputXLocal, para); + } + } else { + if (tmpLocalBytes % LOCAL_BYTES != 0) { + tmpLocalBytes = (tmpLocalBytes + LOCAL_BYTES - 1) / LOCAL_BYTES * LOCAL_BYTES; + } + pipe.InitBuffer(tmpLocalBuf, tmpLocalBytes); + AscendC::LocalTensor tmpLocalTensor = tmpLocalBuf.Get(); + if (inplace) { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, 
inVarLocal, inputXLocal, tmpLocalTensor, para); + } else { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, inVarLocal, inputXLocal, tmpLocalTensor, para); + } + } + + inQueueX.FreeTensor(inputXLocal); + inQueueMean.FreeTensor(inMeanLocal); + inQueueVar.FreeTensor(inVarLocal); + + outQueueMean.EnQue(outMeanLocal); + outQueueVar.EnQue(outVarLocal); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor outMeanLocal = outQueueMean.DeQue(); + AscendC::LocalTensor outVarLocal = outQueueVar.DeQue(); + + AscendC::DataCopy(outputMean_global, outMeanLocal, bshLength); + AscendC::DataCopy(outputVar_global, outVarLocal, bshLength); + + outQueueMean.FreeTensor(outMeanLocal); + outQueueVar.FreeTensor(outVarLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueMean; + AscendC::TQue inQueueVar; + AscendC::TQue outQueueMean; + AscendC::TQue outQueueVar; + + AscendC::GlobalTensor inputX_global; + AscendC::GlobalTensor inputMean_global; + AscendC::GlobalTensor inputVar_global; + AscendC::GlobalTensor outputMean_global; + AscendC::GlobalTensor outputVar_global; + AscendC::TBuf tmpLocalBuf; + + uint32_t tmpLocalBytes = 0; + uint32_t nLength; + uint32_t rLength; + uint32_t abComputeLength; + float nRec; + uint32_t bshLength; + bool inplace; +}; + +} // namespace MyCustomKernel + +#endif // EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_H diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_direct/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..239d5bf2b6d5d779ece664faac2a592861eaf25f --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION 
"Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/welford_update_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(welford_update_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/welford_update_custom_tiling.cpp +) + +target_compile_options(welford_update_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(welford_update_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(welford_update_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(welford_update_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS welford_update_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/README.md 
b/examples/normalization/welford_update/kernel_launch_method_by_direct/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2cf1f879714cbab08baed33b2971565159c977cd --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/README.md @@ -0,0 +1,50 @@ + + +## 概述 + +本样例基于Kernel直调算子工程,介绍了调用WelfordUpdate高阶API实现welford_update单算子,主要演示WelfordUpdate高阶API在Kernel直调工程中的调用。 + +## 目录结构介绍 +| 目录及文件 | 描述 | +|---------------------|----------------------| +| [cmake](./cmake) | 编译工程文件 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| welford_update_custom.cpp | 算子kernel实现 | +| welford_update_custom_tiling.cpp | 算子tiling实现 | +| run.sh | 编译执行脚本 | +| CMakeLists.txt | 编译工程文件 | + + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN包的存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + ``` + 若执行sim仿真,可自行配置仿真日志文件目录,默认仿真日志会在build目录下生成。若需要详细了解sim仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的 工具使用 章节。 + ``` + # 设置仿真模式日志生成目录(可选),需要自行确保设置的目录已存在。若设置为相对路径下的目录,则以程序执行时的目录作为当前目录。例如,执行如下设置时,需要确保./目录下存在xxx目录 + export CAMODEL_LOG_PATH=./xxx + ``` + + - 生成输入和真值 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + 其中cmake参数说明如下: + - RUN_MODE :编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim/ npu] + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。 + + 示例如下: + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/cpu_lib.cmake b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ 
b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/npu_lib.cmake b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..98413a61adc75e01ac5967f9c61d66e174777237 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/npu_lib.cmake @@ -0,0 +1,19 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING + ) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/main.cpp 
b/examples/normalization/welford_update/kernel_launch_method_by_direct/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a2dd7870535cedbf94f3a550453c5383154d71bb --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/main.cpp @@ -0,0 +1,208 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../../common/data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void welford_update_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *srcGm, uint8_t *inMeanGm, + uint8_t *inVarGm, uint8_t *outMeanGm, uint8_t *outVarGm, uint8_t *workspace, uint8_t *tiling); +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void welford_update_custom(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, + GM_ADDR outMeanGm, GM_ADDR outVarGm, GM_ADDR workspace, GM_ADDR tiling); +#endif +constexpr uint8_t BLOCK_DIM = 1; +constexpr uint32_t TILINGDATA_SIZE = 6; +constexpr uint32_t WORKSPACE_SIZE = 1024 * 1024; + +constexpr bool ISINPLACE = true; +constexpr uint8_t RN_SIZE = 1; +constexpr uint32_t AB_SIZE = 64; +constexpr uint32_t AB_LENGTH = 35; +constexpr float NREC = 1.0 / 8; + +extern uint8_t *GenerateTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, + float nRec); + +static bool CompareResult(const void *outputData, int64_t outSize, std::string goldenName) +{ + void *goldenData; +#ifdef 
ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden_" + goldenName + ".bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden_%s.bin success!\n", goldenName.c_str()); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden_%s.bin failed output is %lf, golden is %lf\n", goldenName.c_str(), a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden_%s.bin success!\n", goldenName.c_str()); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = BLOCK_DIM; + size_t inputSrcSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t inputMeanSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t inputVarSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t outputMeanSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t outputVarSize = RN_SIZE * AB_SIZE * sizeof(float); + + size_t workspaceSize = WORKSPACE_SIZE; + size_t tilingFileSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *inputSrc = (uint8_t *)AscendC::GmAlloc(inputSrcSize); + uint8_t *inputMean = (uint8_t *)AscendC::GmAlloc(inputMeanSize); + uint8_t *inputVar = (uint8_t *)AscendC::GmAlloc(inputVarSize); + uint8_t *outputMean = (uint8_t *)AscendC::GmAlloc(outputMeanSize); + uint8_t *outputVar = (uint8_t *)AscendC::GmAlloc(outputVarSize); + + uint8_t *workspace = (uint8_t 
*)AscendC::GmAlloc(workspaceSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingFileSize); + + ReadFile("../input/input_srcGm.bin", inputSrcSize, inputSrc, inputSrcSize); + ReadFile("../input/input_inMeanGm.bin", inputMeanSize, inputMean, inputMeanSize); + ReadFile("../input/input_inVarGm.bin", inputVarSize, inputVar, inputVarSize); + + memcpy_s(tiling, tilingFileSize, GenerateTiling(ISINPLACE, RN_SIZE, AB_SIZE, AB_LENGTH, NREC), tilingFileSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(welford_update_custom, blockDim, inputSrc, inputMean, inputVar, outputMean, outputVar, workspace, + tiling); + + WriteFile("../output/output_outMeanGm.bin", outputMean, outputMeanSize); + WriteFile("../output/output_outVarGm.bin", outputVar, outputVarSize); + + bool goldenResult = true; + goldenResult &= CompareResult(outputMean, outputMeanSize, "outMeanGm"); + goldenResult &= CompareResult(outputVar, outputVarSize, "outVarGm"); + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + + AscendC::GmFree((void *)inputSrc); + AscendC::GmFree((void *)inputMean); + AscendC::GmFree((void *)inputVar); + AscendC::GmFree((void *)outputMean); + AscendC::GmFree((void *)outputVar); + AscendC::GmFree((void *)workspace); + AscendC::GmFree((void *)tiling); + +#else + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *srcHost, *inMeanHost, *inVarHost, *outMeanHost, *outVarHost, *workspaceHost; + uint8_t *srcDevice, *inMeanDevice, *inVarDevice, *outMeanDevice, *outVarDevice, *workspaceDevice, *tilingDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&srcHost), inputSrcSize)); + CHECK_ACL(aclrtMallocHost((void **)(&inMeanHost), inputMeanSize)); + CHECK_ACL(aclrtMallocHost((void **)(&inVarHost), inputVarSize)); + 
CHECK_ACL(aclrtMallocHost((void **)(&outMeanHost), outputMeanSize)); + CHECK_ACL(aclrtMallocHost((void **)(&outVarHost), outputVarSize)); + CHECK_ACL(aclrtMallocHost((void **)(&workspaceHost), workspaceSize)); + + CHECK_ACL(aclrtMalloc((void **)&srcDevice, inputSrcSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&inMeanDevice, inputMeanSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&inVarDevice, inputVarSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&outMeanDevice, outputMeanSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&outVarDevice, outputVarSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_srcGm.bin", inputSrcSize, srcHost, inputSrcSize); + ReadFile("../input/input_inMeanGm.bin", inputMeanSize, inMeanHost, inputMeanSize); + ReadFile("../input/input_inVarGm.bin", inputVarSize, inVarHost, inputVarSize); + + CHECK_ACL(aclrtMemcpy(workspaceDevice, workspaceSize, workspaceHost, workspaceSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, GenerateTiling(ISINPLACE, RN_SIZE, AB_SIZE, AB_LENGTH, NREC), + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + CHECK_ACL(aclrtMemcpy(srcDevice, inputMeanSize, srcHost, inputMeanSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(inMeanDevice, inputMeanSize, inMeanHost, inputMeanSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(inVarDevice, inputVarSize, inVarHost, inputVarSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + welford_update_custom_do(blockDim, nullptr, stream, srcDevice, inMeanDevice, inVarDevice, outMeanDevice, + outVarDevice, workspaceDevice, tilingDevice); + + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(outMeanHost, outputMeanSize, outMeanDevice, outputMeanSize, 
ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(outVarHost, outputVarSize, outVarDevice, outputVarSize, ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("../output/output_outMeanGm.bin", outMeanHost, outputMeanSize); + WriteFile("../output/output_outVarGm.bin", outVarHost, outputVarSize); + + bool goldenResult = true; + goldenResult &= CompareResult(outMeanHost, outputMeanSize, "outMeanGm"); + goldenResult &= CompareResult(outVarHost, outputVarSize, "outVarGm"); + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + + CHECK_ACL(aclrtFree(srcDevice)); + CHECK_ACL(aclrtFree(inMeanDevice)); + CHECK_ACL(aclrtFree(inVarDevice)); + CHECK_ACL(aclrtFree(outMeanDevice)); + CHECK_ACL(aclrtFree(outVarDevice)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + + CHECK_ACL(aclrtFreeHost(srcHost)); + CHECK_ACL(aclrtFreeHost(inMeanHost)); + CHECK_ACL(aclrtFreeHost(inVarHost)); + CHECK_ACL(aclrtFreeHost(outMeanHost)); + CHECK_ACL(aclrtFreeHost(outVarHost)); + CHECK_ACL(aclrtFreeHost(workspaceHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/run.sh b/examples/normalization/welford_update/kernel_launch_method_by_direct/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..f8305ad320d1d68e09a8a1d825808e077c4d06cb --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/run.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac 
+done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH + + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=./ # default log save in build dir + else + export CAMODEL_LOG_PATH=../$CAMODEL_LOG_PATH + rm -rf $CAMODEL_LOG_PATH + mkdir -p $CAMODEL_LOG_PATH + fi +fi + +if [ "${RUN_MODE}" = "cpu" ]; then + export CAMODEL_LOG_PATH=./ # cpu run mode set fixed log path +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 +./welford_update_direct_kernel_op \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/scripts/gen_data.py b/examples/normalization/welford_update/kernel_launch_method_by_direct/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f65af46cbbeeeb50bf8f8c9e0a727761512c7a31 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/scripts/gen_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +RN_SIZE = 1 +AB_SIZE = 64 +AB_LENGTH = 35 +NREC = 1.0 / 8 + +def gen_golden_data_simple(): + x1 = np.random.uniform(1, 100, [RN_SIZE * AB_SIZE]).astype(np.float16) + x2 = np.random.uniform(-60000, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + x3 = np.random.uniform(0, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + golden1 = x2.copy() + golden2 = x3.copy() + + for i in range(AB_LENGTH): + n = np.float32(NREC) + golden1[i] = x2[i] + (x1[i] - x2[i]) * n + golden2[i] = x3[i] + (x1[i] - x2[i]) * (x1[i] - golden1[i]) + + os.system("mkdir -p ./input") + x1.tofile("./input/input_srcGm.bin") + x2.tofile("./input/input_inMeanGm.bin") + x3.tofile("./input/input_inVarGm.bin") + os.system("mkdir -p ./output") + golden1.tofile("./output/golden_outMeanGm.bin") + golden2.tofile("./output/golden_outVarGm.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom.cpp b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a486b65e87cc2394444321514e765aee5fdc9a82 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom.cpp @@ -0,0 +1,45 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "kernel_operator.h" +#include "../kernel_impl/welford_update_custom.h" + +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast<uint32_t*>(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (uint32_t i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +extern "C" __global__ __aicore__ void welford_update_custom(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, + GM_ADDR outMeanGm, GM_ADDR outVarGm, GM_ADDR workspace, GM_ADDR tiling) +{ + if ASCEND_IS_AIC { + return; + } + MyCustomKernel::KernelWelfordUpdate op; + MyCustomKernel::VecTiling tilingData; + CopyTiling(&tilingData, tiling); + op.Init(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, tilingData); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +// call of kernel function +void welford_update_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *srcGm, uint8_t *inMeanGm, + uint8_t *inVarGm, uint8_t *outMeanGm, uint8_t *outVarGm, uint8_t *workspace, uint8_t *tiling) +{ + welford_update_custom<<<blockDim, l2ctrl, stream>>>(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, workspace, + tiling); +} +#endif diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom_tiling.cpp b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9ff63ec66c63c1b5f9cef530f248211bcbed455b --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom_tiling.cpp @@ -0,0
+1,30 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include "tiling/tiling_api.h" +#include "../host_tiling/welford_update_custom_tiling.h" + +uint8_t *GetTilingBuf(optiling::WelfordUpdateCustomTilingData *tilingData) { + uint32_t tilingSize = sizeof(optiling::WelfordUpdateCustomTilingData); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} +uint8_t* GenerateTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, float nRec) +{ + optiling::WelfordUpdateCustomTilingData tiling; + ComputeTiling(inplace, nLength, rLength, abComputeLength, nRec, tiling); + return GetTilingBuf(&tiling); +} \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..584132d80993d309434fb1303de83910a1989aba --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakeLists.txt @@ -0,0 +1,69 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) +if(ENABLE_CROSS_COMPILE) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL x86_64) + set(CROSS_COMPILE_PLATFORM aarch64) + else() + set(CROSS_COMPILE_PLATFORM x86_64) + endif() + set(PLATFORM 
${CMAKE_SYSTEM_PROCESSOR}) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/linux/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_SYSTEM_PROCESSOR ${CROSS_COMPILE_PLATFORM}) + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${CMAKE_CROSS_PLATFORM_COMPILER}) +else() + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) +endif() + +include(cmake/config.cmake) +include(cmake/func.cmake) +include(cmake/intf.cmake) + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) + add_subdirectory(framework) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) + add_subdirectory(op_host) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) + add_subdirectory(op_kernel) +endif() +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# modify vendor_name in install.sh and upgrade.sh +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/scripts + COMMAND cp -r ${CMAKE_SOURCE_DIR}/scripts/* ${CMAKE_BINARY_DIR}/scripts/ + COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/* +) +add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh) +install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . 
FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ) + +install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL) + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target(gen_version_info ALL + COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR} +) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info + DESTINATION packages/vendors/${vendor_name}/) + +# CPack config +set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) +set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) +set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") +set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) +set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") +set(CPACK_GENERATOR External) +set(CPACK_CMAKE_GENERATOR "Unix Makefiles") +set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) +set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) +set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}) +include(CPack) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakePresets.json b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakePresets.json new file mode 100644 index 0000000000000000000000000000000000000000..e56e9011dff02062a1fe85fc32c85e0205c65b24 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakePresets.json @@ -0,0 +1,63 @@ +{ + "version": 1, + "cmakeMinimumRequired": { + "major": 3, + "minor": 19, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "displayName": "Default Config", + "description": "Default build using Unix Makefiles generator", + "generator": "Unix Makefiles", + "binaryDir": "${sourceDir}/build_out", + "cacheVariables": { + "CMAKE_BUILD_TYPE": { + "type": "STRING", + "value": 
"Release" + }, + "ENABLE_SOURCE_PACKAGE": { + "type": "BOOL", + "value": "True" + }, + "ENABLE_BINARY_PACKAGE": { + "type": "BOOL", + "value": "True" + }, + "ASCEND_COMPUTE_UNIT": { + "type": "STRING", + "value": "ascend310p;ascend910b" + }, + "ENABLE_TEST": { + "type": "BOOL", + "value": "True" + }, + "vendor_name": { + "type": "STRING", + "value": "customize" + }, + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "~/Ascend/ascend-toolkit/latest" + }, + "ASCEND_PYTHON_EXECUTABLE": { + "type": "STRING", + "value": "python3" + }, + "CMAKE_INSTALL_PREFIX": { + "type": "PATH", + "value": "${sourceDir}/build_out" + }, + "ENABLE_CROSS_COMPILE": { + "type": "BOOL", + "value": "False" + }, + "CMAKE_CROSS_PLATFORM_COMPILER": { + "type": "PATH", + "value": "/usr/bin/aarch64-linux-gnu-g++" + } + } + } + ] +} \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/README.md b/examples/normalization/welford_update/kernel_launch_method_by_framework/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8b189f7f96961df1c95bd72f02cda3107b5774bc --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/README.md @@ -0,0 +1,80 @@ + + +## 概述 + +本样例基于自定义算子工程,介绍了调用WelfordUpdate高阶API实现welford_update单算子,主要演示WelfordUpdate高阶API在自定义算子工程中的调用。 + +## 目录结构 +| 目录 | 描述 | +|---------------------|----------------------| +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | host侧实现文件 | +| [op_kernel](./op_kernel) | kernel侧实现文件 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| [testcases](./testcases) | 包含cpu域以及npu域的用例主函数,以及真值校验函数 | +| build.sh | 编译运行算子的脚本 | +| CMakeLists.txt | 编译工程文件 | +| CMakePresets.json | 编译工程配置文件 | + +## 编译运行样例 + +## 1.配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN包的存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` +### 2.生成输入和真值 + ``` + python3 
scripts/gen_data.py + ``` + +### 3.编译算子工程 + + - 修改CMakePresets.json中ASCEND_CANN_PACKAGE_PATH为CANN软件包安装后的实际路径。 + + + ``` + { + …… + "configurePresets": [ + { + …… + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "~/Ascend/ascend-toolkit/latest" //请替换为CANN软件包安装后的实际路径。eg:/home/HwHiAiUser/Ascend/ascend-toolkit/latest + }, + …… + } + ] + } + ``` + - 在当前算子工程目录下执行如下命令,进行算子工程编译。 + + ``` + bash build.sh + ``` + 编译成功后,会在当前目录下创建build_out目录,并在build_out目录下生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + + +### 4.部署算子包 + +执行如下命令,在自定义算子安装包所在路径下,安装自定义算子包。 + +``` +cd build_out +./custom_opp__.run +``` + +命令执行成功后,自定义算子包中的相关文件将部署至当前环境的OPP算子库的vendors/customize目录中。 + +### 5.执行样例 +在build_out目录下执行如下命令 + +``` +./welford_update_custom_npu +``` + +### 注意事项 +本样例工程会自动识别执行的硬件平台,无需单独设置SOC_VERSION \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/build.sh b/examples/normalization/welford_update/kernel_launch_method_by_framework/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..6f3ab96208740ee1ddcd51c739a9adea2fe9bc52 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/build.sh @@ -0,0 +1,76 @@ +#!/bin/bash +script_path=$(realpath $(dirname $0)) + +source $ASCEND_HOME_DIR/bin/setenv.bash +cp -rf ../host_tiling/* op_host/ +ln -s $ASCEND_HOME_DIR/tools/op_project_templates/ascendc/customize/cmake/util/ ./cmake/util +mkdir -p build_out +rm -rf build_out/* +cd build_out + +opts=$(python3 $script_path/cmake/util/preset_parse.py $script_path/CMakePresets.json) +ENABLE_CROSS="-DENABLE_CROSS_COMPILE=True" +ENABLE_BINARY="-DENABLE_BINARY_PACKAGE=True" +cmake_version=$(cmake --version | grep "cmake version" | awk '{print $3}') + +cmake_run_package() +{ + target=$1 + cmake --build . --target $target -j16 + if [ $? -ne 0 ]; then exit 1; fi + + if [ $target = "package" ]; then + if test -d ./op_kernel/binary ; then + ./cust*.run + if [ $? 
-ne 0 ]; then exit 1; fi + cmake --build . --target binary -j16 + if [ $? -ne 0 ]; then exit 1; fi + cmake --build . --target $target -j16 + fi + fi +} + +if [[ $opts =~ $ENABLE_CROSS ]] && [[ $opts =~ $ENABLE_BINARY ]] +then + target=package + if [ "$1"x != ""x ]; then target=$1; fi + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake .. $opts -DENABLE_CROSS_COMPILE=0 + else + cmake .. --preset=default -DENABLE_CROSS_COMPILE=0 + fi + cmake_run_package $target + cp -r kernel ../ + rm -rf * + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake .. $opts + else + cmake .. --preset=default + fi + + cmake --build . --target $target -j16 + if [ $? -ne 0 ]; then exit 1; fi + if [ $target = "package" ]; then + if test -d ./op_kernel/binary ; then + ./cust*.run + fi + fi + rm -rf ../kernel + +else + target=package + if [ "$1"x != ""x ]; then target=$1; fi + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake .. $opts + else + cmake .. --preset=default + fi + cmake_run_package $target +fi + + +# for debug +# cd build_out +# make +# cpack +# verbose append -v \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/config.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/config.cmake new file mode 100644 index 0000000000000000000000000000000000000000..886119daadd85495676c07dfb0b629e3deab8ccf --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/config.cmake @@ -0,0 +1,25 @@ + +set(CMAKE_CXX_FLAGS_DEBUG "") +set(CMAKE_CXX_FLAGS_RELEASE "") + +if (NOT DEFINED vendor_name) + set(vendor_name customize CACHE STRING "") +endif() +if (NOT DEFINED ASCEND_CANN_PACKAGE_PATH) + set(ASCEND_CANN_PACKAGE_PATH /usr/local/Ascend/latest CACHE PATH "") +endif() +if (NOT DEFINED ASCEND_PYTHON_EXECUTABLE) + set(ASCEND_PYTHON_EXECUTABLE python3 CACHE STRING "") +endif() +if (NOT DEFINED ASCEND_COMPUTE_UNIT) + message(FATAL_ERROR "ASCEND_COMPUTE_UNIT not set 
in CMakePreset.json ! +") +endif() +set(ASCEND_TENSOR_COMPILER_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler) +set(ASCEND_CCEC_COMPILER_PATH ${ASCEND_TENSOR_COMPILER_PATH}/ccec_compiler/bin) +set(ASCEND_AUTOGEN_PATH ${CMAKE_BINARY_DIR}/autogen) +set(ASCEND_FRAMEWORK_TYPE tensorflow) +file(MAKE_DIRECTORY ${ASCEND_AUTOGEN_PATH}) +set(CUSTOM_COMPILE_OPTIONS "custom_compile_options.ini") +execute_process(COMMAND rm -rf ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} + COMMAND touch ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS}) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/func.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/func.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4179dfd25b41487d5aaf1ac95459543e26ab4fff --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/func.cmake @@ -0,0 +1,192 @@ + +function(get_system_info SYSTEM_INFO) + if (UNIX) + execute_process(COMMAND grep -i ^id= /etc/os-release OUTPUT_VARIABLE TEMP) + string(REGEX REPLACE "\n|id=|ID=|\"" "" SYSTEM_NAME ${TEMP}) + set(${SYSTEM_INFO} ${SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR} PARENT_SCOPE) + elseif (WIN32) + message(STATUS "System is Windows. 
Only for pre-build.") + else () + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not support.") + endif () +endfunction() + +function(opbuild) + message(STATUS "Opbuild generating sources") + cmake_parse_arguments(OPBUILD "" "OUT_DIR;PROJECT_NAME;ACCESS_PREFIX" "OPS_SRC" ${ARGN}) + execute_process(COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++11 ${OPBUILD_OPS_SRC} -D_GLIBCXX_USE_CXX11_ABI=0 + -I ${ASCEND_CANN_PACKAGE_PATH}/include -L ${ASCEND_CANN_PACKAGE_PATH}/lib64 -lexe_graph -lregister -ltiling_api + -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("build ops lib info: ${EXEC_INFO}") + message("build ops lib error: ${EXEC_ERROR}") + message(FATAL_ERROR "opbuild run failed!") + endif() + set(proj_env "") + set(prefix_env "") + if (NOT "${OPBUILD_PROJECT_NAME}x" STREQUAL "x") + set(proj_env "OPS_PROJECT_NAME=${OPBUILD_PROJECT_NAME}") + endif() + if (NOT "${OPBUILD_ACCESS_PREFIX}x" STREQUAL "x") + set(prefix_env "OPS_DIRECT_ACCESS_PREFIX=${OPBUILD_ACCESS_PREFIX}") + endif() + execute_process(COMMAND ${proj_env} ${prefix_env} ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + ${OPBUILD_OUT_DIR}/libascend_all_ops.so ${OPBUILD_OUT_DIR} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("opbuild ops info: ${EXEC_INFO}") + message("opbuild ops error: ${EXEC_ERROR}") + endif() + message(STATUS "Opbuild generating sources - done") +endfunction() + +function(add_ops_info_target) + cmake_parse_arguments(OPINFO "" "TARGET;OPS_INFO;OUTPUT;INSTALL_DIR" "" ${ARGN}) + get_filename_component(opinfo_file_path "${OPINFO_OUTPUT}" DIRECTORY) + add_custom_command(OUTPUT ${OPINFO_OUTPUT} + COMMAND mkdir -p ${opinfo_file_path} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py + ${OPINFO_OPS_INFO} ${OPINFO_OUTPUT} + ) + add_custom_target(${OPINFO_TARGET} 
ALL + DEPENDS ${OPINFO_OUTPUT} + ) + install(FILES ${OPINFO_OUTPUT} + DESTINATION ${OPINFO_INSTALL_DIR} + ) +endfunction() + +function(add_ops_compile_options OP_TYPE) + cmake_parse_arguments(OP_COMPILE "" "OP_TYPE" "COMPUTE_UNIT;OPTIONS" ${ARGN}) + file(APPEND ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} + "${OP_TYPE},${OP_COMPILE_COMPUTE_UNIT},${OP_COMPILE_OPTIONS}\n") +endfunction() + +function(add_ops_impl_target) + cmake_parse_arguments(OPIMPL "" "TARGET;OPS_INFO;IMPL_DIR;OUT_DIR;INSTALL_DIR" "OPS_BATCH;OPS_ITERATE" ${ARGN}) + add_custom_command(OUTPUT ${OPIMPL_OUT_DIR}/.impl_timestamp + COMMAND mkdir -m 700 -p ${OPIMPL_OUT_DIR}/dynamic + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_impl_build.py + ${OPIMPL_OPS_INFO} + \"${OPIMPL_OPS_BATCH}\" \"${OPIMPL_OPS_ITERATE}\" + ${OPIMPL_IMPL_DIR} + ${OPIMPL_OUT_DIR}/dynamic + ${ASCEND_AUTOGEN_PATH} + + COMMAND rm -rf ${OPIMPL_OUT_DIR}/.impl_timestamp + COMMAND touch ${OPIMPL_OUT_DIR}/.impl_timestamp + DEPENDS ${OPIMPL_OPS_INFO} + ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_impl_build.py + ) + add_custom_target(${OPIMPL_TARGET} ALL + DEPENDS ${OPIMPL_OUT_DIR}/.impl_timestamp) + if (${ENABLE_SOURCE_PACKAGE}) + install(DIRECTORY ${OPIMPL_OUT_DIR}/dynamic + DESTINATION ${OPIMPL_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_npu_support_target) + cmake_parse_arguments(NPUSUP "" "TARGET;OPS_INFO_DIR;OUT_DIR;INSTALL_DIR" "" ${ARGN}) + get_filename_component(npu_sup_file_path "${NPUSUP_OUT_DIR}" DIRECTORY) + add_custom_command(OUTPUT ${NPUSUP_OUT_DIR}/npu_supported_ops.json + COMMAND mkdir -p ${NPUSUP_OUT_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/util/gen_ops_filter.sh + ${NPUSUP_OPS_INFO_DIR} + ${NPUSUP_OUT_DIR} + ) + add_custom_target(npu_supported_ops ALL + DEPENDS ${NPUSUP_OUT_DIR}/npu_supported_ops.json + ) + install(FILES ${NPUSUP_OUT_DIR}/npu_supported_ops.json + DESTINATION ${NPUSUP_INSTALL_DIR} + ) +endfunction() + +function(add_bin_compile_target) + cmake_parse_arguments(BINCMP 
"" "TARGET;OPS_INFO;COMPUTE_UNIT;IMPL_DIR;ADP_DIR;OUT_DIR;INSTALL_DIR" "" ${ARGN}) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/src) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/bin) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/gen) + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_bin_param_build.py + ${BINCMP_OPS_INFO} ${BINCMP_OUT_DIR}/gen ${BINCMP_COMPUTE_UNIT} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("ops binary compile scripts gen info: ${EXEC_INFO}") + message("ops binary compile scripts gen error: ${EXEC_ERROR}") + message(FATAL_ERROR "ops binary compile scripts gen failed!") + endif() + if (NOT TARGET binary) + add_custom_target(binary) + endif() + add_custom_target(${BINCMP_TARGET} + COMMAND cp -r ${BINCMP_IMPL_DIR}/*.* ${BINCMP_OUT_DIR}/src + ) + add_custom_target(${BINCMP_TARGET}_gen_ops_config + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py -p ${BINCMP_OUT_DIR}/bin + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py -p ${BINCMP_OUT_DIR}/bin + -s ${BINCMP_COMPUTE_UNIT} + ) + add_dependencies(binary ${BINCMP_TARGET}_gen_ops_config) + file(GLOB bin_scripts ${BINCMP_OUT_DIR}/gen/*.sh) + foreach(bin_script ${bin_scripts}) + get_filename_component(bin_file ${bin_script} NAME_WE) + string(REPLACE "-" ";" bin_sep ${bin_file}) + list(GET bin_sep 0 op_type) + list(GET bin_sep 1 op_file) + list(GET bin_sep 2 op_index) + if (NOT TARGET ${BINCMP_TARGET}_${op_file}_copy) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/bin/${op_file}) + add_custom_target(${BINCMP_TARGET}_${op_file}_copy + COMMAND cp ${BINCMP_ADP_DIR}/${op_file}.py ${BINCMP_OUT_DIR}/src/${op_type}.py + ) + install(DIRECTORY ${BINCMP_OUT_DIR}/bin/${op_file} + DESTINATION ${BINCMP_INSTALL_DIR}/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + install(FILES ${BINCMP_OUT_DIR}/bin/${op_file}.json + DESTINATION 
${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT}/ OPTIONAL + ) + endif() + add_custom_target(${BINCMP_TARGET}_${op_file}_${op_index} + COMMAND export HI_PYTHON=${ASCEND_PYTHON_EXECUTABLE} && bash ${bin_script} ${BINCMP_OUT_DIR}/src/${op_type}.py ${BINCMP_OUT_DIR}/bin/${op_file} && echo $(MAKE) + WORKING_DIRECTORY ${BINCMP_OUT_DIR} + ) + add_dependencies(${BINCMP_TARGET}_${op_file}_${op_index} ${BINCMP_TARGET} ${BINCMP_TARGET}_${op_file}_copy) + add_dependencies(${BINCMP_TARGET}_gen_ops_config ${BINCMP_TARGET}_${op_file}_${op_index}) + endforeach() + install(FILES ${BINCMP_OUT_DIR}/bin/binary_info_config.json + DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + + install(DIRECTORY ${BINCMP_OUT_DIR}/bin/${op_file} + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/../build_out/kernel/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + install(FILES ${BINCMP_OUT_DIR}/bin/binary_info_config.json + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/../build_out/kernel/config/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + install(FILES ${BINCMP_OUT_DIR}/bin/${op_file}.json + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/../build_out/kernel/config/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + +endfunction() + +function(add_cross_compile_target) + cmake_parse_arguments(CROSSMP "" "TARGET;OUT_DIR;INSTALL_DIR" "" ${ARGN}) + add_custom_target(${CROSSMP_TARGET} ALL + DEPENDS ${CROSSMP_OUT_DIR} + ) + install(DIRECTORY ${CROSSMP_OUT_DIR} + DESTINATION ${CROSSMP_INSTALL_DIR} + ) +endfunction() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/intf.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/intf.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2f362c396622d66132f80f54492a8cc3204882fb --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/intf.cmake @@ -0,0 +1,26 @@ + +add_library(intf_pub INTERFACE) +target_compile_options(intf_pub INTERFACE + -fPIC + 
-fvisibility=hidden + -fvisibility-inlines-hidden + $<$:-O2> + $<$:-O0 -g> + $<$:-std=c++11> + $<$,$>:-ftrapv -fstack-check> + $<$:-pthread -Wfloat-equal -Wshadow -Wformat=2 -Wno-deprecated -Wextra> + $,-fstack-protector-strong,-fstack-protector-all> +) +target_compile_definitions(intf_pub INTERFACE + _GLIBCXX_USE_CXX11_ABI=0 + $<$:_FORTIFY_SOURCE=2> +) +target_include_directories(intf_pub INTERFACE ${ASCEND_CANN_PACKAGE_PATH}/include) +target_link_options(intf_pub INTERFACE + $<$,EXECUTABLE>:-pie> + $<$:-s> + -Wl,-z,relro + -Wl,-z,now + -Wl,-z,noexecstack +) +target_link_directories(intf_pub INTERFACE ${ASCEND_CANN_PACKAGE_PATH}/lib64) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/makeself.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/makeself.cmake new file mode 100644 index 0000000000000000000000000000000000000000..48c565bfb4f2edc6534a81abaa8565c4cf2dfc30 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/makeself.cmake @@ -0,0 +1,17 @@ +execute_process(COMMAND chmod +x ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself.sh) +execute_process(COMMAND ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself.sh + --header ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself-header.sh + --help-header ./help.info + --gzip --complevel 4 --nomd5 --sha256 + ./ ${CPACK_PACKAGE_FILE_NAME} "version:1.0" ./install.sh + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR +) +if (NOT "${EXEC_RESULT}x" STREQUAL "0x") + message(FATAL_ERROR "CPack Command error: ${EXEC_RESULT}\n${EXEC_ERROR}") +endif() +execute_process(COMMAND cp ${CPACK_EXTERNAL_BUILT_PACKAGES} ${CPACK_PACKAGE_DIRECTORY}/ + COMMAND echo "Copy ${CPACK_EXTERNAL_BUILT_PACKAGES} to ${CPACK_PACKAGE_DIRECTORY}/" + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} +) diff --git 
a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..40dd51cfac524b0a9607b7d8b2813edd2210c509 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/CMakeLists.txt @@ -0,0 +1,82 @@ + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +add_library(cust_op_proto SHARED ${ops_srcs} ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +add_library(cust_optiling SHARED ${ops_srcs}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +add_library(cust_opapi SHARED ${aclnn_src}) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + 
${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) +install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) +install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) +install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom.cpp b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5de999d7cc70620900f7b6ae3850695e1af26e8c --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom.cpp @@ -0,0 +1,84 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "welford_update_custom_tiling.h" +#include "register/op_def_registry.h" + +namespace optiling { +constexpr uint32_t BLOCK_DIM = 1; +constexpr bool ISINPLACE = true; +constexpr uint8_t RN_SIZE = 1; +constexpr uint32_t AB_SIZE = 64; +constexpr uint32_t AB_LENGTH = 35; +constexpr float NREC = 1.0 / 8; + +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + WelfordUpdateCustomTilingData tiling; + ComputeTiling(ISINPLACE, RN_SIZE, AB_SIZE, AB_LENGTH, NREC, tiling); + + context->SetBlockDim(BLOCK_DIM); + context->SetTilingKey(1); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + return ge::GRAPH_SUCCESS; +} +} + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext *context) +{ + const gert::Shape *x1_shape = context->GetInputShape(0); + gert::Shape *y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} +} + +namespace ops { +class WelfordUpdateCustom : public OpDef { +public: + explicit WelfordUpdateCustom(const char *name) : OpDef(name) + { + this->Input("srcGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT16 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Input("inMeanGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Input("inVarGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + + this->Output("outMeanGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + 
.Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Output("outVarGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + + this->SetInferShape(ge::InferShape); + this->AICore().SetTiling(optiling::TilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend310p"); + } +}; + +OP_ADD(WelfordUpdateCustom); +} diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..ef4335a3f3c62e6f66b142877586f84db95a22fc --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#define EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(WelfordUpdateCustomTilingData) + TILING_DATA_FIELD_DEF(bool, inplace); + TILING_DATA_FIELD_DEF(uint32_t, nLength); + TILING_DATA_FIELD_DEF(uint32_t, rLength); + TILING_DATA_FIELD_DEF(uint32_t, abComputeLength); + TILING_DATA_FIELD_DEF(float, nRec); + TILING_DATA_FIELD_DEF(uint32_t, tmpLocalSize); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(WelfordUpdateCustom, WelfordUpdateCustomTilingData) +} // namespace optiling + +constexpr bool ISREUSESOURCE = false; +constexpr bool ISINPLACE = true; + +void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, + float nRec, optiling::WelfordUpdateCustomTilingData &tiling) +{ + std::vector shapeVec = {nLength, rLength}; + ge::Shape srcShape(shapeVec); + uint32_t maxsize = 0; + uint32_t minsize = 0; + uint32_t dtypesizeT = 2; // half类型 + uint32_t dtypesizeU = 4; // float类型 + + tiling.set_inplace(inplace); + tiling.set_nLength(nLength); + tiling.set_rLength(rLength); + tiling.set_abComputeLength(abComputeLength); + tiling.set_nRec(nRec); + + AscendC::GetWelfordUpdateMaxMinTmpSize(srcShape, dtypesizeT, dtypesizeU, ISREUSESOURCE, ISINPLACE, maxsize, + minsize); + tiling.set_tmpLocalSize(minsize); +} + +#endif // EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c50a409a20bd0e0cce495824295a18799e4f8be1 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/CMakeLists.txt @@ -0,0 
+1,69 @@ +# set custom compile options +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() +add_ops_compile_options(ALL OPTIONS -mllvm -cce-aicore-jump-expand=true) + +foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if (NOT TARGET ascendc_impl_gen) + add_ops_impl_target(TARGET ascendc_impl_gen + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + endif() + + # dynamic shape binary compile + if (${ENABLE_BINARY_PACKAGE} AND NOT ${ENABLE_CROSS_COMPILE}) + add_bin_compile_target(TARGET ascendc_bin_${compute_unit} + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + ADP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/dynamic + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT ${compute_unit} + ) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen) + endif() + + if (${ENABLE_CROSS_COMPILE} AND ${ENABLE_BINARY_PACKAGE}) + add_cross_compile_target( + TARGET bin_${compute_unit} + OUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../kernel + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/ + ) + endif() +endforeach() + +# generate npu_supported_ops.json +add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR 
${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} +) + +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# install kernel file +if (${ENABLE_SOURCE_PACKAGE}) + file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.py + ) + install(FILES ${KERNEL_FILES} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) +endif() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/welford_update_custom.cpp b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/welford_update_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0fdc799bc4e3c499ccc418b8bbe79bb94f67c99 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/welford_update_custom.cpp @@ -0,0 +1,23 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "../../../../../../kernel_impl/welford_update_custom.h" + +extern "C" __global__ __aicore__ void welford_update_custom(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, + GM_ADDR outMeanGm, GM_ADDR outVarGm, GM_ADDR workspace, GM_ADDR tiling) +{ + GET_TILING_DATA(tilingData, tiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + if (TILING_KEY_IS(1)) { + MyCustomKernel::KernelWelfordUpdate op; + op.Init(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, vecTiling); + op.Process(); + } +} diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/gen_data.py b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f65af46cbbeeeb50bf8f8c9e0a727761512c7a31 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/gen_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +RN_SIZE = 1 +AB_SIZE = 64 +AB_LENGTH = 35 +NREC = 1.0 / 8 + +def gen_golden_data_simple(): + x1 = np.random.uniform(1, 100, [RN_SIZE * AB_SIZE]).astype(np.float16) + x2 = np.random.uniform(-60000, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + x3 = np.random.uniform(0, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + golden1 = x2.copy() + golden2 = x3.copy() + + for i in range(AB_LENGTH): + n = np.float32(NREC) + golden1[i] = x2[i] + (x1[i] - x2[i]) * n + golden2[i] = x3[i] + (x1[i] - x2[i]) * (x1[i] - golden1[i]) + + os.system("mkdir -p ./input") + x1.tofile("./input/input_srcGm.bin") + x2.tofile("./input/input_inMeanGm.bin") + x3.tofile("./input/input_inVarGm.bin") + os.system("mkdir -p ./output") + golden1.tofile("./output/golden_outMeanGm.bin") + golden2.tofile("./output/golden_outVarGm.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/help.info b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/help.info new file mode 100644 index 0000000000000000000000000000000000000000..f4b28d57a8150f0df6c386473b7554c7d087c90f --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/help.info @@ -0,0 +1 @@ + --install-path Install operator package to specific dir path \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/install.sh b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..8468c5a256f2c77fad5bf78ab108ca5b62aad672 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/install.sh @@ -0,0 +1,318 @@ +#!/bin/bash 
+vendor_name=customize +targetdir=/usr/local/Ascend/opp +target_custom=0 + +sourcedir=$PWD/packages +vendordir=vendors/$vendor_name + +QUIET="y" + +while true +do + case $1 in + --quiet) + QUIET="y" + shift + ;; + --install-path=*) + INSTALL_PATH=$(echo $1 | cut -d"=" -f2-) + INSTALL_PATH=${INSTALL_PATH%*/} + shift + ;; + --*) + shift + ;; + *) + break + ;; + esac +done + +log() { + cur_date=`date +"%Y-%m-%d %H:%M:%S"` + echo "[runtime] [$cur_date] "$1 +} + +if [ -n "${INSTALL_PATH}" ]; then + if [[ ! "${INSTALL_PATH}" = /* ]]; then + log "[ERROR] use absolute path for --install-path argument" + exit 1 + fi + if [ ! -d ${INSTALL_PATH} ]; then + mkdir ${INSTALL_PATH} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${INSTALL_PATH} failed" + exit 1 + fi + fi + targetdir=${INSTALL_PATH} +elif [ -n "${ASCEND_CUSTOM_OPP_PATH}" ]; then + if [ ! -d ${ASCEND_CUSTOM_OPP_PATH} ]; then + mkdir -p ${ASCEND_CUSTOM_OPP_PATH} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${ASCEND_CUSTOM_OPP_PATH} failed" + fi + fi + targetdir=${ASCEND_CUSTOM_OPP_PATH} +else + if [ "x${ASCEND_OPP_PATH}" == "x" ]; then + log "[ERROR] env ASCEND_OPP_PATH no exist" + exit 1 + fi + targetdir="${ASCEND_OPP_PATH}" +fi + +if [ ! -d $targetdir ];then + log "[ERROR] $targetdir no exist" + exit 1 +fi + +upgrade() +{ + if [ ! -d ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 files" + return 0 + fi + + if [ ! -d ${targetdir}/$vendordir/$1 ];then + log "[INFO] create ${targetdir}/$vendordir/$1." + mkdir -p ${targetdir}/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/$1 failed" + return 1 + fi + else + has_same_file=-1 + for file_a in ${sourcedir}/$vendordir/$1/*; do + file_b=${file_a##*/}; + if [ "ls ${targetdir}/$vendordir/$1" = "" ]; then + log "[INFO] ${targetdir}/$vendordir/$1 is empty !!" + return 1 + fi + grep -q $file_b <<<`ls ${targetdir}/$vendordir/$1`; + if [[ $? 
-eq 0 ]]; then + echo -n "${file_b} " + has_same_file=0 + fi + done + if [ 0 -eq $has_same_file ]; then + if test $QUIET = "n"; then + echo "[INFO]: has old version in ${targetdir}/$vendordir/$1, \ + you want to Overlay Installation , please enter:[o]; \ + or replace directory installation , please enter: [r]; \ + or not install , please enter:[n]." + + while true + do + read orn + if [ "$orn" = n ]; then + return 0 + elif [ "$orn" = m ]; then + break; + elif [ "$0rn" = r ]; then + [ -n "${targetdir}/$vendordir/$1/" ] && rm -rf "${targetdir}/$vendordir/$1"/* + break; + else + echo "[ERROR] input error, please input again!" + fi + done + fi + fi + log "[INFO] replace or merge old ops $1 files .g....." + fi + + log "copy new ops $1 files ......" + if [ -d ${targetdir}/$vendordir/$1/ ]; then + chmod -R +w "$targetdir/$vendordir/$1/" >/dev/null 2>&1 + fi + cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/ + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 files failed" + return 1 + fi + + return 0 +} +upgrade_proto() +{ + if [ ! -f ${sourcedir}/$vendordir/custom.proto ]; then + log "[INFO] no need to upgrade custom.proto files" + return 0 + fi + if [ ! -d ${targetdir}/$vendordir/framework/caffe ];then + log "[INFO] create ${targetdir}/$vendordir/framework/caffe." + mkdir -p ${targetdir}/$vendordir/framework/caffe + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/framework/caffe failed" + return 1 + fi + else + if [ -f ${targetdir}/$vendordir/framework/caffe/custom.proto ]; then + # 有老版本,判断是否要覆盖式安装 + if test $QUIET = "n"; then + echo "[INFO] ${targetdir}/$vendordir/framework/caffe has old version"\ + "custom.proto file. Do you want to replace? [y/n] " + + while true + do + read yn + if [ "$yn" = n ]; then + return 0 + elif [ "$yn" = y ]; then + break; + else + echo "[ERROR] input error, please input again!" + fi + done + fi + fi + log "[INFO] replace old caffe.proto files ......" 
+ fi + chmod -R +w "$targetdir/$vendordir/framework/caffe/" >/dev/null 2>&1 + cp -rf ${sourcedir}/$vendordir/custom.proto ${targetdir}/$vendordir/framework/caffe/ + if [ $? -ne 0 ];then + log "[ERROR] copy new custom.proto failed" + return 1 + fi + log "[INFO] copy custom.proto success" + + return 0 +} + +upgrade_file() +{ + if [ ! -e ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 file" + return 0 + fi + + log "copy new $1 files ......" + cp -f ${sourcedir}/$vendordir/$1 $targetdir/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 file failed" + return 1 + fi + + return 0 +} + +delete_optiling_file() +{ + if [ ! -d ${targetdir}/vendors ];then + log "[INFO] $1 not exist, no need to uninstall" + return 0 + fi + sys_info=$(uname -m) + if [ ! -d ${sourcedir}/$vendordir/$1/ai_core/tbe/op_tiling/lib/linux/${sys_info} ];then + rm -rf ${sourcedir}/$vendordir/$1/ai_core/tbe/op_tiling/liboptiling.so + fi + return 0 +} + +log "[INFO] copy uninstall sh success" + +if [ ! -d ${targetdir}/vendors ];then + log "[INFO] create ${targetdir}/vendors." + mkdir -p ${targetdir}/vendors + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/vendors failed" + return 1 + fi +fi +chmod u+w ${targetdir}/vendors + +echo "[ops_custom]upgrade framework" +upgrade framework +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op proto" +upgrade op_proto +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade version.info" +upgrade_file version.info +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op impl" +delete_optiling_file op_impl +upgrade op_impl +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op api" +upgrade op_api +if [ $? -ne 0 ];then + exit 1 +fi + +upgrade_proto +if [ $? 
-ne 0 ];then + exit 1 +fi + +# set the set_env.bash +if [ -n "${INSTALL_PATH}" ] && [ -d ${INSTALL_PATH} ]; then + _ASCEND_CUSTOM_OPP_PATH=${targetdir}/${vendordir} + bin_path="${_ASCEND_CUSTOM_OPP_PATH}/bin" + set_env_variable="#!/bin/bash\nexport ASCEND_CUSTOM_OPP_PATH=${_ASCEND_CUSTOM_OPP_PATH}:\${ASCEND_CUSTOM_OPP_PATH}" + if [ ! -d ${bin_path} ]; then + mkdir -p ${bin_path} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${bin_path} failed" + exit 1 + fi + fi + echo -e ${set_env_variable} > ${bin_path}/set_env.bash + if [ $? -ne 0 ]; then + log "[ERROR] write ASCEND_CUSTOM_OPP_PATH to set_env.bash failed" + exit 1 + else + log "[INFO] using requirements: when custom module install finished or before you run the custom module, \ + execute the command [ source ${bin_path}/set_env.bash ] to set the environment path" + fi +else + config_file=${targetdir}/vendors/config.ini + if [ ! -f ${config_file} ]; then + touch ${config_file} + chmod 640 ${config_file} + echo "load_priority=$vendor_name" > ${config_file} + if [ $? 
-ne 0 ];then + echo "echo load_priority failed" + exit 1 + fi + else + found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" + found_vendor=$(echo $found_vendors | sed "s/$vendor_name//g" | tr ',' ' ') + vendor=$(echo $found_vendor | tr -s ' ' ',') + if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" + fi + fi +fi + +chmod u-w ${targetdir}/vendors + +if [ -d ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/ ]; then + chmod -R 440 ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/* >/dev/null 2>&1 +fi +if [ -f ${targetdir}/ascend_install.info ]; then + chmod -R 440 ${targetdir}/ascend_install.info +fi +if [ -f ${targetdir}/scene.info ]; then + chmod -R 440 ${targetdir}/scene.info +fi +if [ -f ${targetdir}/version.info ]; then + chmod -R 440 ${targetdir}/version.info +fi + +echo "SUCCESS" +exit 0 + diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/upgrade.sh b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/upgrade.sh new file mode 100644 index 0000000000000000000000000000000000000000..e091734858534a6aa10bb5204b87302438004926 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/upgrade.sh @@ -0,0 +1,151 @@ +#!/bin/bash +vendor_name=customize +targetdir=/usr/local/Ascend/opp +target_custom=0 + +sourcedir=$PWD/packages +vendordir=vendors/$vendor_name + +log() { + cur_date=`date +"%Y-%m-%d %H:%M:%S"` + echo "[runtime] [$cur_date] "$1 +} + +if [[ "x${ASCEND_OPP_PATH}" == "x" ]];then + log "[ERROR] env ASCEND_OPP_PATH no exist" + exit 1 +fi + +targetdir=${ASCEND_OPP_PATH} + +if [ ! -d $targetdir ];then + log "[ERROR] $targetdir no exist" + exit 1 +fi + +upgrade() +{ + if [ ! -d ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 files" + return 0 + fi + + if [ ! 
-d ${targetdir}/$vendordir/$1 ];then + log "[INFO] create ${targetdir}/$vendordir/$1." + mkdir -p ${targetdir}/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/$1 failed" + return 1 + fi + else + vendor_installed_dir=$(ls "$targetdir/vendors" 2> /dev/null) + for i in $vendor_installed_dir;do + vendor_installed_file=$(ls "$vendor_installed_dir/$vendor_name/$i" 2> /dev/null) + if [ "$i" = "$vendor_name" ] && [ "$vendor_installed_file" != "" ]; then + echo "[INFO]: $vendor_name custom opp package has been installed on the path $vendor_installed_dir, \ + you want to Overlay Installation , please enter:[o]; \ + or replace directory installation , please enter: [r]; \ + or not install , please enter:[n]." + fi + while true + do + read mrn + if [ "$mrn" = m ]; then + break + elif [ "$mrn" = r ]; then + [ -n "$vendor_installed_file"] && rm -rf "$vendor_installed_file" + break + elif [ "$mrn" = n ]; then + return 0 + else + echo "[WARNING]: Input error, please input m or r or n to choose!" + fi + done + done + log "[INFO] replace old ops $1 files ......" + fi + + log "copy new ops $1 files ......" + cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/ + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 files failed" + return 1 + fi + + return 0 +} + +upgrade_file() +{ + if [ ! -e ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 file" + return 0 + fi + + log "copy new $1 files ......" + cp -f ${sourcedir}/$vendordir/$1 $targetdir/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 file failed" + return 1 + fi + + return 0 +} + +log "[INFO] copy uninstall sh success" + +echo "[ops_custom]upgrade framework" +upgrade framework +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op proto" +upgrade op_proto +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op impl" +upgrade op_impl +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op api" +upgrade op_api +if [ $? 
-ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade version.info" +upgrade_file version.info +if [ $? -ne 0 ];then + exit 1 +fi + +config_file=${targetdir}/vendors/config.ini +found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" +found_vendor=$(echo $found_vendors | sed "s/$vendor_name//g" | tr ',' ' ') +vendor=$(echo $found_vendor | tr -s ' ' ',') +if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" +fi + +changemode() +{ + if [ -d ${targetdir} ];then + chmod -R 550 ${targetdir}>/dev/null 2>&1 + fi + + return 0 +} +echo "[ops_custom]changemode..." +#changemode +if [ $? -ne 0 ];then + exit 1 +fi + +echo "SUCCESS" +exit 0 + diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d2d11c90ac38d0774fa9db32716a6c1e22bd3b5 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/CMakeLists.txt @@ -0,0 +1,2 @@ +include(cmake/fun.cmake) +add_subdirectory(npu) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/cmake/fun.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/cmake/fun.cmake new file mode 100644 index 0000000000000000000000000000000000000000..024e26303a128ee4d8edb90b1d8a735a9851f4d7 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/cmake/fun.cmake @@ -0,0 +1,53 @@ + +set(UPER_CHARS A B C D E F G H I J K L M N O P Q R S T U V W X Y Z) +function(string_to_snake str_in snake_out) + set(str_cam ${str_in}) + foreach(uper_char ${UPER_CHARS}) + string(TOLOWER "${uper_char}" lower_char) + string(REPLACE 
${uper_char} "_${lower_char}" str_cam ${str_cam}) + endforeach() + string(SUBSTRING ${str_cam} 1 -1 str_cam) + set(${snake_out} "${str_cam}" PARENT_SCOPE) +endfunction() + +function(add_cpu_target) + cmake_parse_arguments(CPU_TEST "" "OP" "SRC" ${ARGN}) + string_to_snake("${CPU_TEST_OP}" op_snake) + add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h + COMMAND python3 ${CMAKE_SOURCE_DIR}/cmake/util/tiling_data_def_build.py + ${CMAKE_SOURCE_DIR}/op_host/${op_snake}_tiling.h + ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h + DEPENDS ${CMAKE_SOURCE_DIR}/op_host/${op_snake}_tiling.h + ) + add_custom_target(gen_${op_snake}_tiling_header + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h + ) + + add_executable(${op_snake}_cpu ${CPU_TEST_SRC}) + add_dependencies(${op_snake}_cpu gen_${op_snake}_tiling_header) + target_compile_options(${op_snake}_cpu PRIVATE -g -include ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h) + target_link_libraries(${op_snake}_cpu PRIVATE tikicpulib::ascend910B1) + set_target_properties(${op_snake}_cpu PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + ) +endfunction() + +function(add_npu_target) + cmake_parse_arguments(NPU_TEST "" "OP" "SRC" ${ARGN}) + string_to_snake("${NPU_TEST_OP}" op_snake) + add_executable(${op_snake}_npu ${NPU_TEST_SRC}) + target_compile_options(${op_snake}_npu PRIVATE -g) + target_include_directories(${op_snake}_npu PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/include/acl + ${ASCEND_AUTOGEN_PATH} + ) + target_link_libraries(${op_snake}_npu PRIVATE + intf_pub + cust_opapi + ascendcl + nnopbase + ) + set_target_properties(${op_snake}_npu PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + ) +endfunction() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/CMakeLists.txt new file mode 100644 index 
0000000000000000000000000000000000000000..6ea029137762737c155a94fd918508eb66cbab36 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/CMakeLists.txt @@ -0,0 +1,10 @@ +add_npu_target(OP WelfordUpdateCustom SRC welford_update_custom_main.cpp) + +add_custom_target(run_npu_test + COMMAND echo "===============================================================================" + COMMAND echo " Run NPU test at ${CMAKE_CURRENT_BINARY_DIR}" + COMMAND echo "===============================================================================" + COMMAND $ + COMMAND echo "===============================================================================" + ) +add_dependencies(run_npu_test welford_update_custom_npu) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/welford_update_custom_main.cpp b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/welford_update_custom_main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7ab1b3d528895d9c766d71cfb7ce46b77d21d87 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/welford_update_custom_main.cpp @@ -0,0 +1,215 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
#include <cstdint> +#include <string> +#include "acl/acl_rt.h"
(reinterpret_cast<const float *>(outputData))[i]; +        float b = (reinterpret_cast<const float *>(goldenData))[i]; +        float ae = std::abs(a - b); +        float re = ae / std::abs(b);
aclCreateTensor(info->dims, info->dimCnt, info->dtype, NULL, 0, info->fmt, info->dims, info->dimCnt, data); + } + + size_t workspaceSize = 0; + aclOpExecutor *handle; + int32_t ret = 0; + ret = aclnnWelfordUpdateCustomGetWorkspaceSize(tensors[0], tensors[1], tensors[INDEX_IN_VAR], + tensors[INDEX_OUT_MEAN], tensors[INDEX_OUT_VAR], &workspaceSize, &handle); + printf("aclnnWelfordUpdateCustomGetWorkspaceSize ret %u workspace size %lu\n", ret, workspaceSize); + void *workspace = nullptr; + if (workspaceSize != 0) { + CHECK_ACL(aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + } + ret = aclnnWelfordUpdateCustom(workspace, workspaceSize, handle, stream); + printf("aclnnWelfordUpdateCustom ret %u\n", ret); + if (aclrtSynchronizeStreamWithTimeout(stream, TIMEOUT) != ACL_SUCCESS) { + printf("Synchronize stream failed\n"); + } + + uint8_t *outMeanHost, *outVarHost; + int64_t outMeanHostSize = GetDataSize(&(tensorDesc[INDEX_OUT_MEAN])); + int64_t outVarHostSize = GetDataSize(&(tensorDesc[INDEX_OUT_VAR])); + + CHECK_ACL(aclrtMallocHost((void **)(&outMeanHost), outMeanHostSize)); + CHECK_ACL(aclrtMallocHost((void **)(&outVarHost), outVarHostSize)); + + CHECK_ACL(aclrtMemcpy(outMeanHost, outMeanHostSize, devMem[INDEX_OUT_MEAN], outMeanHostSize, + ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(outVarHost, outVarHostSize, devMem[INDEX_OUT_VAR], outVarHostSize, + ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("../output/output_outMeanGm.bin", outMeanHost, outMeanHostSize); + WriteFile("../output/output_outVarGm.bin", outVarHost, outVarHostSize); + + bool goldenResult = true; + goldenResult &= CompareResult(outMeanHost, outMeanHostSize, ParamNames[INDEX_OUT_MEAN]); + goldenResult &= CompareResult(outVarHost, outVarHostSize, ParamNames[INDEX_OUT_VAR]); + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + + CHECK_ACL(aclrtFreeHost(outMeanHost)); + CHECK_ACL(aclrtFreeHost(outVarHost)); + + for (auto i = 0; i < 
sizeof(tensorDesc) / sizeof(struct tensorInfo); i++) { + if (!tensors[i]) + continue; + if (devMem[i]) { + CHECK_ACL(aclrtFree(devMem[i])); + } + aclDestroyTensor(tensors[i]); + } + DestroyStream(stream, 0); + return 0; +} diff --git a/examples/readme.md b/examples/readme.md index b87070f404d9ffeffbb4d5393921e2845774bdd2..ebd14ef84f1cd01cc563983f63bba218aa8ecda6 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -53,6 +53,10 @@ layernorm_grad 计算layernorm的反向传播梯度。 + + welford_update + Welford算法的前处理,一种在线计算均值和方差的方法。 + pad broadcast diff --git a/impl/kfc/kernel_kfc.h b/impl/kfc/kernel_kfc.h index 5972e904a095bd1876c52e0fae158cacf7fc1876..5f7ad2beeb46cdbb3281ae09c1f1245763a4b33b 100644 --- a/impl/kfc/kernel_kfc.h +++ b/impl/kfc/kernel_kfc.h @@ -25,7 +25,7 @@ #include "lib/matmul/matmul_client.h" #include "../matmul/matmul_server.h" #endif -namespace Gemm { +namespace AscendC { constexpr uint16_t WORKSPACE_SYNC_ID = 15; __aicore__ inline void clearWorkspace(__gm__ uint8_t* workspace) { @@ -296,5 +296,5 @@ __aicore__ inline void SetMatrixKfc(TPipe* pipe, KfcCommClient* kfcClient, const } }; // namespace AscendC // Compatible with the previously used matmul namespace -namespace matmul = Gemm; +namespace matmul = AscendC; #endif diff --git a/impl/kfc/kfc_register_obj.h b/impl/kfc/kfc_register_obj.h index aee06e79e21e58d270f28718e7b851321505ef67..38a32c37f1184b2837c6c5680ec254b9748de9ac 100644 --- a/impl/kfc/kfc_register_obj.h +++ b/impl/kfc/kfc_register_obj.h @@ -205,7 +205,7 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... 
b) AscendC::SetMatrixKfc(tpipe, &__kfcClient__, 0, workspace, __VA_ARGS__); \ AscendC::AscendCTimeStamp(static_cast(AscendC::TimeStampId::TIME_STAMP_MATMUL_MATRIX_KFC)); \ if constexpr (!asEnableMixDualMaster) { \ - AscendC::WaitEvent(Gemm::WORKSPACE_SYNC_ID); \ + AscendC::WaitEvent(AscendC::WORKSPACE_SYNC_ID); \ } \ AscendC::AscendCTimeStamp(static_cast(AscendC::TimeStampId::TIME_STAMP_MATMUL_WAIT_EVE)) @@ -224,7 +224,7 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... b) } \ AscendC::SetMatrixKfc(tpipe, &__kfcClient__, 0, workspace, __VA_ARGS__); \ if constexpr (!asEnableMixDualMaster) { \ - AscendC::WaitEvent(Gemm::WORKSPACE_SYNC_ID); \ + AscendC::WaitEvent(AscendC::WORKSPACE_SYNC_ID); \ } #endif diff --git a/impl/matmul/matmul_call_back.h b/impl/matmul/matmul_call_back.h index 9796aa31abb66cd1672bf9819e678deb70786e6a..12914ecd07f72749ce6283f3ee31e2e8e4791073 100644 --- a/impl/matmul/matmul_call_back.h +++ b/impl/matmul/matmul_call_back.h @@ -15,8 +15,8 @@ #ifndef LIB_MATMUL_MATMUL_CALL_BACK_H #define LIB_MATMUL_MATMUL_CALL_BACK_H -namespace Gemm { -using namespace AscendC; +namespace AscendC { + template &co1Local, const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, @@ -32,5 +32,5 @@ struct MatmulCallBackFunc { int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; }; -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/matmul_constant_tiling_impl.h b/impl/matmul/matmul_constant_tiling_impl.h index 31f09d14fc14db5298b061ded439d6f357b90a7d..a9050d6fddd3f061dbef440eab4873f2b2873d15 100644 --- a/impl/matmul/matmul_constant_tiling_impl.h +++ b/impl/matmul/matmul_constant_tiling_impl.h @@ -19,9 +19,8 @@ #include "matmul_utils.h" #include "kernel_operator.h" -namespace Gemm { -using namespace AscendC; - +namespace AscendC { +namespace Impl { constexpr 
int32_t C0_BYTE_SIZE = 32; constexpr int32_t HW_C0 = 16; constexpr int32_t DB_ON = 2; @@ -36,6 +35,7 @@ constexpr int32_t L1_SIZE = 512 * 1024; #else constexpr int32_t L1_SIZE = 512 * 1024; #endif +} template struct MatmulTiling { @@ -446,17 +446,17 @@ __aicore__ inline constexpr MatmulConfig ToMatmulConfig(const MatmulApiStaticTil template __aicore__ constexpr int32_t GetReduceC0Size() { - return C0_BYTE_SIZE / GetBitSize() * ONE_BYTE_BIT_SIZE; + return Impl::C0_BYTE_SIZE / GetBitSize() * ONE_BYTE_BIT_SIZE; } __aicore__ constexpr int32_t GetML0(const MatmulConfig &mmCFG) { - return CeilNoLog(mmCFG.basicM, HW_C0); + return CeilNoLog(mmCFG.basicM, Impl::HW_C0); } __aicore__ constexpr int32_t GetNL0(const MatmulConfig &mmCFG) { - return CeilNoLog(mmCFG.basicN, HW_C0); + return CeilNoLog(mmCFG.basicN, Impl::HW_C0); } template @@ -472,19 +472,19 @@ __aicore__ constexpr int32_t GetMTE1Loop(const MatmulConfig &mmCFG) int32_t nL0 = GetNL0(mmCFG); int32_t mL0 = GetML0(mmCFG); int32_t kL0 = GetKL0(mmCFG); - return MIN_MTE1_LOAD / ((nL0 == 1 ? 1 : kL0) + (kL0 == 1 ? 1 : mL0)); + return Impl::MIN_MTE1_LOAD / ((nL0 == 1 ? 1 : kL0) + (kL0 == 1 ? 
1 : mL0)); } __aicore__ constexpr int32_t GetMaxMAL1(const MatmulConfig &mmCFG) { - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); int32_t mL0 = GetML0(mmCFG); return CeilNoLog(m, mL0); } __aicore__ constexpr int32_t GetMaxNBL1(const MatmulConfig &mmCFG) { - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); int32_t nL0 = GetNL0(mmCFG); return CeilNoLog(n, nL0); } @@ -494,7 +494,7 @@ __aicore__ constexpr int32_t GetMaxKAL1(const MatmulConfig &mmCFG) { int32_t mL0 = GetML0(mmCFG); int32_t kL0 = GetKL0(mmCFG); - int32_t maxAL1 = ((MIN_MTE1_LOAD + mL0 - 1) / mL0 + kL0 - 1) / kL0; + int32_t maxAL1 = ((Impl::MIN_MTE1_LOAD + mL0 - 1) / mL0 + kL0 - 1) / kL0; return MaxValue(maxAL1, GetMTE1Loop(mmCFG)); } @@ -503,7 +503,7 @@ __aicore__ constexpr int32_t GetMaxKBL1(const MatmulConfig &mmCFG) { int32_t nL0 = GetNL0(mmCFG); int32_t kL0 = GetKL0(mmCFG); - int32_t maxBL1 = ((MIN_MTE1_LOAD + nL0 - 1) / nL0 + kL0 - 1) / kL0; + int32_t maxBL1 = ((Impl::MIN_MTE1_LOAD + nL0 - 1) / nL0 + kL0 - 1) / kL0; return MaxValue(maxBL1, GetMTE1Loop(mmCFG)); } @@ -613,19 +613,19 @@ __aicore__ constexpr int32_t CalcL1MaxLen(int32_t l1LeftSize, const L1Status &l1 int32_t maxLen = 1; switch (type) { case L1TilingType::KAL1_16: - maxLen = l1LeftSize / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * C0_BYTE_SIZE); + maxLen = l1LeftSize / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE); maxLen = AlignDown(maxLen, alignValue); break; case L1TilingType::KBL1_16: - maxLen = l1LeftSize / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * C0_BYTE_SIZE); + maxLen = l1LeftSize / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * Impl::C0_BYTE_SIZE); maxLen = AlignDown(maxLen, alignValue); break; case L1TilingType::M_AL1: - maxLen = l1LeftSize / (Align(l1Status.kAL1, alignValue) * mmCFG.basicM * l1Status.dbAL1 * C0_BYTE_SIZE); + maxLen = l1LeftSize / (Align(l1Status.kAL1, 
alignValue) * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE); break; case L1TilingType::N_BL1: - maxLen = l1LeftSize / (Align(l1Status.kBL1, alignValue) * mmCFG.basicN * l1Status.dbBL1 * C0_BYTE_SIZE + - GetChannelWise(mmCFG) * mmCFG.basicN * C0_BYTE_SIZE); + maxLen = l1LeftSize / (Align(l1Status.kBL1, alignValue) * mmCFG.basicN * l1Status.dbBL1 * Impl::C0_BYTE_SIZE + + GetChannelWise(mmCFG) * mmCFG.basicN * Impl::C0_BYTE_SIZE); break; } return maxLen; @@ -643,8 +643,8 @@ __aicore__ constexpr L1Status GetL1StatusBothFullLoad(const MatmulConfig &mmCFG, return {kAL1, kBL1, 1, 1, 1, 1, 0}; } L1Status l1Status {kAL1, kBL1, GetMaxMAL1(mmCFG), GetMaxNBL1(mmCFG), 1, 1, INT32_MAX}; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); if (GetL1Size(l1Status, mmCFG) <= l1Size) { int32_t loadSize = (PhyPosIsL1(A_TYPE::pos) ? 0 : m) + (PhyPosIsL1(B_TYPE::pos) ? 0 : n); @@ -675,13 +675,13 @@ __aicore__ constexpr L1Status GetL1StatusAL1FullLoad(const MatmulConfig &mmCFG, return {0, 0, 0, 0, 0, 0, INT32_MAX}; } int32_t kaAlignValue = GetKAAlignValue(); - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); int32_t aL1Size = MaxValue(Align(k, kaAlignValue), Align(kAL1, kaAlignValue)) * - MaxValue(maxMAL1 * mmCFG.basicM, m * HW_C0) * C0_BYTE_SIZE; + MaxValue(maxMAL1 * mmCFG.basicM, m * Impl::HW_C0) * Impl::C0_BYTE_SIZE; int32_t bL1Size = PhyPosIsL1(A_TYPE::pos) ? 
l1Size : l1Size - aL1Size; - l1Status.dbBL1 = DB_ON; + l1Status.dbBL1 = Impl::DB_ON; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; } int32_t biasSize = GetBiasL1Size(l1Status, mmCFG); int32_t dequantSize = GetDeQuantSize(l1Status, mmCFG); @@ -698,13 +698,13 @@ __aicore__ constexpr L1Status GetL1StatusAL1FullLoad(const MatmulConfig &mmCFG, int32_t nRepeat = CeilNoLog(mmCFG.singleCoreN, mmCFG.basicN); l1Status.nBL1 = GetNearestFactor(nRepeat, l1Status.nBL1); if (l1Status.nBL1 * mmCFG.basicN == mmCFG.singleCoreN) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; } } bool invalidL1Status = (l1Status.nBL1 == 0 || l1Status.kBL1 == 0); int32_t mRepeat = CeilNoLog(mmCFG.singleCoreM, mmCFG.basicM); int32_t possibleMRepeat = (l1Status.kBL1 == k) ? 1 : mRepeat; - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); l1Status.loadSize = invalidL1Status ? INT32_MAX : (PhyPosIsL1(A_TYPE::pos) ? 0 : m) + possibleMRepeat * n; return l1Status; } @@ -722,13 +722,13 @@ __aicore__ constexpr L1Status GetL1StatusBL1FullLoad(const MatmulConfig &mmCFG, return {0, 0, 0, 0, 0, 0, INT32_MAX}; } int32_t kbAlignValue = GetKBAlignValue(); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); int32_t bL1Size = MaxValue(Align(k, kbAlignValue), Align(kBL1, kbAlignValue)) * - MaxValue(maxNBL1 * mmCFG.basicN, n * HW_C0) * C0_BYTE_SIZE; + MaxValue(maxNBL1 * mmCFG.basicN, n * Impl::HW_C0) * Impl::C0_BYTE_SIZE; int32_t aL1Size = PhyPosIsL1(B_TYPE::pos) ? 
l1Size : l1Size - bL1Size; - l1Status.dbAL1 = DB_ON; + l1Status.dbAL1 = Impl::DB_ON; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } int32_t biasSize = GetBiasL1Size(l1Status, mmCFG); int32_t dequantSize = GetDeQuantSize(l1Status, mmCFG); @@ -743,13 +743,13 @@ __aicore__ constexpr L1Status GetL1StatusBL1FullLoad(const MatmulConfig &mmCFG, int32_t mRepeat = CeilNoLog(mmCFG.singleCoreM, mmCFG.basicM); l1Status.mAL1 = GetNearestFactor(mRepeat, l1Status.mAL1); if (l1Status.mAL1 * mmCFG.basicM == mmCFG.singleCoreM) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } } bool invalidL1Status = (l1Status.mAL1 == 0 || l1Status.kAL1 == 0); int32_t nRepeat = CeilNoLog(mmCFG.singleCoreN, mmCFG.basicN); int32_t possibleNRepeat = (l1Status.kAL1 == k) ? 1 : nRepeat; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); l1Status.loadSize = invalidL1Status ? INT32_MAX : (PhyPosIsL1(B_TYPE::pos) ? 
0 : n) + possibleNRepeat * m; return l1Status; } @@ -791,8 +791,8 @@ __aicore__ constexpr L1Status GetL1StatusMFirst(const L1Status &l1Status, const l1MFirst, mmCFG, kbAlignValue, L1TilingType::N_BL1), GetMaxNBL1(mmCFG), nRepeat), 1); l1MFirst.nBL1 = GetNearestFactor(mRepeat, l1MFirst.nBL1); int32_t mL0 = GetML0(mmCFG); - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); l1MFirst.loadSize = m + n * CeilNoLog(m, l1MFirst.mAL1 * mL0); return l1MFirst; } @@ -819,8 +819,8 @@ __aicore__ constexpr L1Status GetL1StatusNFirst(const L1Status &l1Status, const l1NFirst.mAL1 = GetNearestFactor(mRepeat, l1NFirst.mAL1); l1NFirst.nBL1 = GetNearestFactor(mRepeat, l1NFirst.nBL1); int32_t nL0 = GetNL0(mmCFG); - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); l1NFirst.loadSize = n + m * CeilNoLog(n, l1NFirst.nBL1 * nL0); return l1NFirst; } @@ -832,26 +832,26 @@ __aicore__ constexpr L1Status GetL1DbNeitherFullLoad(const MatmulConfig &mmCFG, int32_t reduceC0Size = GetReduceC0Size(); int32_t k = CeilNoLog(mmCFG.singleCoreK, reduceC0Size); int32_t kL0 = GetKL0(mmCFG); - L1Status l1Status {kL0, DB_ON, 1, 1, DB_ON, DB_ON, 0}; + L1Status l1Status {kL0, Impl::DB_ON, 1, 1, Impl::DB_ON, Impl::DB_ON, 0}; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } } l1Status.kBL1 = k; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); int32_t mL0 = GetML0(mmCFG); bool bothDoubleBuffer = m != mL0 && mmCFG.singleCoreK > mmCFG.basicK && GetL1Size(l1Status, mmCFG) > l1Size; 
l1Status.kBL1 = kL0; if (bothDoubleBuffer) { - l1Status.dbAL1 = DB_ON; - l1Status.dbBL1 = DB_ON; + l1Status.dbAL1 = Impl::DB_ON; + l1Status.dbBL1 = Impl::DB_ON; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } } } @@ -882,8 +882,8 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoadForNZ(const L1Status &l1Nz, } else { // when NeitherFullLoadMN change the nBL1 and mAL1 int32_t perK = MinValue((l1Size - biasSize - dequantSize) / - (mmCFG.basicM * C0_BYTE_SIZE * l1Status.dbAL1 + - mmCFG.basicN * C0_BYTE_SIZE * l1Status.dbBL1) / + (mmCFG.basicM * Impl::C0_BYTE_SIZE * l1Status.dbAL1 + + mmCFG.basicN * Impl::C0_BYTE_SIZE * l1Status.dbBL1) / kL0 * kL0, k); const int32_t aAlignedPerK = Align(perK, kaAlignValue); const int32_t bAlignedPerK = Align(perK, kbAlignValue); @@ -928,10 +928,10 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoad(const L1Status &l1Db, int32_t bL1Size = GetBL1Size(l1Status, mmCFG); int32_t aL1Size = l1Size - bL1Size; l1Status.kAL1 = MinValue((aL1Size - biasSize - dequantSize) / - (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * C0_BYTE_SIZE), k); + (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE), k); aL1Times = MaxValue(l1Status.kAL1 / kL0, 1); l1Status.kAL1 = aL1Times * kL0; - aL1Size = l1Status.kAL1 * l1Status.mAL1 * mmCFG.basicM * C0_BYTE_SIZE * l1Status.dbAL1; + aL1Size = l1Status.kAL1 * l1Status.mAL1 * mmCFG.basicM * Impl::C0_BYTE_SIZE * l1Status.dbAL1; bL1Size = l1Size - aL1Size; l1Status.kBL1 = MinValue((bL1Size - dequantSize - biasSize) / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * mmCFG.basicK * kL0 * GetBitSize() / ONE_BYTE_BIT_SIZE), k); @@ -944,11 +944,11 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoad(const L1Status &l1Db, int32_t aL1Size = GetAL1Size(l1Status, mmCFG); int32_t bL1Size = l1Size - aL1Size; l1Status.kBL1 = MinValue((bL1Size - 
biasSize - dequantSize) / - (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * C0_BYTE_SIZE), k); + (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * Impl::C0_BYTE_SIZE), k); int32_t bL1Times = MaxValue(l1Status.kBL1 / kL0, 1); bL1Times = GetNearestFactor(aL1Times, bL1Times); l1Status.kBL1 = bL1Times * kL0; - bL1Size = l1Status.kBL1 * l1Status.nBL1 * mmCFG.basicN * C0_BYTE_SIZE * l1Status.dbBL1; + bL1Size = l1Status.kBL1 * l1Status.nBL1 * mmCFG.basicN * Impl::C0_BYTE_SIZE * l1Status.dbBL1; aL1Size = l1Size - bL1Size; l1Status.kAL1 = MinValue((aL1Size - dequantSize - biasSize) / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * mmCFG.basicK * kL0 * GetBitSize() / ONE_BYTE_BIT_SIZE), k); @@ -999,8 +999,8 @@ __aicore__ constexpr L1Status GetL1StatusNeitherFullLoad(const MatmulConfig &mmC if (l1Status.kAL1 < k && l1Status.kBL1 < k) { l1Status.mAL1 = 1; l1Status.nBL1 = 1; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); int32_t nL0 = GetNL0(mmCFG); l1Status.loadSize = m * CeilNoLog(n, nL0) + n * CeilNoLog(m, nL0); } @@ -1077,12 +1077,12 @@ __aicore__ constexpr int32_t GetIterateOrder(const L1Status &l1Status, const Mat __aicore__ constexpr int32_t GetL0ADb(const MatmulConfig &mmCFG, uint32_t l0ASize) { - return (mmCFG.basicM * C0_BYTE_SIZE > l0ASize / DB_ON) ? DB_OFF : DB_ON; + return (mmCFG.basicM * Impl::C0_BYTE_SIZE > l0ASize / Impl::DB_ON) ? Impl::DB_OFF : Impl::DB_ON; } __aicore__ constexpr int32_t GetL0BDb(const MatmulConfig &mmCFG, uint32_t l0BSize) { - return (mmCFG.basicN * C0_BYTE_SIZE > l0BSize / DB_ON) ? DB_OFF : DB_ON; + return (mmCFG.basicN * Impl::C0_BYTE_SIZE > l0BSize / Impl::DB_ON) ? 
Impl::DB_OFF : Impl::DB_ON; } template @@ -1139,5 +1139,5 @@ __aicore__ constexpr int32_t GetTransLength(const MatmulConfig &mmCFG, const L1S } return MaxValue(a1Length, b1Length, c1Length, biasLength); } -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MATMUL_CONSTANT_TILING_IMPL_H \ No newline at end of file diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 1fd7b996146a265d07610403d5cbce44cd566eee..2e7ebcb1c01a3d500660fc1c605abb59bad86c8e 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -20,9 +20,10 @@ #include "../../impl/matmul/modules/matmul_module.h" #include "../../impl/matmul/modules/matmul_param.h" #include "../../impl/matmul/matmul_macro_def.h" -namespace Gemm { - +namespace AscendC { +namespace Impl { constexpr int32_t DOUBLE_SIZE = 2; +} template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> @@ -641,13 +642,13 @@ __aicore__ inline void MatmulImplBase= 220 if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - lenFactor = DOUBLE_SIZE; + lenFactor = Impl::DOUBLE_SIZE; } #endif MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, lenFactor); @@ -1053,7 +1054,7 @@ __aicore__ inline void MatmulImplBaseInitBuffer(var.qidBias_, 1, DOUBLE_SIZE * var.tiling_.GetBaseN() * sizeof(BiasT)); + var.tpipe_->InitBuffer(var.qidBias_, 1, Impl::DOUBLE_SIZE * var.tiling_.GetBaseN() * sizeof(BiasT)); } else { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_.GetBaseN() * sizeof(BiasT)); } @@ -1175,7 +1176,7 @@ __aicore__ inline void MatmulImplBase= 220 if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - lenFactor = DOUBLE_SIZE; + lenFactor = Impl::DOUBLE_SIZE; } #endif MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, lenFactor); @@ -1192,7 +1193,7 @@ __aicore__ inline void MatmulImplBaseInitBuffer(var.qidBias_, 1, DOUBLE_SIZE * var.tiling_.GetBaseN() * sizeof(BiasT)); + var.tpipe_->InitBuffer(var.qidBias_, 1, Impl::DOUBLE_SIZE * var.tiling_.GetBaseN() * 
sizeof(BiasT)); } else { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_.GetBaseN() * sizeof(BiasT)); } @@ -2973,8 +2974,8 @@ template __aicore__ inline void MatmulImplBase::ComputeNormWithMdb(int kInner) { - int dbLoop = (var.curM_ + 1 == var.mIter_) ? 1 : DOUBLE_SIZE; - MatmulInstr::useL0PingPong_ = (dbLoop == DOUBLE_SIZE) ? 1 : 0; + int dbLoop = (var.curM_ + 1 == var.mIter_) ? 1 : Impl::DOUBLE_SIZE; + MatmulInstr::useL0PingPong_ = (dbLoop == Impl::DOUBLE_SIZE) ? 1 : 0; LocalTensor bias; bool isBiasEnable = false; if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { @@ -2989,7 +2990,7 @@ __aicore__ inline void MatmulImplBaseLoadData(kInner, var.curN_, var.baseUseK_, var.baseUseN_); for (int dbInner = 0; dbInner < dbLoop; dbInner++) { if (dbInner > 0) { - if (var.curM_ + DOUBLE_SIZE == var.mIter_) { + if (var.curM_ + Impl::DOUBLE_SIZE == var.mIter_) { // if tailM_ != baseM, reset sAL1M_ and sMadM_ dbUsedM = var.tailM_; MatmulInstr::sAL1M_ = CeilAlign(dbUsedM, BLOCK_CUBE); @@ -3044,8 +3045,8 @@ template __aicore__ inline void MatmulImplBase::ComputeNormWithNdb(int kInner) { - int dbLoop = (var.curN_ + 1 == var.nIter_) ? 1 : DOUBLE_SIZE; - MatmulInstr::useL0PingPong_ = (dbLoop == DOUBLE_SIZE) ? 1 : 0; + int dbLoop = (var.curN_ + 1 == var.nIter_) ? 1 : Impl::DOUBLE_SIZE; + MatmulInstr::useL0PingPong_ = (dbLoop == Impl::DOUBLE_SIZE) ? 
1 : 0; LocalTensor bias; bool isBiasEnable = false; if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { @@ -3060,7 +3061,7 @@ __aicore__ inline void MatmulImplBaseLoadData(var.curM_, kInner, var.baseUseM_, var.baseUseK_); for (int dbInner = 0; dbInner < dbLoop; dbInner++) { if (dbInner > 0) { - if (var.curN_ + DOUBLE_SIZE == var.nIter_) { + if (var.curN_ + Impl::DOUBLE_SIZE == var.nIter_) { // if tailN_ != baseN, reset sBL1N_ and sMadN_ dbUsedN = var.tailN_; MatmulInstr::sBL1N_ = CeilAlign(dbUsedN, BLOCK_CUBE); @@ -6697,5 +6698,5 @@ __aicore__ inline void MatmulImplBase; }; -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_MACRO_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/matmul_macro_utils.h b/impl/matmul/matmul_macro_utils.h index b18a5f993df2293d7d5bc9309551196cf7a4b368..7abb77b8abb9e1325ce91007d27605615df1418e 100644 --- a/impl/matmul/matmul_macro_utils.h +++ b/impl/matmul/matmul_macro_utils.h @@ -15,6 +15,8 @@ #ifndef IMPL_MATMUL_MATMUL_MACRO_UTILS_H #define IMPL_MATMUL_MATMUL_MACRO_UTILS_H +namespace AscendC { +namespace Impl { #define HW_N0 16 #define HW_M0 16 #define ALIGN_NUM 16 @@ -35,7 +37,8 @@ constexpr int32_t SHIFT_48_BIT = 48; constexpr int32_t SHIFT_56_BIT = 56; constexpr int32_t CTRL_51_BIT = 51; constexpr uint8_t padList[4] = {0, 0, 0, 0}; -namespace Gemm { +} + __aicore__ inline uint16_t CeilDiv(uint16_t num1, uint16_t num2) { ASSERT(num2 > 0); @@ -65,9 +68,9 @@ template __aicore__ inline constexpr bool IsL0ACache() { if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - return ToMatmulConfig(MM_CFG).basicM * ToMatmulConfig(MM_CFG).basicK * sizeof(A_TYPE) * DB_FACTOR <= L0AUF_SIZE; + return ToMatmulConfig(MM_CFG).basicM * ToMatmulConfig(MM_CFG).basicK * sizeof(A_TYPE) * Impl::DB_FACTOR <= L0AUF_SIZE; } else { - return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * DB_FACTOR; + return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * 
Impl::DB_FACTOR; } } @@ -75,9 +78,9 @@ template __aicore__ inline constexpr bool IsL0BCache() { if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - return ToMatmulConfig(MM_CFG).basicK * ToMatmulConfig(MM_CFG).basicN * sizeof(B_TYPE) * DB_FACTOR <= L0BUF_SIZE; + return ToMatmulConfig(MM_CFG).basicK * ToMatmulConfig(MM_CFG).basicN * sizeof(B_TYPE) * Impl::DB_FACTOR <= L0BUF_SIZE; } else { - return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * DB_FACTOR; + return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * Impl::DB_FACTOR; } } @@ -96,5 +99,5 @@ __aicore__ inline constexpr bool IsL0Cache() } return IsL0ACache() || IsL0BCache(); } -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/matmul_macro_v200_impl.h b/impl/matmul/matmul_macro_v200_impl.h index 64646f546fc8851b5bacf0eedd42ba45e0084e32..f20dba135451c4a27ad88a19b89c2c991236698c 100644 --- a/impl/matmul/matmul_macro_v200_impl.h +++ b/impl/matmul/matmul_macro_v200_impl.h @@ -18,9 +18,7 @@ #include "kernel_operator.h" #include "matmul_macro_utils.h" -namespace Gemm { -using namespace AscendC; - +namespace AscendC { // ===========mad template=================/ // Cmatrix type, Amatrix type, Bmatrix type, L0C_using_uniflag, L0C_using_hset template @@ -145,11 +143,11 @@ inline __aicore__ void MacroMatmulV200 0) { uint16_t wAlign = CeilAlign(sAL1K_, HW_M0); - Load3DSetFMatrixCal(sFmH_, wAlign, padList); + Load3DSetFMatrixCal(sFmH_, wAlign, Impl::padList); } else { // fmatrix w should be 16 aligned uint16_t wAlign = CeilAlign(sAL1M_, HW_M0); - Load3DSetFMatrixCal(sFmH_, wAlign, padList); + Load3DSetFMatrixCal(sFmH_, wAlign, Impl::padList); } if (isGemv_) { int32_t fracSize = BYTE_PER_FRACTAL / sizeof(A_T); @@ -195,10 +193,10 @@ inline __aicore__ void MacroMatmulV200::value && IsSameType::value) { kDirectionAlign_ = 1; } - Load3DSetFMatrixCal(1, sAL1K_, padList); + Load3DSetFMatrixCal(1, sAL1K_, Impl::padList); } 
else { // fmatrix w should be 16 aligned - Load3DSetFMatrixCal(1, ToMatmulConfig(MM_CFG).basicM, padList); + Load3DSetFMatrixCal(1, ToMatmulConfig(MM_CFG).basicM, Impl::padList); } if (ssBmatrixTranspose1_ < 1) { - Load3DSetFMatrixBCal(1, sBL1K_, padList); + Load3DSetFMatrixBCal(1, sBL1K_, Impl::padList); } if constexpr (isBias) { if (sL0cInit_) { @@ -404,5 +403,5 @@ inline __aicore__ void MacroMatmulBasic &cMatrix, uint16_t mmadK, uint8_t unitFlag, bool l0c_initial) {} }; -} // namespace Gemm +} // namespace AscendC #endif \ No newline at end of file diff --git a/impl/matmul/matmul_macro_v220_l0cache_impl.h b/impl/matmul/matmul_macro_v220_l0cache_impl.h index e8a9acd271136dbf4dcc89d7ab73f18f58b4fbb6..c73b2515e3ddd6882303ce1b3f53d55e44675811 100644 --- a/impl/matmul/matmul_macro_v220_l0cache_impl.h +++ b/impl/matmul/matmul_macro_v220_l0cache_impl.h @@ -17,8 +17,8 @@ #include "matmul_macro_v220_intf.h" -namespace Gemm { -using namespace AscendC; +namespace AscendC { + // ===========mad template=================/ // Cmatrix type, Amatrix type, Bmatrix type, L0C_using_uniflag, L0C_using_hset @@ -592,14 +592,14 @@ inline __aicore__ void MacroMatmul::value) { l0a = l0a[L0AUF_SIZE / sizeof(A_T)]; } else { - l0a = l0a[L0AUF_SIZE / DB_FACTOR / sizeof(A_T)]; + l0a = l0a[L0AUF_SIZE / Impl::DB_FACTOR / sizeof(A_T)]; } } @@ -635,7 +635,7 @@ inline __aicore__ void MacroMatmul struct IBShareCache { __aicore__ inline IBShareCache() {}; }; @@ -879,14 +879,14 @@ private: template __aicore__ inline constexpr bool IsSharedMatmul() { - if constexpr (!Gemm::ToMatmulConfig(MM_CFG).enableInit || - Gemm::ToMatmulConfig(MM_CFG).enableMixDualMaster) { + if constexpr (!AscendC::ToMatmulConfig(MM_CFG).enableInit || + AscendC::ToMatmulConfig(MM_CFG).enableMixDualMaster) { return true; } return false; } template , + const auto& MM_CFG = CFG_NORM, class MM_CB = AscendC::MatmulCallBackFunc, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> struct MatmulInstBase { __aicore__ inline MatmulInstBase(){}; @@ 
-895,13 +895,13 @@ template struct MatmulInstShared : MatmulInstBase { __aicore__ inline MatmulInstShared(){}; - Gemm::MatmulService cubeObj[1]; + AscendC::MatmulService cubeObj[1]; }; template struct MatmulInst : MatmulInstBase { __aicore__ inline MatmulInst(){}; - Gemm::MatmulService cubeObj[MIX_NUM]; + AscendC::MatmulService cubeObj[MIX_NUM]; }; template , template, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> +class MM_CB = AscendC::MatmulCallBackFunc, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> class MatmulServiceAuxBase { using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; @@ -1257,5 +1257,5 @@ class MatmulServiceAux __aicore__ inline constexpr int32_t GetC0Size() { @@ -88,7 +88,7 @@ struct DataCopyOutParams { uint64_t cbufWorkspaceAddr = 0; }; -constexpr int32_t ONE_BYTE_BIT_SIZE = 8; +namespace Impl { constexpr int32_t QUEUE_DEPTH = 1; constexpr int32_t NZ_MASK_VAlUE = 2; constexpr int32_t FLOAT_FACTOR = 2; @@ -96,11 +96,8 @@ constexpr int32_t B4_C0SIZE = 64; constexpr int32_t B8_C0SIZE = 32; constexpr int32_t B32_C0SIZE = 8; constexpr int32_t B16_C0SIZE = 16; -constexpr int32_t CTRL_46_BIT = 46; -constexpr int32_t CTRL_47_BIT = 47; constexpr int32_t L0_SIZE = 64 * 1024; constexpr int32_t MAX_BLOCK_COUNT_SIZE = 4095; -constexpr int32_t INT4_TWO = 2; #if __CCE_AICORE__ < 200 constexpr int32_t DB_FACTOR = 1; #else @@ -121,6 +118,10 @@ const LocalTensor NULL_TENSOR; // equal: sizeof(KfcMsg) * MAX_MATMUL_OBJ * MAX_AIV_NUM + // equal: TOTAL_UB_SIZE * MAX_AIV_NUM constexpr int64_t GM_OFFSET = 128 * 2 * 64 * 50 + 128 * 8 * 50 + 192 * 1024 * 50; +} + +template +const LocalTensor NULL_TENSOR; template struct GetDstType { using Type = T; @@ -153,7 +154,7 @@ int32_t constexpr GetNdNzMask(CubeFormat dstFormat, CubeFormat srcFormat) if ((srcFormat == CubeFormat::ND) && (dstFormat == CubeFormat::NZ)) { return 1; } else if ((srcFormat == CubeFormat::NZ) && (dstFormat == CubeFormat::ND)) { - return NZ_MASK_VAlUE; + return Impl::NZ_MASK_VAlUE; } return 0; } @@ -162,7 
+163,7 @@ template __aicore__ inline constexpr static int32_t AuxGetFactor() { if (sizeof(SrcT) == sizeof(float)) { - return FLOAT_FACTOR; + return Impl::FLOAT_FACTOR; } return 1; } @@ -171,13 +172,13 @@ template __aicore__ inline constexpr static int32_t AuxGetC0Size() { if (sizeof(SrcT) == sizeof(float)) { - return B32_C0SIZE; + return Impl::B32_C0SIZE; } else if (IsSameType::value) { - return B8_C0SIZE; + return Impl::B8_C0SIZE; } else if (IsSameType::value) { - return B4_C0SIZE; + return Impl::B4_C0SIZE; } - return B16_C0SIZE; + return Impl::B16_C0SIZE; } __aicore__ constexpr bool DoMatmulNorm(MatmulConfig mmCFG) @@ -391,8 +392,8 @@ __aicore__ constexpr bool IsBasic(const MatmulApiStaticTiling &mmCFG) __aicore__ constexpr int GetL0PingPong(MatmulConfig mmCFG) { - return ((mmCFG.basicM * mmCFG.basicK * DB_FACTOR) <= L0_SIZE) && - ((mmCFG.basicK * mmCFG.basicN * DB_FACTOR) <= L0_SIZE) ? 1 : 0; + return ((mmCFG.basicM * mmCFG.basicK * Impl::DB_FACTOR) <= Impl::L0_SIZE) && + ((mmCFG.basicK * mmCFG.basicN * Impl::DB_FACTOR) <= Impl::L0_SIZE) ? 
1 : 0; } __aicore__ constexpr int GetL0PingPong(const MatmulApiStaticTiling &mmCFG) @@ -479,13 +480,6 @@ __aicore__ constexpr bool GetDstNzC0Stride(const MatmulApiStaticTiling &mmCFG) return GetDstNzC0Stride(mmCFG.cfg); } -__aicore__ inline int Ceil(int num1, int num2) -{ - ASCENDC_ASSERT((num2 > 0), - { KERNEL_LOG(KERNEL_ERROR, "num2 is %d , which should be larger than 0", num2); }); - return (num1 + num2 - 1) / num2; -} - template __aicore__ inline T CeilT(T num1, T num2) { @@ -694,5 +688,5 @@ __aicore__ inline T CeilAlign(T num1, T num2) return Ceil(num1, num2) * num2; } -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/context/context.h b/impl/matmul/modules/context/context.h index bc4ae71b0ced2511b8a572d0644c6244040627ff..dbbe11dadf97bd918141341a4fdfdecf747ecfb0 100644 --- a/impl/matmul/modules/context/context.h +++ b/impl/matmul/modules/context/context.h @@ -17,7 +17,7 @@ #ifndef IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H #define IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -35,5 +35,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif //IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H \ No newline at end of file diff --git a/impl/matmul/modules/dfx/dfx_config.h b/impl/matmul/modules/dfx/dfx_config.h index 33b0efdc10298a237cccb15c698fe63cde06dde5..5690003782498620e5e874faba2221d737695e0c 100644 --- a/impl/matmul/modules/dfx/dfx_config.h +++ b/impl/matmul/modules/dfx/dfx_config.h @@ -19,7 +19,7 @@ #include "handlers/dfx_chain_handler.h" #include "dfx_func_info.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { struct DfxConfig { @@ -28,5 +28,5 @@ struct DfxConfig { }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_func_info.h b/impl/matmul/modules/dfx/dfx_func_info.h index 
55cc0e852bd242d2686482c06b4445a516f03cd6..2e6951543b1047999dc31e0eb89f23e9be73c8d1 100644 --- a/impl/matmul/modules/dfx/dfx_func_info.h +++ b/impl/matmul/modules/dfx/dfx_func_info.h @@ -16,7 +16,7 @@ #ifndef MATMUL_DFX_FUNC_INFO_H #define MATMUL_DFX_FUNC_INFO_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { struct DfxFuncInfo { @@ -29,5 +29,5 @@ struct DfxFuncInfo { }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_handler.h b/impl/matmul/modules/dfx/dfx_handler.h index 92ac062b108c5e848546d0bd8b6c30423c39cf03..ddc67ca05fe0b8f3c04054f7587e2668cc876b0c 100644 --- a/impl/matmul/modules/dfx/dfx_handler.h +++ b/impl/matmul/modules/dfx/dfx_handler.h @@ -18,7 +18,7 @@ #include "dfx_config.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -40,5 +40,5 @@ struct DfxHandler { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_proxy.h b/impl/matmul/modules/dfx/dfx_proxy.h index 855da1b8b6385c7a7cad6dd0ab718385215f148e..a243bd2f0d3631ed66519d10a725bffa18d78ccb 100644 --- a/impl/matmul/modules/dfx/dfx_proxy.h +++ b/impl/matmul/modules/dfx/dfx_proxy.h @@ -19,7 +19,7 @@ #include #include "dfx_handler.h" -namespace Gemm { +namespace AscendC { template using enable_if_t = typename std::enable_if::type; @@ -172,5 +172,5 @@ private: \ } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_registry.h b/impl/matmul/modules/dfx/dfx_registry.h index 9cea2a72b30f81d36c70a1eeea3d87dfcdfc3a65..5ce6a846ff343b4fd20ee363a4a920939d041d8c 100644 --- a/impl/matmul/modules/dfx/dfx_registry.h +++ b/impl/matmul/modules/dfx/dfx_registry.h @@ -19,11 +19,11 @@ #include "dfx_proxy.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { MATMUL_DFX_PROXY_REGISTER(InputL1Cache, 
ClearAL1Cache, ClearBL1Cache); } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h b/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h index 6be4698bca2d0da859d3b353317814b54439eed3..b1eb816546299d96d5560556237ea031811b3c7d 100644 --- a/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h +++ b/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h @@ -16,7 +16,7 @@ #ifndef MATMUL_DFX_CHAIN_HANDLER_H #define MATMUL_DFX_CHAIN_HANDLER_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -41,5 +41,5 @@ struct DfxChainHandler { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/feature_trait/matmul_chip_cap.h b/impl/matmul/modules/feature_trait/matmul_chip_cap.h index 038c2296a7d27e09747fdee5e32cbd9a319920b9..aed7df70c10be523caabb10d31662201cb9055b4 100644 --- a/impl/matmul/modules/feature_trait/matmul_chip_cap.h +++ b/impl/matmul/modules/feature_trait/matmul_chip_cap.h @@ -15,7 +15,7 @@ #ifndef IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H #define IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -80,5 +80,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_CHIP_CAP_H_ diff --git a/impl/matmul/modules/feature_trait/matmul_feature_trait.h b/impl/matmul/modules/feature_trait/matmul_feature_trait.h index 25a274389fabcb0b143b5764ee53370ec9d9faea..5531b3e691f0667bca14bed80c9b512403701789 100644 --- a/impl/matmul/modules/feature_trait/matmul_feature_trait.h +++ b/impl/matmul/modules/feature_trait/matmul_feature_trait.h @@ -19,7 +19,7 @@ #include "matmul_chip_cap.h" #include "matmul_iter_ctrl_cfg.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -55,5 +55,5 @@ public: }; } // namespace Detail } // namespace Impl 
-} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_FEATURE_TRAIT_H_ diff --git a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h index 25680363198491039e2af16ab038adaf61a8c603..a092212bfc0e8501ee1a2e3cbb5e0420de6ed66b 100644 --- a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h +++ b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h @@ -18,7 +18,7 @@ #include "../../../../lib/matmul/tiling.h" #include "../../../../lib/matmul/constant_tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -31,5 +31,5 @@ struct MatmulIterCtrlCfg { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_ITER_CTRL_CFG_H_ \ No newline at end of file diff --git a/impl/matmul/modules/iterator/matmul_iterate_controller.h b/impl/matmul/modules/iterator/matmul_iterate_controller.h index da4cd7c7fe43f42a1d53c997135fae1de3908846..455e39c4c3c847aef72ef3b70ec866432907663b 100644 --- a/impl/matmul/modules/iterator/matmul_iterate_controller.h +++ b/impl/matmul/modules/iterator/matmul_iterate_controller.h @@ -20,7 +20,7 @@ #include "../../matmul_utils.h" #include "../feature_trait/matmul_iter_ctrl_cfg.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -150,5 +150,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/matmul_local_workspace.h b/impl/matmul/modules/matmul_local_workspace.h index fccb8b3b126b74334796ec68e66e593a770d6566..c6461475bf4f9580138aaf6d9790e1e02d53cc5d 100644 --- a/impl/matmul/modules/matmul_local_workspace.h +++ b/impl/matmul/modules/matmul_local_workspace.h @@ -17,7 +17,7 @@ #include "matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { constexpr int32_t ENVEC_UBUNREUSE_COEFF = 2; @@ -293,5 +293,5 @@ private: } // namespace Detail } // namespace 
Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_MATMUL_LOCAL_WORKSPACE_H \ No newline at end of file diff --git a/impl/matmul/modules/matmul_module.h b/impl/matmul/modules/matmul_module.h index db9968eb1ef2f54b00b25b80973147d99038ddd5..f3226e9752125b73d295ff0d89edbde3432688e6 100644 --- a/impl/matmul/modules/matmul_module.h +++ b/impl/matmul/modules/matmul_module.h @@ -19,14 +19,14 @@ #include "dfx/dfx_config.h" /* MatmulModuleBase */ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template using void_t = void; // if user define self-implement module, but inherited from base module implemented in matmul, -// child module shoud declare : using BASE_MODULE = Gemm::XXXModuleName; +// child module shoud declare : using BASE_MODULE = AscendC::XXXModuleName; struct MatmulNullBase {}; template @@ -41,7 +41,7 @@ struct MatmulModuleBase> { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC /* MatmulImplBase */ #define MATMUL_IMPL__ IMPL #define MATMUL_POLICY__ POLICY @@ -56,23 +56,23 @@ struct MatmulModuleBase> { (static_cast(MATMUL_CAST_TO_CONST_IMPL())) #define MATMUL_CAST_TO_PROXY_OF(NAME) \ -typename Gemm::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_IMPL_OF(NAME)) +typename AscendC::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_IMPL_OF(NAME)) #define MATMUL_CAST_TO_CONST_PROXY_OF(NAME) \ -typename Gemm::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_CONST_IMPL_OF(NAME)) +typename AscendC::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_CONST_IMPL_OF(NAME)) #define MATMUL_MODULE(NAME) cast_to_##NAME() #define MATMUL_USE_MODULE(NAME) \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) { \ - if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_PROXY_OF(NAME); \ } else { \ return MATMUL_CAST_TO_IMPL_OF(NAME); \ } \ } \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ - if constexpr 
(Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_CONST_PROXY_OF(NAME); \ } else { \ return MATMUL_CAST_TO_CONST_IMPL_OF(NAME); \ @@ -81,14 +81,14 @@ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ #define MATMUL_USE_MODULE_ON(NAME, ...) \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) { \ - if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_PROXY_OF(template NAME<__VA_ARGS__>); \ } else { \ return MATMUL_CAST_TO_IMPL_OF(template NAME<__VA_ARGS__>); \ } \ } \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ - if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_CONST_PROXY_OF(template NAME<__VA_ARGS__>);\ } else { \ return MATMUL_CAST_TO_CONST_IMPL_OF(template NAME<__VA_ARGS__>); \ @@ -100,7 +100,7 @@ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ #define MATMUL_POLICY_DEFAULT_OF(DEFAULT) \ template \ - class MATMUL_POLICY = Gemm::Impl::Detail::DEFAULT + class MATMUL_POLICY = AscendC::Impl::Detail::DEFAULT #define MATMUL_POLICY_TEMPLATE_OF(NAME) \ template class NAME @@ -118,14 +118,14 @@ MATMUL_POLICY_TEMPLATE::type; \ +friend typename AscendC::Impl::Detail::MatmulModuleBase::type; \ friend NAME #define MATMUL_ALLOW_USING_TEMPLATE(NAME, ...) \ using NAME = typename MATMUL_MODULE_IN_POLICY(template NAME<__VA_ARGS__>) /* Matmul Private Module */ -#define MATMUL_PRIVATE_TEMPLATE Gemm::Impl::Detail::MatmulPrivateModules +#define MATMUL_PRIVATE_TEMPLATE AscendC::Impl::Detail::MatmulPrivateModules #define MATMUL_MODULE_IN_PRIVATE(...) 
\ MATMUL_PRIVATE_TEMPLATE::__VA_ARGS__ diff --git a/impl/matmul/modules/matmul_param.h b/impl/matmul/modules/matmul_param.h index 3f89a0f33d378296a4b678d5a0c12c382408e0e5..c37d87b74211e7cf0d6fb89f68136342518410fa 100644 --- a/impl/matmul/modules/matmul_param.h +++ b/impl/matmul/modules/matmul_param.h @@ -24,7 +24,7 @@ #include "matmul_type_def.h" #include "resource/cube_in_buffer/global_cache.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* ************************************************************************************************** @@ -454,5 +454,5 @@ struct MatmulParams::Type; using Context = MatmulContext; - using CubeOutBuffer = Gemm::Impl::Detail::CubeOutBuffer; - using CopyCubeOut = Gemm::Impl::Detail::CopyCubeOut; - using CopyCubeInA = Gemm::Impl::Detail::CopyCubeIn, MM_CFG>; + using CubeOutBuffer = AscendC::Impl::Detail::CubeOutBuffer; + using CopyCubeOut = AscendC::Impl::Detail::CopyCubeOut; + using CopyCubeInA = AscendC::Impl::Detail::CopyCubeIn, MM_CFG>; using CopyCubeInB = CopyCubeIn, MM_CFG>; using CubeInBufferA = CubeInBuffer, MM_CFG>; using CubeInBufferB = CubeInBuffer, MM_CFG>; }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_POLICY_H_ diff --git a/impl/matmul/modules/matmul_private_modules.h b/impl/matmul/modules/matmul_private_modules.h index 6bc0f5674863a26c3eed38cf733f152c91205ce2..a21dc2161d8b29379ce1066534f36b00939f8509 100644 --- a/impl/matmul/modules/matmul_private_modules.h +++ b/impl/matmul/modules/matmul_private_modules.h @@ -29,7 +29,7 @@ #include "param/matmul_usr_define_info.h" #include "iterator/matmul_iterate_controller.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -39,8 +39,8 @@ struct MatmulPrivateModules { using CopyCubeInParamsB = CopyCubeInParams>; using MatmulTensorInfoA = MatmulTensorInfo>; using MatmulTensorInfoB = MatmulTensorInfo>; - using MatmulSubBlockInfo = 
Gemm::Impl::Detail::MatmulSubBlockInfo; - using MatmulShapeTiling = Gemm::Impl::Detail::MatmulShapeTiling; + using MatmulSubBlockInfo = AscendC::Impl::Detail::MatmulSubBlockInfo; + using MatmulShapeTiling = AscendC::Impl::Detail::MatmulShapeTiling; using DataCopyUtilsA = DataCopyWrapper>; using DataCopyUtilsB = DataCopyWrapper>; using BatchDataCopyUtilsA = BatchDataCopyWrapper>; @@ -50,14 +50,14 @@ struct MatmulPrivateModules { using BatchCopyCubeInA = BatchCopyCubeIn>; using BatchCopyCubeInB = BatchCopyCubeIn>; using IterateController = - Gemm::Impl::Detail::MatmulIterateController::iterCtrlCfg>; + AscendC::Impl::Detail::MatmulIterateController::iterCtrlCfg>; using LocalWorkspace = MatmulLocalWorkspace; - using MatmulShapeInfo = Gemm::Impl::Detail::MatmulShapeInfo; - using MatmulQuantProcessor = Gemm::Impl::Detail::MatmulQuantProcessor; - using MatmulUserDefineInfo = Gemm::Impl::Detail::MatmulUserDefineInfo; + using MatmulShapeInfo = AscendC::Impl::Detail::MatmulShapeInfo; + using MatmulQuantProcessor = AscendC::Impl::Detail::MatmulQuantProcessor; + using MatmulUserDefineInfo = AscendC::Impl::Detail::MatmulUserDefineInfo; }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_PRIVATE_MODULES_H_ diff --git a/impl/matmul/modules/matmul_type_def.h b/impl/matmul/modules/matmul_type_def.h index 60b29b4ecf08dafe8e3189ff9cc29a081d7c1059..f157549326e1a271004f1d05bbe1ae2deafb6f57 100644 --- a/impl/matmul/modules/matmul_type_def.h +++ b/impl/matmul/modules/matmul_type_def.h @@ -17,7 +17,7 @@ #include "lib/matmul/tiling.h" -namespace Gemm { +namespace AscendC { enum class InputTypeTag : uint8_t { A = 0, @@ -54,5 +54,5 @@ struct MatmulInputCType : INPUT_TYPE { constexpr static InputTypeTag TAG = InputTypeTag::C; }; -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_TYPE_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/modules/param/matmul_shape_info.h b/impl/matmul/modules/param/matmul_shape_info.h 
index 91ae26d0567d06afad7b49b8d7a9187d7148fba5..bc8cb9e01b02f92ba740ce961d14fbe0e1a758c9 100644 --- a/impl/matmul/modules/param/matmul_shape_info.h +++ b/impl/matmul/modules/param/matmul_shape_info.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -141,5 +141,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_INFO_H diff --git a/impl/matmul/modules/param/matmul_shape_tiling.h b/impl/matmul/modules/param/matmul_shape_tiling.h index 4f1f9855e4597ce5182822511f39f430819ba054..a31707b382e70890cfccd6a1b3ee49eec0f9c625 100644 --- a/impl/matmul/modules/param/matmul_shape_tiling.h +++ b/impl/matmul/modules/param/matmul_shape_tiling.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -31,5 +31,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_TILING_H diff --git a/impl/matmul/modules/param/matmul_subblock_info.h b/impl/matmul/modules/param/matmul_subblock_info.h index 88bf9426b839fd953624f3828f9d91620d5b87cc..fd7d02d06ced0326ee28b9757cade94b41b0d8ac 100644 --- a/impl/matmul/modules/param/matmul_subblock_info.h +++ b/impl/matmul/modules/param/matmul_subblock_info.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -36,5 +36,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SUBBLOCK_INFO_H diff --git a/impl/matmul/modules/param/matmul_tensor_info.h b/impl/matmul/modules/param/matmul_tensor_info.h index 89eaa007cad6f2231ba70dd66abaec5ab365c801..0361addf13d2b259ba5fe383d4309fc7273f5dd2 100644 --- 
a/impl/matmul/modules/param/matmul_tensor_info.h +++ b/impl/matmul/modules/param/matmul_tensor_info.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -106,5 +106,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_TENSOR_INFO_H diff --git a/impl/matmul/modules/param/matmul_usr_define_info.h b/impl/matmul/modules/param/matmul_usr_define_info.h index ef6610d5514b1fedd6fd0b5bdf31fbe33b2a9b2b..3571c5fe4682dcba9aacca11b8a8c429a984849a 100644 --- a/impl/matmul/modules/param/matmul_usr_define_info.h +++ b/impl/matmul/modules/param/matmul_usr_define_info.h @@ -15,7 +15,7 @@ #ifndef IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ #define IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -33,5 +33,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h index 2a4eb1f93d5242a8c74dcbd4ca73c8c0f0ff7587..a136f40b98766c761033019749d8cf031d0e656a 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h @@ -18,7 +18,7 @@ #include "cube_in_buffer_intf.h" #include "../../param/matmul_shape_tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { constexpr int32_t BANK_CONFLICT_SIZE = 512; @@ -38,7 +38,7 @@ public: __aicore__ inline ~CubeInBuffer() {} __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) { - int32_t matrixByteSize = baseBlockSize * 
Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; int32_t stepSize = GetTotalCacheNum(); cacheFactor_ = (cacheNum / stepSize - 1) & 1; int32_t queDepth = cacheFactor_ == 0 ? SINGLE_QUE : DOUBLE_QUE; @@ -245,5 +245,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_DOUBLE_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h index 1b8c4b2b49d6f5d27c992ef59a854d6fead6f205..b0d3eb0335361d8fbdfe52163803ba2ffdc82a68 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h @@ -18,7 +18,7 @@ #include "cube_in_buffer_intf.h" #include "global_cache.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -39,7 +39,7 @@ public: baseBlockSize_ = baseBlockSize; groupCache0_.Init(); groupCache1_.Init(); - int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize_ * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; groupCache0_.InitBuffer(matrixByteSize * cacheNum); groupCache1_.InitBuffer(matrixByteSize * cacheNum); } @@ -150,5 +150,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h index 6dba5339344b3634cafa28c3db584f46db6c9d35..a24220281994c6412a74a4062ca0c156ce7b401c 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h @@ -19,7 +19,7 @@ #include 
"../../../matmul_utils.h" #include "cube_in_buffer_utils.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -116,5 +116,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_INTF_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h index 713e441a724ba521eb5657944497c33d6bd37ae4..6cd296545ce790ad2a3a6ad969cda8e03c7f9812 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h @@ -17,7 +17,7 @@ #include "cube_in_buffer_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -37,7 +37,7 @@ public: __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) { baseBlockSize_ = baseBlockSize; - int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize_ * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; int32_t reduceAxisCnt = MATMUL_MODULE(MatmulShapeInfo)->GetKIter(); auto tpipePtr = GetTPipePtr(); if (cacheNum > DB_FACTOR) { @@ -179,5 +179,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_NORMAL_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h index 73f6ae4ff1a808eda3e1430366114e69b2bf860e..449f47739b8682e0b24634f7303eee62d659724a 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h @@ -17,7 +17,7 @@ #include "cube_in_buffer_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -35,7 +35,7 @@ public: __aicore__ inline void 
Init(int32_t baseBlockSize, int32_t cacheNum) { (void) cacheNum; - int32_t matrixByteSize = baseBlockSize * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; GetTPipePtr()->InitBuffer(qid_, SINGLE_QUE, matrixByteSize); } @@ -100,5 +100,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_SINGLE_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h index b094afb683d7a9d2024990d35e5c47f570dfdb1f..d2cb23c63511352f7ce8980314ed31f49f242834 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h @@ -18,7 +18,7 @@ #include "cube_in_buffer_intf.h" #include "global_cache.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -37,7 +37,7 @@ public: __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) { baseBlockSize_ = baseBlockSize; - int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize_ * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; GetGlobalCachePtr()->InitBuffer(matrixByteSize * cacheNum); } @@ -128,5 +128,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h index e2d8d579ec8d9cab30dd32e83a321419eb5b51aa..aba48174cc96f65d90592d658e35ca5871cf2cf4 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h @@ -16,7 +16,7 @@ 
#include "../../matmul_type_def.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -92,5 +92,5 @@ __aicore__ inline constexpr CubeInBufferType GetCubeInBufferType() } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_UTILS_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_in_buffer/global_cache.h b/impl/matmul/modules/resource/cube_in_buffer/global_cache.h index 5f721412b53cc8baf91673fb6d543996d2e03a6f..ccaf0d94ff1c9e764f7768f33ea648f8c8d4ad64 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/global_cache.h +++ b/impl/matmul/modules/resource/cube_in_buffer/global_cache.h @@ -14,20 +14,20 @@ #ifndef IMPL_MATMUL_MODULES_GLOBAL_CACHE_H_ #define IMPL_MATMUL_MODULES_GLOBAL_CACHE_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { class GlobalCache; } // namespace Detail } // namespace Impl -} // namespace Gemm -__BLOCK_LOCAL__ __inline__ Gemm::Impl::Detail::GlobalCache* gL1Cache; -__aicore__ inline Gemm::Impl::Detail::GlobalCache* GetGlobalCachePtr() +} // namespace AscendC +__BLOCK_LOCAL__ __inline__ AscendC::Impl::Detail::GlobalCache* gL1Cache; +__aicore__ inline AscendC::Impl::Detail::GlobalCache* GetGlobalCachePtr() { return gL1Cache; } -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -158,5 +158,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _GLOBAL_CACHE_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h index a85f717b299076331496e128216e0287bf36cf73..62f2b2796a7e8cb86a73317319cb2151fc12a27b 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h @@ -14,7 +14,7 @@ #ifndef 
IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H #define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { enum class UNIT_FLAG_CTRL : uint8_t { @@ -43,5 +43,5 @@ struct L0cType { }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h index 2c7637adb76adcec3fe67639c50e986d945b20bb..222db563dbccc1e6c791ba2f82357f6d296900d7 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h @@ -18,7 +18,7 @@ #include "../../../matmul_utils.h" #include "lib/matmul/tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -85,5 +85,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_NO_UNIT_FLAG_H \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h index 6051bb610d004bbe7b55852bea73e9638c9321a1..f81a329575287b33fc4b471a1483a394065eaeb0 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h @@ -19,7 +19,7 @@ #include "lib/matmul/tiling.h" #include "../../feature_trait/matmul_feature_trait.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -78,5 +78,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} 
// namespace AscendC #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/copy_cube_in/async_tensor.h b/impl/matmul/modules/stage/copy_cube_in/async_tensor.h index 0625a308a7f43478bb72aee7114526b303d2e385..4f4726597ae5cbdc875852a10eb5093a96a89272 100644 --- a/impl/matmul/modules/stage/copy_cube_in/async_tensor.h +++ b/impl/matmul/modules/stage/copy_cube_in/async_tensor.h @@ -16,7 +16,7 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -64,5 +64,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif //_ASYNC_TENSOR_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h index 5226ee1340594582830ca2316912b859c7606b65..5bde8c9014a8b344d3a98fd45558b1c20887dc07 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h @@ -21,7 +21,7 @@ #include "../../../resource/cube_in_buffer/cube_in_buffer.h" #include "../copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { // Specialized Template Class of Batch Matmul CopyIn @@ -444,5 +444,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h index 0cc2bc69643c534c98af5b27d9f23299ee379a48..012b98237825e634662f8fdc6e3ae10d3c101719 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h +++ 
b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h @@ -15,10 +15,10 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ #define IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; + template class BatchCopyCubeIn @@ -90,5 +90,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _BATCH_COPY_CUBE_IN_INTF_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h index 674c28baaef914607c4e7daeab35427e6bdd8da5..a6342074ddc078b4bd455e54ffa98a393c00d4f8 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h @@ -16,7 +16,7 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -85,5 +85,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h index a8322e3cc4a723cc97192c0d6663d55997d47597..6fe09993d0a41bb8e6479ef13d24cc2cc066d397 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h @@ -21,7 +21,7 @@ #include "../../../resource/cube_in_buffer/cube_in_buffer.h" #include "../copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { 
namespace Detail { // Specialized Template Class of Batch Matmul CopyIn @@ -470,5 +470,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_USING_UB_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h index 0515326ac069f748523cfcb0ed71493ee203fa08..8ab82a4abd350d3f78fe1a9562cd4c30c4b83c4e 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h @@ -20,12 +20,10 @@ #include "../copy_cube_in_utils.h" #include "../copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; - template class BatchDataCopyWrapper { @@ -817,5 +815,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_BATCH_DATA_COPY_WRAPPER_H diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h index 72a70024fb1bf18dfe58db10b1ad15f2bcde159a..62304feef856708a96693c788db6f97baa8217ed 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h @@ -20,7 +20,7 @@ #include "data_copy_wrapper.h" #include "copy_cube_in_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -77,5 +77,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_FROM_L1_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h index 
4af9fe74d1ae1d5e300ac39c0ed9a95dcc610f9d..04d4065fc192079397d3d3bf64f430fe74be2ba2 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h @@ -14,10 +14,10 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; + /* CopyCubeIn is considered entirely experimental. We retain the freedom to make incompatible changes, but do not guarantee the stability. @@ -95,5 +95,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_INTF_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h index 0e023261d427868f74235b2b1bd1eb261a204518..25462b44222b9ac10e244cc757d33457b2cfb846 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h @@ -21,7 +21,7 @@ #include "copy_cube_in_intf.h" #include "async_tensor.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -156,5 +156,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_MDL_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h index 27024b41c584fd51d4697d8c806a84b7bf5e7397..333aa1ee44a4530ed49bc80ef075673737ecddbd 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h @@ -21,7 +21,7 @@ #include "copy_cube_in_intf.h" #include "async_tensor.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -224,5 +224,5 @@ private: }; } // namespace Detail } // 
namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_NORM_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h index 8a2bc529fd0e045c5e2a4ab0a528d70b836a3b5d..d690a9f9fd3cdb6a36bc334147edf2488b9b67e1 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h @@ -18,7 +18,7 @@ #include "../../param/matmul_shape_tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -465,5 +465,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_PARAMS_H diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h index 31fdae655f464f93e8c8f09fd0f304e2d9c3667b..3ef607960702c0d01b3bf1e77eb6ad8d531377ee 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h @@ -20,10 +20,10 @@ #include "../../matmul_param.h" #include "copy_cube_in_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; + constexpr int32_t FIRST_16BIT_OFFSET_MM_API = 16; constexpr int32_t SECOND_16BIT_OFFSET_MM_API = 32; @@ -1822,5 +1822,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_SET_UB_H diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h index 95290033ef266a78923b4e636b9fa8f0dc4a55c7..de8ed33d42bfe41d0e58b37343a372a396681c2a 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h +++ 
b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h @@ -16,7 +16,7 @@ #include "../../feature_trait/matmul_feature_trait.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { enum class CopyCubeInType : uint8_t { @@ -64,5 +64,5 @@ __aicore__ inline constexpr CopyCubeInType GetCopyCubeInType() } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_UTILS_H_ \ No newline at end of file diff --git a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h index 4e7958c6fbf4d21948b6545ed801312997bd19db..558e0e94c9e08c9d1a04a4c23a80b542173091e2 100644 --- a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h +++ b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h @@ -20,12 +20,10 @@ #include "copy_cube_in_utils.h" #include "copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; - template class DataCopyWrapper { using TransT = typename INPUT_TYPE::TRANS_T; @@ -524,5 +522,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_DATA_COPY_WRAPPER_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h index 248f6ec923454404f2fa29467a0751e3f1ff184b..90fa43c16fc5d6ec1b812be467574bf28dd3319c 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h @@ -20,7 +20,7 @@ #include "../../matmul_param.h" #include "copy_cube_out_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -1054,5 +1054,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // 
IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h index 4c0ae19e4f5c42d20757508eb8264fa4137935ce..81786a86dcab6eb4e819c9c5164355835688ea6c 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h @@ -24,7 +24,7 @@ #include "../quant/quant_processor_utils.h" #include "copy_cube_out_utils.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -228,5 +228,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_FIXPIPE_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h index e65dad088bf3653f044b76b12ac4e1ef6f07ac09..d0cf6c6a5d27f28f3e61ed06b3589adb8d7d21e2 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h @@ -17,7 +17,7 @@ #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_INTF_H #include "../../feature_trait/matmul_chip_cap.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -73,5 +73,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_INTF_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h index eca08a35ed177dee0ec2d4d4d1c7124523145a6b..1cf4c4188ae4bb3a8c203e09282e568a0548b8e4 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h @@ -16,7 +16,7 @@ #ifndef 
IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -128,5 +128,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_datacopy.h b/impl/matmul/modules/stage/quant/quant_processor_datacopy.h index 952e837e89af95dd0cbb8970cafc9af4d38aeea8..170dfe07938e28e2eca4567e372bcdd1da7b6cbe 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_datacopy.h +++ b/impl/matmul/modules/stage/quant/quant_processor_datacopy.h @@ -20,7 +20,7 @@ #include "../../matmul_param.h" #include "quant_processor_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -127,5 +127,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_DATACOPY_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h b/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h index 84e0b74e717c65eb6e7937dac50c8b82a7a076d7..1233a8b5a00f706133dcbe8a2c9b7297b44fec2c 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h +++ b/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h @@ -21,7 +21,7 @@ #include "quant_processor_intf.h" #include "quant_processor_utils.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -167,5 +167,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_FIXPIPE_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_intf.h 
b/impl/matmul/modules/stage/quant/quant_processor_intf.h index 852f32769ce5bcdc78f25f33cf9095196469595c..4a9be58898a699b042a39d0c3258b7f55cb20720 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_intf.h +++ b/impl/matmul/modules/stage/quant/quant_processor_intf.h @@ -17,7 +17,7 @@ #define IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H #include "../../feature_trait/matmul_chip_cap.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -99,5 +99,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_utils.h b/impl/matmul/modules/stage/quant/quant_processor_utils.h index 5324a1aebe56c3fdbaabfa4f27fd4b865c1a8a2c..2a831512115c1f817ab8d913cb367e3b088a2d54 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_utils.h +++ b/impl/matmul/modules/stage/quant/quant_processor_utils.h @@ -16,7 +16,7 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H #define IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -36,5 +36,5 @@ __aicore__ inline constexpr static bool IsQuantSenario() } } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H \ No newline at end of file diff --git a/impl/normalization/layernorm/layernorm_common_basic_impl.h b/impl/normalization/layernorm/layernorm_common_basic_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..bdd6e297d42340b0773cd7ec74638b793d196a62 --- /dev/null +++ b/impl/normalization/layernorm/layernorm_common_basic_impl.h @@ -0,0 +1,389 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file layernorm_common_basic_impl.h + * \brief + */ +#ifndef IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_BASIC_IMPL_H +#define IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_BASIC_IMPL_H +namespace AscendC { +constexpr uint32_t MASK_LOW_6BITS = 0x3f; +constexpr uint32_t MASK_HIGH_26BITS = 0xFFFFFFC0; + +struct LayerNormPara { + uint32_t aLength; + uint32_t rLength; + uint32_t rLengthWithPadding; +}; + +struct LayerNormConfig { + bool isNoBeta = false; + bool isNoGamma = false; + bool isOnlyOutput = false; +}; + +struct WelfordUpdateParam { + uint32_t rnLength; + uint32_t abLength; + uint32_t abComputeLength; + float nRec; +}; +struct WelfordUpdateConfig { + __aicore__ constexpr WelfordUpdateConfig(const bool isInplaceIn): isInplace(isInplaceIn) {} + bool isInplace = false; +}; + +__aicore__ constexpr LayerNormConfig GetLayerNormNormalConfig() +{ + return {.isNoBeta = false, .isNoGamma = false, .isOnlyOutput = false}; +} +constexpr LayerNormConfig LNCFG_NORM = GetLayerNormNormalConfig(); + +template +__aicore__ inline void LayerNormReduceSumImpl(const LocalTensor& dstMVTmp, const LocalTensor& dst, + const LocalTensor& src, const uint32_t bsLength, const uint32_t hLength) +{ + ResetMask(); + SetMaskNorm(); + // Contract the horizontal axis to one repeat length 64 (2^6) + constexpr uint32_t rightShiftSix = 6; + if (hLength > ONE_REPEAT_FLOAT_SIZE) { + uint32_t addRepeatTime = (hLength >> rightShiftSix) - 1; + uint32_t addTailNumber = (hLength & MASK_LOW_6BITS); 
+ if ((hLength & MASK_LOW_6BITS) == 0) { + for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { + LocalTensor dstTmp = src[i]; + LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; + Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + } + } else if (addRepeatTime > 0) { + for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { + LocalTensor dstTmp = src[i]; + LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; + LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; + Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + } + } else { + for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { + LocalTensor dstTmp = src[i]; + LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; + Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + } + } + } + + uint32_t repeatTime = bsLength; + uint32_t cursorSrc = 0; + uint32_t wholeReduceSumHLength = (hLength > ONE_REPEAT_FLOAT_SIZE) ? 
ONE_REPEAT_FLOAT_SIZE : hLength; + constexpr uint32_t rightShiftThree = 3; + const uint32_t reduceSumSrcRepeatStride = hLength >> rightShiftThree; + + while (repeatTime >= MAX_REPEAT_TIMES) { + LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; + LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; + if constexpr (isRelocate) { + WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, + MAX_REPEAT_TIMES, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); + } + WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, MAX_REPEAT_TIMES, hLength, DEFAULT_BLK_STRIDE, + reduceSumSrcRepeatStride); + PipeBarrier(); + repeatTime -= MAX_REPEAT_TIMES; + ++cursorSrc; + } + + uint32_t reduceSumSrcRepeatTimeTail = bsLength - cursorSrc * MAX_REPEAT_TIMES; + if (reduceSumSrcRepeatTimeTail > 0) { + LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; + LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; + if constexpr (isRelocate) { + WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, + reduceSumSrcRepeatTimeTail, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); + } + WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, reduceSumSrcRepeatTimeTail, hLength, + DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); + PipeBarrier(); + } + + SetMaskCount(); +} + +constexpr WelfordUpdateConfig WFUPDATE_DEFAULT_CFG = {false}; +__aicore__ inline void BroadcastLastDim(const LocalTensor& dst, const LocalTensor& src, + const uint32_t bsLength, const uint32_t hLength) +{ + SetVectorMask(0, hLength); + + SetCmpMask(src); + PipeBarrier(); + + LocalTensor maskLocal = src.ReinterpretCast(); + + const UnaryRepeatParams unaryParams; + Muls(maskLocal, maskLocal, 0, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + const BinaryRepeatParams binaryParams; + Select(dst, maskLocal, dst, 1, binaryParams); + PipeBarrier(); + + for (uint32_t i = 1; i < bsLength; i++) { + SetCmpMask(src[i * 
hLength]); + PipeBarrier(); + + Select(dst[i * hLength], maskLocal, dst, 1, binaryParams); + PipeBarrier(); + } +} + +__aicore__ inline void DuplicateMulImpl(const LocalTensor& dst, const LocalTensor& src0, + const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) +{ + const BinaryRepeatParams binaryParams; + for (uint32_t i = 0; i < bsLength; i++) { + Mul(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); + } + PipeBarrier(); +} + +__aicore__ inline void DuplicateAddImpl(const LocalTensor& dst, const LocalTensor& src0, + const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) +{ + const BinaryRepeatParams binaryParams; + for (uint32_t i = 0; i < bsLength; i++) { + Add(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); + } + PipeBarrier(); +} + +template +__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor& inputX, const LocalTensor& outputMean, + const LocalTensor& outputVariance, const LocalTensor& stackBuffer, const LayerNormTiling& tiling, + LayerNormParams& params) +{ + params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; + params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; + params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; + params.meanTmpTensor = stackBuffer[tiling.meanTmpTensorPos]; + params.varianceTmpTensor = stackBuffer[tiling.varianceTmpTensorPos]; + ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", + tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); + }); + ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tiling.tmpBufSize is (%d)", + stackBuffer.GetSize(), tiling.tmpBufSize); + }); +} + +template <> +__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor &inputX, + const LocalTensor 
&outputMean, const LocalTensor &outputVariance, + const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) +{ + params.meanTmpTensor = outputMean; + params.varianceTmpTensor = outputVariance; + + params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; + params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; + params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; + + ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", + tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); + }); + + ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), + tiling.tmpBufSize); + }); +} + +template <> +__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor &inputX, + const LocalTensor &outputMean, const LocalTensor &outputVariance, + const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) +{ + params.meanTmpTensor = outputMean; + params.varianceTmpTensor = outputVariance; + + params.tempTensorA = inputX; + params.tempTensorB = stackBuffer[tiling.firstTmpStartPos]; + params.tempTensorC = stackBuffer[tiling.secondTmpStartPos]; + + ASCENDC_ASSERT((tiling.secondTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "secondTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", + tiling.secondTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); + }); + + ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), + tiling.tmpBufSize); + }); +} + +__aicore__ inline void GetOutputMeanVariance(const LocalTensor& outputMean, + const LocalTensor& outputVariance, const LayerNormTiling& tiling, const LayerNormParams& 
params) +{ + SetVectorMask(0, tiling.meanVarSize); + + UnaryRepeatParams unaryParams; + unaryParams.dstRepStride = DEFAULT_REPEAT_STRIDE / sizeof(half); + + Cast(outputMean, params.meanTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + Cast(outputVariance, params.varianceTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + unaryParams); + PipeBarrier(); +} + +__aicore__ inline void WelfordUpdateComputeMean(const LocalTensor& tmpVreg, const LocalTensor& src, + const LocalTensor& inMean, const LocalTensor& outVreg, const LocalTensor& outMean, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams, const WelfordUpdateParam ¶) +{ + PipeBarrier(); + Sub(tmpVreg, src, inMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Muls(outVreg, tmpVreg, static_cast(para.nRec), MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + Add(outMean, outVreg, inMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); +} + +__aicore__ inline void WelfordUpdateComputeVar(const LocalTensor& tmpVreg, const LocalTensor& inVar, + const LocalTensor& outVar, const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams, + const WelfordUpdateParam ¶) +{ + PipeBarrier(); + Add(outVar, tmpVreg, inVar, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); +} + +template +__aicore__ inline constexpr uint32_t WelfordUpdateGetTmpSize() +{ + if constexpr (sizeof(T) == sizeof(half)) { + return 0x3; + } + + if constexpr (isReuseSource) { + return 1; + } + return 0x2; +} + +__aicore__ inline void GetLayerNormOutputMean(const LocalTensor& outputMean, const LocalTensor& inputX, + const LayerNormTiling& tiling, const LayerNormParams& params, const LocalTensor& tmpMean) +{ + SetVectorMask(0, tiling.bshCurLength); + + const UnaryRepeatParams unaryParams; + Muls(params.tempTensorC, inputX, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + LayerNormReduceSumImpl(tmpMean, outputMean, 
params.tempTensorC, tiling.bsCurLength, tiling.hLength); +} + +__aicore__ inline void GetLayerNormOutputVariance(const LocalTensor& outputVariance, + const LocalTensor& inputX, const LocalTensor& inputMean, const LayerNormTiling& tiling, + const LayerNormParams& params, const LocalTensor& tmpVariance) +{ + LocalTensor tempTensorA = params.tempTensorA; + LocalTensor tempTensorB = params.tempTensorB; + LocalTensor tempTensorC = params.tempTensorC; + + BroadcastLastDim(tempTensorC, inputMean, tiling.bsCurLength, tiling.hLength); + + SetVectorMask(0, tiling.bshCurLength); + + const BinaryRepeatParams binaryParams; + Sub(tempTensorB, inputX, tempTensorC, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + + Mul(tempTensorC, tempTensorB, tempTensorB, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + + const UnaryRepeatParams unaryParams; + Muls(tempTensorA, tempTensorC, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + LayerNormReduceSumImpl(tmpVariance, outputVariance, tempTensorA, tiling.bsCurLength, tiling.hLength); + PipeBarrier(); +} + +template +__aicore__ inline void WelfordUpdateInplaceCompute(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& inMean, const LocalTensor& inVar, const WelfordUpdateParam ¶, uint32_t alignNum) +{ + uint32_t inPlaceLength = AlignUp(para.abLength - para.abComputeLength, alignNum); + uint32_t dstOffset = para.abLength - inPlaceLength; + + DataCopy(outMean[dstOffset], inMean[dstOffset], inPlaceLength); + DataCopy(outVar[dstOffset], inVar[dstOffset], inPlaceLength); + PipeBarrier(); +} +__aicore__ inline void WelfordUpdateInplace(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& inMean, const LocalTensor& inVar, const WelfordUpdateParam ¶) +{ + WelfordUpdateInplaceCompute(outMean, outVar, inMean, inVar, para, B32_DATA_NUM_PER_BLOCK); +} + +__aicore__ inline void WelfordUpdateInplace(const LocalTensor& outMean, const LocalTensor& outVar, + const 
LocalTensor& inMean, const LocalTensor& inVar, const WelfordUpdateParam ¶) +{ + WelfordUpdateInplaceCompute(outMean, outVar, inMean, inVar, para, B16_DATA_NUM_PER_BLOCK); +} + +__aicore__ inline void GetLayerNormOutputPre(const LocalTensor& xSubMean, + const LocalTensor& inputVariance, const float epsilon, const LayerNormTiling& tiling, + const LayerNormParams& params) +{ + const float exponent = -0.5; + LocalTensor tempTensorA = params.tempTensorA; + LocalTensor tempTensorB = params.tempTensorB; + LocalTensor tempTensorC = params.tempTensorC; + + BroadcastLastDim(tempTensorA, inputVariance, tiling.bsCurLength, tiling.hLength); + + SetVectorMask(0, tiling.bshCurLength); + + const UnaryRepeatParams unaryParams; + Adds(tempTensorC, tempTensorA, epsilon, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + Sqrt(tempTensorA, tempTensorC, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + SetVectorMask(0, B32_DATA_NUM_PER_BLOCK); + Duplicate(tempTensorC, 1, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + + SetVectorMask(0, tiling.bshCurLength); + Div(tempTensorA, tempTensorC, tempTensorA, MASK_PLACEHOLDER, 1, + { 1, 0, 1, DEFAULT_REPEAT_STRIDE, 0, DEFAULT_REPEAT_STRIDE }); + PipeBarrier(); + + const BinaryRepeatParams binaryParams; + Mul(tempTensorC, tempTensorA, xSubMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); +} + +} // namespace AscendC +#endif // IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_BASIC_IMPL_H \ No newline at end of file diff --git a/impl/normalization/layernorm/layernorm_common_impl.h b/impl/normalization/layernorm/layernorm_common_impl.h index 87def13b24fb82d964c0d8e5d36bfef98768647f..db07d32162cd0ef6dd453c060bb5e38ab19900a8 100644 --- a/impl/normalization/layernorm/layernorm_common_impl.h +++ b/impl/normalization/layernorm/layernorm_common_impl.h @@ -18,207 +18,9 @@ #include "kernel_tensor.h" #include "kernel_pop_stack_buffer.h" #include "kernel_tiling/kernel_tiling.h" +#include 
"layernorm_common_basic_impl.h" namespace AscendC { -constexpr uint32_t MASK_LOW_6BITS = 0x3f; -constexpr uint32_t MASK_HIGH_26BITS = 0xFFFFFFC0; -template -__aicore__ inline void LayerNormReduceSumImpl(const LocalTensor& dstMVTmp, const LocalTensor& dst, - const LocalTensor& src, const uint32_t bsLength, const uint32_t hLength) -{ - ResetMask(); - SetMaskNorm(); - // Contract the horizontal axis to one repeat length 64 (2^6) - constexpr uint32_t rightShiftSix = 6; - if (hLength > ONE_REPEAT_FLOAT_SIZE) { - uint32_t addRepeatTime = (hLength >> rightShiftSix) - 1; - uint32_t addTailNumber = (hLength & MASK_LOW_6BITS); - if ((hLength & MASK_LOW_6BITS) == 0) { - for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { - LocalTensor dstTmp = src[i]; - LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; - Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - } - } else if (addRepeatTime > 0) { - for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { - LocalTensor dstTmp = src[i]; - LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; - LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; - Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - } - } else { - for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { - LocalTensor dstTmp = src[i]; - LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; - Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - } - } - } - - uint32_t repeatTime = bsLength; - uint32_t 
cursorSrc = 0; - uint32_t wholeReduceSumHLength = (hLength > ONE_REPEAT_FLOAT_SIZE) ? ONE_REPEAT_FLOAT_SIZE : hLength; - constexpr uint32_t rightShiftThree = 3; - const uint32_t reduceSumSrcRepeatStride = hLength >> rightShiftThree; - - while (repeatTime >= MAX_REPEAT_TIMES) { - LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; - LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; - if constexpr (isRelocate) { - WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, - MAX_REPEAT_TIMES, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); - } - WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, MAX_REPEAT_TIMES, hLength, DEFAULT_BLK_STRIDE, - reduceSumSrcRepeatStride); - PipeBarrier(); - repeatTime -= MAX_REPEAT_TIMES; - ++cursorSrc; - } - - uint32_t reduceSumSrcRepeatTimeTail = bsLength - cursorSrc * MAX_REPEAT_TIMES; - if (reduceSumSrcRepeatTimeTail > 0) { - LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; - LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; - if constexpr (isRelocate) { - WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, - reduceSumSrcRepeatTimeTail, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); - } - WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, reduceSumSrcRepeatTimeTail, hLength, - DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); - PipeBarrier(); - } - - SetMaskCount(); -} - -__aicore__ inline void GetLayerNormOutputMean(const LocalTensor& outputMean, const LocalTensor& inputX, - const LayerNormTiling& tiling, const LayerNormParams& params, const LocalTensor& tmpMean) -{ - SetVectorMask(0, tiling.bshCurLength); - - const UnaryRepeatParams unaryParams; - Muls(params.tempTensorC, inputX, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - LayerNormReduceSumImpl(tmpMean, outputMean, params.tempTensorC, tiling.bsCurLength, tiling.hLength); -} - -__aicore__ inline void 
BroadcastLastDim(const LocalTensor& dst, const LocalTensor& src, - const uint32_t bsLength, const uint32_t hLength) -{ - SetVectorMask(0, hLength); - - SetCmpMask(src); - PipeBarrier(); - - LocalTensor maskLocal = src.ReinterpretCast(); - - const UnaryRepeatParams unaryParams; - Muls(maskLocal, maskLocal, 0, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - const BinaryRepeatParams binaryParams; - Select(dst, maskLocal, dst, 1, binaryParams); - PipeBarrier(); - - for (uint32_t i = 1; i < bsLength; i++) { - SetCmpMask(src[i * hLength]); - PipeBarrier(); - - Select(dst[i * hLength], maskLocal, dst, 1, binaryParams); - PipeBarrier(); - } -} - -__aicore__ inline void GetLayerNormOutputVariance(const LocalTensor& outputVariance, - const LocalTensor& inputX, const LocalTensor& inputMean, const LayerNormTiling& tiling, - const LayerNormParams& params, const LocalTensor& tmpVariance) -{ - LocalTensor tempTensorA = params.tempTensorA; - LocalTensor tempTensorB = params.tempTensorB; - LocalTensor tempTensorC = params.tempTensorC; - - BroadcastLastDim(tempTensorC, inputMean, tiling.bsCurLength, tiling.hLength); - - SetVectorMask(0, tiling.bshCurLength); - - const BinaryRepeatParams binaryParams; - Sub(tempTensorB, inputX, tempTensorC, MASK_PLACEHOLDER, 1, binaryParams); - PipeBarrier(); - - Mul(tempTensorC, tempTensorB, tempTensorB, MASK_PLACEHOLDER, 1, binaryParams); - PipeBarrier(); - - const UnaryRepeatParams unaryParams; - Muls(tempTensorA, tempTensorC, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - LayerNormReduceSumImpl(tmpVariance, outputVariance, tempTensorA, tiling.bsCurLength, tiling.hLength); - PipeBarrier(); -} - -__aicore__ inline void GetLayerNormOutputPre(const LocalTensor& xSubMean, - const LocalTensor& inputVariance, const float epsilon, const LayerNormTiling& tiling, - const LayerNormParams& params) -{ - const float exponent = -0.5; - LocalTensor tempTensorA = params.tempTensorA; - LocalTensor tempTensorB = 
params.tempTensorB; - LocalTensor tempTensorC = params.tempTensorC; - - BroadcastLastDim(tempTensorA, inputVariance, tiling.bsCurLength, tiling.hLength); - - SetVectorMask(0, tiling.bshCurLength); - - const UnaryRepeatParams unaryParams; - Adds(tempTensorC, tempTensorA, epsilon, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - Sqrt(tempTensorA, tempTensorC, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - SetVectorMask(0, B32_DATA_NUM_PER_BLOCK); - Duplicate(tempTensorC, 1, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); - PipeBarrier(); - - SetVectorMask(0, tiling.bshCurLength); - Div(tempTensorA, tempTensorC, tempTensorA, MASK_PLACEHOLDER, 1, - { 1, 0, 1, DEFAULT_REPEAT_STRIDE, 0, DEFAULT_REPEAT_STRIDE }); - PipeBarrier(); - - const BinaryRepeatParams binaryParams; - Mul(tempTensorC, tempTensorA, xSubMean, MASK_PLACEHOLDER, 1, binaryParams); - PipeBarrier(); -} - -__aicore__ inline void DuplicateMulImpl(const LocalTensor& dst, const LocalTensor& src0, - const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) -{ - const BinaryRepeatParams binaryParams; - for (uint32_t i = 0; i < bsLength; i++) { - Mul(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); - } - PipeBarrier(); -} - -__aicore__ inline void DuplicateAddImpl(const LocalTensor& dst, const LocalTensor& src0, - const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) -{ - const BinaryRepeatParams binaryParams; - for (uint32_t i = 0; i < bsLength; i++) { - Add(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); - } - PipeBarrier(); -} template __aicore__ inline void GetLayerNormOutput(const LocalTensor& output, const LocalTensor& inputY, @@ -323,88 +125,6 @@ __aicore__ inline void LayerNormExe(const LocalTensor& inputX, con GetLayerNormOutput(output, tempTensorC, gamma, beta, tiling, params); } -template -__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor& inputX, const 
LocalTensor& outputMean, - const LocalTensor& outputVariance, const LocalTensor& stackBuffer, const LayerNormTiling& tiling, - LayerNormParams& params) -{ - params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; - params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; - params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; - params.meanTmpTensor = stackBuffer[tiling.meanTmpTensorPos]; - params.varianceTmpTensor = stackBuffer[tiling.varianceTmpTensorPos]; - ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", - tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); - }); - ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tiling.tmpBufSize is (%d)", - stackBuffer.GetSize(), tiling.tmpBufSize); - }); -} - -template <> -__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor &inputX, - const LocalTensor &outputMean, const LocalTensor &outputVariance, - const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) -{ - params.meanTmpTensor = outputMean; - params.varianceTmpTensor = outputVariance; - - params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; - params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; - params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; - - ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", - tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); - }); - - ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), - tiling.tmpBufSize); - }); -} - -template <> -__aicore__ inline void GetLayerNormNDTensorInfo(const 
LocalTensor &inputX, - const LocalTensor &outputMean, const LocalTensor &outputVariance, - const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) -{ - params.meanTmpTensor = outputMean; - params.varianceTmpTensor = outputVariance; - - params.tempTensorA = inputX; - params.tempTensorB = stackBuffer[tiling.firstTmpStartPos]; - params.tempTensorC = stackBuffer[tiling.secondTmpStartPos]; - - ASCENDC_ASSERT((tiling.secondTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "secondTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", - tiling.secondTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); - }); - - ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), - tiling.tmpBufSize); - }); -} - -__aicore__ inline void GetOutputMeanVariance(const LocalTensor& outputMean, - const LocalTensor& outputVariance, const LayerNormTiling& tiling, const LayerNormParams& params) -{ - SetVectorMask(0, tiling.meanVarSize); - - UnaryRepeatParams unaryParams; - unaryParams.dstRepStride = DEFAULT_REPEAT_STRIDE / sizeof(half); - - Cast(outputMean, params.meanTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - Cast(outputVariance, params.varianceTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, - unaryParams); - PipeBarrier(); -} - template __aicore__ inline void LayerNormND(const LocalTensor& inputX, const LocalTensor& gamma, const LocalTensor& beta, const LocalTensor& output, const LocalTensor& outputMean, @@ -477,5 +197,180 @@ __aicore__ inline void LayerNormImpl(const LocalTensor& output, const LocalTe LayerNormImpl(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon, tiling); } + +template +__aicore__ inline void WelfordUpdateCompute(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const 
LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + LocalTensor srcVreg = sharedTmpBuffer.ReinterpretCast(); + uint32_t tmpIndex = B32_DATA_NUM_PER_REPEAT * tmpNum; + LocalTensor tmpVreg = srcVreg[tmpIndex]; + LocalTensor outVreg = srcVreg[tmpIndex + tmpIndex]; + + PipeBarrier(); + Cast(srcVreg, src, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); + + WelfordUpdateComputeMean(tmpVreg, srcVreg, inMean, outVreg, outMean, unaryParams, binaryParams, para); + + Sub(outVreg, srcVreg, outMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Mul(tmpVreg, tmpVreg, outVreg, MASK_PLACEHOLDER, 1, binaryParams); + + WelfordUpdateComputeVar(tmpVreg, inVar, outVar, unaryParams, binaryParams, para); +} + +__aicore__ inline void WelfordUpdateComputeTo32Res(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + LocalTensor tmpVreg = sharedTmpBuffer.ReinterpretCast(); + + WelfordUpdateComputeMean(tmpVreg, src, inMean, tmpVreg, outMean, unaryParams, binaryParams, para); + + Sub(tmpVreg, src, outMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Sub(src, src, inMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Mul(tmpVreg, tmpVreg, src, MASK_PLACEHOLDER, 1, binaryParams); + + WelfordUpdateComputeVar(tmpVreg, inVar, outVar, unaryParams, binaryParams, para); +} + +__aicore__ inline void WelfordUpdateComputeTo32(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const 
WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + LocalTensor tmpVreg = sharedTmpBuffer.ReinterpretCast(); + LocalTensor outVreg = tmpVreg[B32_DATA_NUM_PER_REPEAT * tmpNum]; + + WelfordUpdateComputeMean(tmpVreg, src, inMean, outVreg, outMean, unaryParams, binaryParams, para); + + Sub(outVreg, src, outMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Mul(tmpVreg, tmpVreg, outVreg, MASK_PLACEHOLDER, 1, binaryParams); + + WelfordUpdateComputeVar(tmpVreg, inVar, outVar, unaryParams, binaryParams, para); +} + +template +__aicore__ inline void WelfordUpdateCompute(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + if (isReuseSource) { + WelfordUpdateComputeTo32Res(outMean, outVar, src, inMean, inVar, sharedTmpBuffer, para, tmpNum, unaryParams, + binaryParams); + } else { + WelfordUpdateComputeTo32(outMean, outVar, src, inMean, inVar, sharedTmpBuffer, para, tmpNum, unaryParams, + binaryParams); + } +} + +template +__aicore__ inline void WelfordUpdateComputeImpl(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶) +{ + constexpr uint32_t tmpBufNum = WelfordUpdateGetTmpSize(); + + uint32_t tmpNum = sharedTmpBuffer.GetSize() / (ONE_REPEAT_BYTE_SIZE * tmpBufNum); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT((tmpNum != 0), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check the size of sharedTmpBuffer, the size of sharedTmpBuffer is %dB, it is smaller.", + sharedTmpBuffer.GetSize()); + }); +#endif + + const uint32_t round = para.abComputeLength / (B32_DATA_NUM_PER_REPEAT * tmpNum); + const uint32_t tail 
= para.abComputeLength % (B32_DATA_NUM_PER_REPEAT * tmpNum); + + SetVectorMask(0, B32_DATA_NUM_PER_REPEAT * tmpNum); + uint32_t offset = 0; + + const UnaryRepeatParams unaryParams; + const BinaryRepeatParams binaryParams; + + for (uint32_t i = 0; i < round; ++i) { + WelfordUpdateCompute(outMean[offset], outVar[offset], src[offset], inMean[offset], + inVar[offset], sharedTmpBuffer, para, tmpNum, unaryParams, binaryParams); + offset = offset + B32_DATA_NUM_PER_REPEAT * tmpNum; + } + + if (tail != 0) { + SetVectorMask(0, tail); + WelfordUpdateCompute(outMean[offset], outVar[offset], src[offset], inMean[offset], + inVar[offset], sharedTmpBuffer, para, tmpNum, unaryParams, binaryParams); + } +} + +template +__aicore__ inline void WelfordUpdateImpl(const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputMean, const LocalTensor& inputVariance, const LocalTensor& inputX, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam& para) +{ + static_assert((std::is_same::value || std::is_same::value), + "Failed to check dtype of inputX, inputX support dtype is: half/float."); + static_assert((std::is_same::value), + "Failed to check dtype of mean/var, mean/var support dtype is: float."); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT(((QuePosition)inputX.GetPosition() == TPosition::VECIN || + (QuePosition)inputX.GetPosition() == TPosition::VECOUT || + (QuePosition)inputX.GetPosition() == TPosition::VECCALC), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check dtype of input position, support position is VECIN, VECOUT, VECCALC."); + }); + ASCENDC_ASSERT((para.abLength <= inputX.GetSize()), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abLength, current size is %u, which should not larger than inputX size %u.", + para.abLength, inputX.GetSize()); + }); + ASCENDC_ASSERT((para.abComputeLength <= para.abLength), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abComputeLength, current size is %u, which should not larger than abLength size 
%u.", + para.abComputeLength, para.abLength); + }); + ASCENDC_ASSERT((para.abComputeLength > 0), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abComputeLength, para.abComputeLength should be greater than 0.", + para.abComputeLength, para.abLength); + }); + ASCENDC_ASSERT((para.rnLength == 1), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.rnLength, rnLength is %u, which should is 1.", para.rnLength); + }); + ASCENDC_ASSERT((para.abLength % (ONE_BLK_SIZE / sizeof(T)) == 0), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abLength, para.abLength should be 32B aligned."); + }); +#endif + SetMaskCount(); + if (config.isInplace && (para.abComputeLength < para.abLength)) { + WelfordUpdateInplace(outputMean, outputVariance, inputMean, inputVariance, para); + } + WelfordUpdateComputeImpl(outputMean, outputVariance, inputX, inputMean, inputVariance, + sharedTmpBuffer, para); + SetMaskNorm(); + ResetMask(); +} + +template +__aicore__ inline void WelfordUpdateImpl(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& inMean, const LocalTensor& inVar, const LocalTensor& srcUb, + const WelfordUpdateParam& para) +{ + LocalTensor stackTensor; + bool ans = PopStackBuffer(stackTensor); + ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); + + WelfordUpdateImpl(outMean, outVar, inMean, inVar, srcUb, stackTensor, para); +} + } // namespace AscendC #endif // IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_IMPL_H \ No newline at end of file diff --git a/impl/normalization/layernorm/layernorm_tiling_impl.cpp b/impl/normalization/layernorm/layernorm_tiling_impl.cpp index 47dc41644012d164b7e5cf6c0956bdd73d791dfe..f0d93b335e7ac4fa78ba578f4f8c48034ac7ecbc 100644 --- a/impl/normalization/layernorm/layernorm_tiling_impl.cpp +++ b/impl/normalization/layernorm/layernorm_tiling_impl.cpp @@ -24,6 +24,9 @@ constexpr uint32_t LAYERNORM_ONE_BLK_SHIFT_AMOUNT = 5; constexpr uint32_t LAYERNORM_ONE_NUMBER = 1; constexpr uint32_t 
LAYERNOR_ZERO_NUMBER = 0; constexpr float LAYERNOR_LAST_DIM_INIT_VALUE = 1.0; +constexpr uint32_t WEL_UP_REP_SIZE = 256; +constexpr uint32_t WEL_UP_FLOAT_SIZE = 256 / sizeof(float); +constexpr uint32_t SHAPE_DIM = 2; uint32_t GetLayerNormMaxTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource) { @@ -166,4 +169,27 @@ void GetLayerNormNDTillingInfo(const ge::Shape& srcShape, const uint32_t stackBu tilling.set_bsCurLength(bsCurLength); tilling.set_lastDimValueBack(lastDimValueBack); } + +void GetWelfordUpdateMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSizeT, const uint32_t typeSizeU, + const bool isReuseSource, const bool isInplace, uint32_t& maxValue, uint32_t& minValue) +{ + (void)isInplace; + (void)typeSizeU; + + std::vector shapeDims = srcShape.GetDims(); + ASCENDC_HOST_ASSERT(shapeDims.size() == SHAPE_DIM, return, "srcShape dims must be 2."); + + const uint32_t rnLength = static_cast(shapeDims[0]); + const uint32_t abLength = static_cast(shapeDims[1]); + + if (typeSizeT == sizeof(uint16_t)) { + minValue = 0x3 * WEL_UP_REP_SIZE; // dispense 3 buffers + } else if (isReuseSource) { + minValue = 1 * WEL_UP_REP_SIZE; // dispense 1 buffer + } else { + minValue = 0x2 * WEL_UP_REP_SIZE; // dispense 2 buffers + } + maxValue = (rnLength * abLength + WEL_UP_FLOAT_SIZE - 1) / WEL_UP_FLOAT_SIZE * minValue; +} + } // namespace AscendC \ No newline at end of file diff --git a/impl/utils/init_global_memory/init_global_memory_v200_impl.h b/impl/utils/init_global_memory/init_global_memory_v200_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..93ca1ddb5cd8cef61c6425a6fa546eeda31a1907 --- /dev/null +++ b/impl/utils/init_global_memory/init_global_memory_v200_impl.h @@ -0,0 +1,83 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file init_global_memory_v200_impl.h + * \brief + */ +#ifndef IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V200_IMPL_H +#define IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V200_IMPL_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" + +namespace AscendC { +template +__aicore__ inline void InitGlobalMemoryImpl(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + if ASCEND_IS_AIC { + return; + } + LocalTensor popBuffer; + constexpr uint32_t MAX_REPEAT_LEN = 256; + bool ret = PopStackBuffer(popBuffer); + ASCENDC_ASSERT(ret, { KERNEL_LOG(KERNEL_ERROR, "No space left to allocate in Unified Buffer"); }); + constexpr uint32_t maxBurstSize = (MAX_REPEAT_TIMES * MAX_REPEAT_LEN) / sizeof(T); + const uint32_t popSize = popBuffer.GetSize() >= maxBurstSize ? maxBurstSize : popBuffer.GetSize(); + const uint32_t round = size / popSize; + const uint32_t tail = size % popSize; + const uint32_t roundSize = round != 0 ? 
popSize : 0; + Duplicate(popBuffer, value, popSize); + event_t eventIDVToMTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMTE3); + WaitFlag(eventIDVToMTE3); + struct DataCopyParams repeatParams; + repeatParams.blockCount = 1; + uint32_t comOffset = 0; + // compute the main block + if ((roundSize * sizeof(T)) % ONE_BLK_SIZE == 0) { + repeatParams.blockLen = static_cast(roundSize * sizeof(T)) / ONE_BLK_SIZE; + for (uint32_t index = 0; index < round; ++index) { + DataCopy(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + comOffset += roundSize; + } + } else { + const uint32_t roundSizeExtra = roundSize * sizeof(T) % ONE_BLK_SIZE; + const uint32_t roundSizeAlign = roundSize * sizeof(T) - roundSizeExtra; + repeatParams.blockLen = static_cast(roundSizeAlign) / ONE_BLK_SIZE; + for (uint32_t index = 0; index < round; ++index) { + DataCopy(gmWorkspaceAddr[comOffset],popBuffer, repeatParams); + comOffset += roundSize; + for (uint64_t i = comOffset - roundSizeExtra / sizeof(T); i < comOffset; ++i) { + gmWorkspaceAddr.SetValue(i, value); + } + } + } + // compute the tail block + if (tail != 0) { + if ((tail * sizeof(T)) % ONE_BLK_SIZE == 0) { + repeatParams.blockLen = static_cast(tail * sizeof(T)) / ONE_BLK_SIZE; + comOffset = round * roundSize; + DataCopy(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + } else { + const uint32_t tailExtra = tail * sizeof(T) % ONE_BLK_SIZE; + const uint32_t tailAlign = tail * sizeof(T) - tailExtra; + repeatParams.blockLen = static_cast(tailAlign) / ONE_BLK_SIZE; + comOffset = round * roundSize; + DataCopy(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + for (uint64_t i = comOffset + tailAlign / sizeof(T); i < size; ++i) { + gmWorkspaceAddr.SetValue(i, value); + } + } + } + PipeBarrier(); +} +} // namespace AscendC +#endif // IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V200_IMPL_H diff --git a/impl/utils/init_global_memory/init_global_memory_v220_impl.h 
b/impl/utils/init_global_memory/init_global_memory_v220_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..9035a4c12cd9297b8e18048e5adf7e9064c9c15c --- /dev/null +++ b/impl/utils/init_global_memory/init_global_memory_v220_impl.h @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file init_global_memory_v220_impl.h + * \brief + */ +#ifndef IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V220_IMPL_H +#define IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V220_IMPL_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" + +namespace AscendC { +template +__aicore__ inline void InitGlobalMemoryImpl(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + if ASCEND_IS_AIC { + return; + } + LocalTensor popBuffer; + constexpr uint32_t MAX_REPEAT_LEN = 256; + bool ret = PopStackBuffer(popBuffer); + ASCENDC_ASSERT(ret, { KERNEL_LOG(KERNEL_ERROR, "No space left to allocate in Unified Buffer"); }); + constexpr uint32_t maxBurstSize = (MAX_REPEAT_TIMES * MAX_REPEAT_LEN) / sizeof(T); + const uint32_t popSize = popBuffer.GetSize() >= maxBurstSize ? maxBurstSize : popBuffer.GetSize(); + const uint32_t round = size / popSize; + const uint32_t tail = size % popSize; + const uint32_t roundSize = round != 0 ? 
popSize : 0; + Duplicate(popBuffer, value, popSize); + event_t eventIDVToMTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMTE3); + WaitFlag(eventIDVToMTE3); + struct DataCopyExtParams repeatParams; + repeatParams.blockCount = 1; + uint32_t comOffset = 0; + // compute the main block + repeatParams.blockLen = static_cast(roundSize * sizeof(T)); + for (uint32_t index = 0; index < round; ++index) { + DataCopyPad(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + comOffset += roundSize; + } + // compute the tail block + repeatParams.blockLen = static_cast(tail * sizeof(T)); + if (tail != 0) { + comOffset = round * roundSize; + DataCopyPad(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + } + PipeBarrier(); +} +} // namespace AscendC +#endif // IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V220_IMPL_H diff --git a/lib/matmul/constant_tiling.h b/lib/matmul/constant_tiling.h index 4ddf535bc886f75a65dbaacc69c775a1d3927ba9..f4d9754a3de115baf879906882bcb755ce623f34 100644 --- a/lib/matmul/constant_tiling.h +++ b/lib/matmul/constant_tiling.h @@ -17,9 +17,9 @@ #include "../../impl/matmul/matmul_constant_tiling_impl.h" -namespace Gemm { +namespace AscendC { template -__aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig &mmCFG, int32_t l1Size = L1_SIZE) +__aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig &mmCFG, int32_t l1Size = Impl::L1_SIZE) { MatmulApiStaticTiling tiling; tiling.cfg = mmCFG; @@ -69,5 +69,5 @@ __aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig tiling.shareUbSize = 0; return tiling; } -} // namespace matmul +} // namespace AscendC #endif // LIB_MATMUL_CONSTANT_TILING_H \ No newline at end of file diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index f1884fb234eafd8ae9789f00dff0c64bd983c3fa..0b8aaa043d610ccf7f2e13421967cd4c68a2ac26 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -20,8 +20,8 @@ #include 
"lib/matmul/constant_tiling.h" #include "../../impl/matmul/matmul_call_back.h" -namespace Gemm { -using namespace AscendC; +namespace AscendC { + template struct MatmulApiConfig { @@ -114,8 +114,8 @@ public: using CallBack = MM_CB; }; -} // namespace Gemm +} // namespace AscendC // Compatible with the previously used matmul namespace -namespace matmul = Gemm; +namespace matmul = AscendC; #include "../../impl/matmul/matmul_impl.h" #endif diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h index 9da6e34ae5592b50c8ebde06e4298ddd9f8bfed8..dd474ffc613605a1143bda45eb94532297518f5b 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -26,8 +26,8 @@ #include "../../impl/matmul/matmul_server.h" #endif -namespace Gemm { -using namespace AscendC; +namespace AscendC { + constexpr int32_t VECTOR_QUANT_MODE = 2; @@ -1216,5 +1216,5 @@ class MatmulClient, MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulClient; -} #endif #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} #endif #else #ifdef __DAV_C220_CUBE__ #ifdef ASCENDC_CUBE_ONLY -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulServiceAux; -} #endif #elif defined(__DAV_C220_VEC__) -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulClient; -} #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} //namespace Gemm #endif #endif +} //namespace AscendC #endif \ No newline at end of file diff --git a/lib/normalization/layernorm.h b/lib/normalization/layernorm.h index 
ced532ca3538eda802da930b4e8dc23f16fb0702..c234a73ea6d4e2e787d7e885958954e5410ca413 100644 --- a/lib/normalization/layernorm.h +++ b/lib/normalization/layernorm.h @@ -67,6 +67,55 @@ __aicore__ inline void LayerNorm(const LocalTensor& output, const LocalTensor { LayerNormImpl(output, outputMean, outputVariance, inputX, gamma, beta, epsilon, tiling); } + +/*! + * \brief Calculate the mean and variance for each time using the Welford algorithm. + * + * \note support data type: T(half and float)、U(float) + * + * \param [out] outputMean, output LocalTensor, shape is [A, R] + * \param [out] outputVariance, output LocalTensor, shape is [A, R] + * \param [in] inputMean, input LocalTensor, shape is [A, R] + * \param [in] inputVariance, input LocalTensor, shape is [A, R] + * \param [in] inputX, input LocalTensor, shape is [A, R] + * \param [in] para, para detailed information about the original data shape + */ +template +__aicore__ inline void WelfordUpdate(const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputMean, const LocalTensor& inputVariance, const LocalTensor& inputX, + const WelfordUpdateParam& para) +{ + if ASCEND_IS_AIC { + return; + } + WelfordUpdateImpl(outputMean, outputVariance, inputMean, inputVariance, inputX, para); +} + +/*! + * \brief Calculate the mean and variance for each time using the Welford algorithm. 
+ * + * \note support data type: T(half and float)、U(float) + * + * \param [out] outputMean, output LocalTensor, shape is [A, R] + * \param [out] outputVariance, output LocalTensor, shape is [A, R] + * \param [in] inputMean, input LocalTensor, shape is [A, R] + * \param [in] inputVariance, input LocalTensor, shape is [A, R] + * \param [in] inputX, input LocalTensor, shape is [A, R] + * \param [in] sharedTmpBuffer, input local temporary Tensor + * \param [in] para, para detailed information about the original data shape + */ +template +__aicore__ inline void WelfordUpdate(const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputMean, const LocalTensor& inputVariance, const LocalTensor& inputX, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam& para) +{ + if ASCEND_IS_AIC { + return; + } + WelfordUpdateImpl(outputMean, outputVariance, inputMean, inputVariance, inputX, + sharedTmpBuffer, para); +} + #pragma end_pipe } // namespace AscendC #endif // LIB_NORMALIZATION_LAYERNORM_H \ No newline at end of file diff --git a/lib/normalization/layernorm_tiling.h b/lib/normalization/layernorm_tiling.h index a82fb2c4aef5bee3aaf39573b5e6845557d2dce2..4d8f086cd4fc4b42bad0a85bcebb13efc427e412 100644 --- a/lib/normalization/layernorm_tiling.h +++ b/lib/normalization/layernorm_tiling.h @@ -22,5 +22,18 @@ void GetLayerNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSiz void GetLayerNormNDTillingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, const bool isReuseSource, optiling::LayerNormTiling& tilling); + +/*! + * \brief calculate max and min tmp buffer size for WelfordUpdate interface. + * \param [in] srcShape: input shape + * \param [in] typeSizeU: data type size: sizeof(U) + * \param [in] typeSizeT: data type size: sizeof(T) + * \param [in] isReuseSource: indicate whether to reuse source tensor. Reserved paramater. 
+ * \param [in] isInplace: indicate whether outputs that are not calculated are multiplexed inputs. + * \param [out] maxValue: max size required for tmp buffer + * \param [out] minValue: min size required for tmp buffer + */ +void GetWelfordUpdateMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSizeT, const uint32_t typeSizeU, + const bool isReuseSource, const bool isInplace, uint32_t& maxValue, uint32_t& minValue); } #endif // LIB_NORMALIZATION_LAYERNORM_TILING_H diff --git a/lib/utils/init_global_memory.h b/lib/utils/init_global_memory.h new file mode 100644 index 0000000000000000000000000000000000000000..e64b08f72083c1f26ab6c024acf5ee11722ff382 --- /dev/null +++ b/lib/utils/init_global_memory.h @@ -0,0 +1,51 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file init_global_memory.h + * \brief + */ +#ifndef LIB_UTILS_INIT_GLOBAL_MEMORY_H +#define LIB_UTILS_INIT_GLOBAL_MEMORY_H + +#if __CCE_AICORE__ == 200 +#include "../../impl/utils/init_global_memory/init_global_memory_v200_impl.h" +#elif __CCE_AICORE__ == 220 +#include "../../impl/utils/init_global_memory/init_global_memory_v220_impl.h" +#endif + +namespace AscendC { +/* ! + * \brief This function realizes the clear global memory function. 
+ * + * \note support data type: uint16_t, int16_t, half, float, uint32_t, int32_t + * + * \param [out] GlobalTensor + * \param [in] size, size of space to be initialized + * \param [in] value, value to be initialized in global memory + */ +#if __CCE_AICORE__ == 200 +template +__aicore__ inline __in_pipe__(V) + __out_pipe__(MTE3, S) void InitGlobalMemory(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + InitGlobalMemoryImpl(gmWorkspaceAddr, size, value); +} + +#elif __CCE_AICORE__ == 220 +template +__aicore__ inline __in_pipe__(V) + __out_pipe__(MTE3) void InitGlobalMemory(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + InitGlobalMemoryImpl(gmWorkspaceAddr, size, value); +} +#endif +} // namespace AscendC +#endif // LIB_UTILS_INIT_GLOBAL_MEMORY_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e02da30f529c357856fc504d3fbae661aadcbb76..7823902ac9f8a209ab95882735563ac57a6b904d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -62,6 +62,7 @@ file(GLOB ASCENDC_TEST_ascend310p_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp ${ASCENDC_TESTS_DIR}/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp ${ASCENDC_TESTS_DIR}/matmul/test_operator_matmul_v200.cpp + ${ASCENDC_TESTS_DIR}/normalization/welfordupdate/test_operator_welfordupdate.cpp ) # ascend910B1 aiv test cases @@ -82,6 +83,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES # ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgrad.cpp ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgradbeta.cpp ${ASCENDC_TESTS_DIR}/normalization/rmsnorm/test_operator_rmsnorm.cpp + ${ASCENDC_TESTS_DIR}/normalization/welfordupdate/test_operator_welfordupdate.cpp ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_antiquant_scalar.cpp ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp 
${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_antiquant_weight.cpp @@ -90,6 +92,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/quantization/quant/test_operator_quant.cpp ${ASCENDC_TESTS_DIR}/quantization/quant/test_operator_quant_per_channel.cpp ${ASCENDC_TESTS_DIR}/sort/topk/test_operator_topk.cpp + ${ASCENDC_TESTS_DIR}/utils/init_global_memory/test_operator_init_global_memory.cpp ) # ascend910B1 aic test cases diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp index b89a24bd962154a840dbff3c15ba46ff6c48ed00..924ac98106b928231d4b0b233d54bc4828bf5f69 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp index 51f5eae6013bb53ce02d02321ad3f00022757cb9..1ff4ef02ca097134997742eaa6e3195d5524e03a 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp index 6d8184993173f7fdb876fdecceabb6d53426863d..428d5654630b660acf0c3e295bb1337e817c45d2 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp +++ 
b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp index baa79f58697960e9e4348bbf38f773276bb658d2..a1325f506f8b6bdc7decc70fadbf000c65dbda8b 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp index ded109cf459d4630491b83835eb461a0f5fca20b..ad72f0f2caa418e4bbec000a0389fa5b168c946b 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp index e18e5e1ccf7cf70ec406ad4babba66ceba575ea6..186a1c6f65e2df151d71ef8e5128aae3a9e8b9ab 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp 
b/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp index 6b07bf9b71c8694b0584e9c30154dba41d55b1b6..df32c7d21b28ecbb51d67a29e541ce8a84e0125b 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp index ecf91f4fe74940f6686a78eca334b089740bf84a..af22f4f71153bcbad756461dbc4057a9cf32bab8 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp index 7af390d520e16d761b304fbf64fd6b20ce2c05f4..2d9b7d7a17c2f36e1c24bc16961cfdcdb108acd9 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/test_matmul_channel_split.cpp b/tests/matmul/test_matmul_channel_split.cpp index 64ee080ae61248d0649d8cb1dbf5559bd28d2208..f002feae29637ed2c7287d2bd1cdefb080372afe 100644 --- a/tests/matmul/test_matmul_channel_split.cpp +++ b/tests/matmul/test_matmul_channel_split.cpp @@ -190,7 +190,7 @@ __aicore__ inline void main_kernel_matmul_channel_split(GM_ADDR aGM, GM_ADDR bGM TQue qidA1; TQue qidB1; - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; mm.SetSubBlockIdx(0); mm.Init(&tiling, &que); @@ -268,10 +268,10 @@ protected: A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, 
isTransposeB, enSequentialWrite) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling##_##enOuter##_##enOrderM \ { \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; \ constexpr static MatmulFuncParams mFuncParams{false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::INNER_PRODUCT, true, false, false, true}; \ constexpr static MatmulConfig MM_CFG = GetMMConfig(mFuncParams); \ diff --git a/tests/matmul/test_matmul_config.cpp b/tests/matmul/test_matmul_config.cpp index 8b1990fc4eaf32b14c56344792d57100fb9cb69d..b37b09db8679e35f393b75bea8c1dec0fa552fa6 100644 --- a/tests/matmul/test_matmul_config.cpp +++ b/tests/matmul/test_matmul_config.cpp @@ -11,7 +11,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + class TestMatmulConfig : public testing::Test { protected: diff --git a/tests/matmul/test_matmul_iterate_controller.cpp b/tests/matmul/test_matmul_iterate_controller.cpp index 93905e2403d1393501ae793d44d70b6276b40c2b..6692290850150b47eb2a12849b4fe810f8d42c0b 100644 --- a/tests/matmul/test_matmul_iterate_controller.cpp +++ b/tests/matmul/test_matmul_iterate_controller.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + using A_TYPE = MatmulType; using B_TYPE = MatmulType; diff --git a/tests/matmul/test_matmul_l0c_buffer.cpp b/tests/matmul/test_matmul_l0c_buffer.cpp index 
d7ce8817e9085277574235c98abc31e004bb488b..f5415b8966f8cfb27432afe31e4c8b42e416845b 100644 --- a/tests/matmul/test_matmul_l0c_buffer.cpp +++ b/tests/matmul/test_matmul_l0c_buffer.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { diff --git a/tests/matmul/test_matmul_l0db.cpp b/tests/matmul/test_matmul_l0db.cpp index 741b6db5278eb4b86057c3ef49b1e5014a046ec9..d53bbd5ecde4cdb5f0951ce476bd7775ea0f804a 100644 --- a/tests/matmul/test_matmul_l0db.cpp +++ b/tests/matmul/test_matmul_l0db.cpp @@ -199,8 +199,8 @@ __aicore__ inline void main_kernel_matmul_l0db(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR TQue qidA1; TQue qidB1; - Gemm::MatmulImpl mm1; - Gemm::MatmulImpl mm2; + AscendC::MatmulImpl mm1; + AscendC::MatmulImpl mm2; mm1.SetSubBlockIdx(0); mm1.Init(&tiling, &que); mm2.SetSubBlockIdx(0); @@ -235,10 +235,10 @@ protected: #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL_L0DB, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, enSequentialWrite) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##enSequentialWrite{ \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM;\ constexpr static MatmulFuncParams dbFuncParams{false, false, false, false, 0, IterateOrder::UNDEF, ScheduleType::INNER_PRODUCT, true, true, false, true};\ constexpr static MatmulConfig CFG_NORM_DB = GetMMConfig(dbFuncParams);\ diff --git 
a/tests/matmul/test_matmul_shape_info.cpp b/tests/matmul/test_matmul_shape_info.cpp index b53e9395059f1daf8a608e605a500f0e811c780a..b2b609225dddeaf47fd7fec1fdb22db453addf22 100644 --- a/tests/matmul/test_matmul_shape_info.cpp +++ b/tests/matmul/test_matmul_shape_info.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template qidA1; TQue qidB1; - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; mm.SetSubBlockIdx(0); mm.Init(&tiling, &que); @@ -337,12 +337,12 @@ protected: #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling{ \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfig mmCFG = CFG_Mode; \ - constexpr static MatmulApiStaticTiling mmTiling = Gemm::GetMatmulApiTiling(mmCFG); \ + constexpr static MatmulApiStaticTiling mmTiling = AscendC::GetMatmulApiTiling(mmCFG); \ TEST_F(TEST_KERNEL_MATMUL, Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling) \ { \ const int32_t left_data_size = tilingParams.M_ * tilingParams.K_; \ diff --git 
a/tests/matmul/test_operator_matmul_v220.cpp b/tests/matmul/test_operator_matmul_v220.cpp index b3a83d4182d651f7d0d05bb8c6b10799097579ff..84f8a92334b2406bd8b51dc423da513774927ad1 100644 --- a/tests/matmul/test_operator_matmul_v220.cpp +++ b/tests/matmul/test_operator_matmul_v220.cpp @@ -201,7 +201,7 @@ __aicore__ inline void main_kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, TQue qidA1; TQue qidB1; - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; mm.SetSubBlockIdx(0); mm.Init(&tiling, &que); @@ -375,12 +375,12 @@ protected: #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling, enOuter, enOrderM) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling##_##enOuter##_##enOrderM{ \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfig mmCFG = CFG_Mode; \ - constexpr static MatmulApiStaticTiling mmTiling = Gemm::GetMatmulApiTiling(mmCFG); \ + constexpr static MatmulApiStaticTiling mmTiling = AscendC::GetMatmulApiTiling(mmCFG); \ constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM;\ constexpr static MatmulFuncParams mFuncParams{false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::OUTER_PRODUCT};\ constexpr static MatmulConfig normOuterM = GetMMConfig(mFuncParams);\ diff --git a/tests/matmul/test_operator_matmul_v300.cpp 
b/tests/matmul/test_operator_matmul_v300.cpp index 19587e381dcafb0903e24474a2af42c3e44e8590..e61aae495541944429395677b4cc206e0a8d98cf 100644 --- a/tests/matmul/test_operator_matmul_v300.cpp +++ b/tests/matmul/test_operator_matmul_v300.cpp @@ -229,7 +229,7 @@ __aicore__ inline void kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, GM_A set_atomic_none(); - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; if constexpr(mmMatmul) { REGIST_MATMUL_OBJ(&que, GetSysWorkSpacePtr(), mm); mm.Init(&tiling); @@ -309,10 +309,10 @@ TilingParams g_tilingParams = { 1, 16, 32, 32, 16, 32, 32, 16, 32, 32, 1, 1, 1, TEST_F(TEST_KERNEL_MATMUL, \ Kernel_Matmul_Case##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##MM_Matmul) \ { \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ TilingParams tilingParam = tilingParams; \ const int32_t left_data_size = tilingParam.M_ * tilingParam.K_; \ const int32_t right_data_size = tilingParam.K_ * tilingParam.N_; \ diff --git a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7b7d2b9646af5d63e2691bf298faf25a40b7a6a --- /dev/null +++ b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp @@ -0,0 +1,245 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#define private public +#define protect public +#include "kernel_operator.h" +#include +using namespace std; +using namespace AscendC; + +constexpr uint32_t WEL_UP_BLOCK_SIZE = 32; +constexpr WelfordUpdateConfig WELFORD_UPDATE_ENABLE_INPLACE_CFG = {true}; +constexpr WelfordUpdateConfig WELFORD_UPDATE_UNENABLE_INPLACE_CFG = {false}; +namespace TEST_CASE { +constexpr uint32_t WEL_UP_REP_SIZE = 256; +constexpr uint32_t WEL_UP_FLOAT_SIZE = 256 / sizeof(float); + +bool GetWelfordUpdateMaxMinTmpSize(const int32_t rnLength, const int32_t abLength, const uint32_t typeSizeT, + const uint32_t typeSizeU, const bool isReuseSource, const bool isInplace, uint32_t& maxValue, uint32_t& minValue) +{ + if (typeSizeT == sizeof(uint16_t)) { + minValue = 0x3 * WEL_UP_REP_SIZE; + } else if (isReuseSource) { + minValue = 1 * WEL_UP_REP_SIZE; + } else { + minValue = 0x2 * WEL_UP_REP_SIZE; + } + maxValue = (rnLength * abLength + WEL_UP_FLOAT_SIZE - 1) / WEL_UP_FLOAT_SIZE * minValue; + return true; +} + +} // namespace TEST_CASE + +template +class KernelWelfordUpdate { +public: + __aicore__ inline KernelWelfordUpdate() + {} + __aicore__ inline void Init(GM_ADDR inputX_gm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR outputMean_gm, + GM_ADDR outputVar_gm, int32_t rnLength, int32_t abLength, int32_t abComputeLength, float nRec) + { + m_rnLength = rnLength; + m_abLength = abLength; + m_abComputeLength = abComputeLength; + m_nRec = nRec; + bshLength = rnLength * abLength; + inplace = isInplace; + + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputX_gm), bshLength); + 
inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMean_gm), bshLength); + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVar_gm), bshLength); + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMean_gm), bshLength); + outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVar_gm), bshLength); + + pipe.InitBuffer(inQueueX, 1, bshLength * sizeof(T)); + pipe.InitBuffer(inQueueMean, 1, bshLength * sizeof(U)); + pipe.InitBuffer(inQueueVar, 1, bshLength * sizeof(U)); + pipe.InitBuffer(outQueueMean, 1, bshLength * sizeof(U)); + pipe.InitBuffer(outQueueVar, 1, bshLength * sizeof(U)); + + } + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + LocalTensor inputXLocal = inQueueX.AllocTensor(); + LocalTensor inputMeanLocal = inQueueMean.AllocTensor(); + LocalTensor inputVarLocal = inQueueVar.AllocTensor(); + + DataCopy(inputXLocal, inputX_global, bshLength); + DataCopy(inputMeanLocal, inputMean_global, bshLength); + DataCopy(inputVarLocal, inputVar_global, bshLength); + + inQueueX.EnQue(inputXLocal); + inQueueMean.EnQue(inputMeanLocal); + inQueueVar.EnQue(inputVarLocal); + } + __aicore__ inline void Compute() + { + LocalTensor inputXLocal = inQueueX.DeQue(); + LocalTensor inputMeanLocal = inQueueMean.DeQue(); + LocalTensor inputVarLocal = inQueueVar.DeQue(); + + LocalTensor outMeanLocal = outQueueMean.AllocTensor(); + LocalTensor outVarLocal = outQueueVar.AllocTensor(); + + AscendC::Duplicate(outMeanLocal, (U)(0.0), bshLength); + AscendC::Duplicate(outVarLocal, (U)(0.0), bshLength); + + struct WelfordUpdateParam para = {m_rnLength, m_abLength, m_abComputeLength, m_nRec}; + if (tmpLocal) { + TEST_CASE::GetWelfordUpdateMaxMinTmpSize(m_rnLength, m_abLength, sizeof(T), sizeof(U), isReuseSource, + isInplace, tmpMaxBytes, tmpMinBytes); + if (tmpMinBytes % WEL_UP_BLOCK_SIZE != 0) { + tmpMinBytes = (tmpMinBytes + WEL_UP_BLOCK_SIZE - 1) / 
WEL_UP_BLOCK_SIZE * WEL_UP_BLOCK_SIZE; + } + pipe.InitBuffer(tmpLocalBuf, tmpMinBytes); + LocalTensor tmpLocalTensor = tmpLocalBuf.Get(); + if (inplace) { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, tmpLocalTensor, para); + } else { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, tmpLocalTensor, para); + } + } else { + if (inplace) { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, para); + } else { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, para); + } + } + + outQueueMean.EnQue(outMeanLocal); + outQueueVar.EnQue(outVarLocal); + + inQueueX.FreeTensor(inputXLocal); + inQueueMean.FreeTensor(inputMeanLocal); + inQueueVar.FreeTensor(inputVarLocal); + } + __aicore__ inline void CopyOut() + { + LocalTensor outMeanLocal = outQueueMean.DeQue(); + LocalTensor outVarLocal = outQueueVar.DeQue(); + + DataCopy(outputMean_global, outMeanLocal, bshLength); + DataCopy(outputVar_global, outVarLocal, bshLength); + + outQueueMean.FreeTensor(outMeanLocal); + outQueueVar.FreeTensor(outVarLocal); + } + +private: + TPipe pipe; + TQue inQueueX; + TQue inQueueMean; + TQue inQueueVar; + TQue outQueueMean; + TQue outQueueVar; + + GlobalTensor inputX_global; + GlobalTensor inputMean_global; + GlobalTensor inputVar_global; + GlobalTensor outputMean_global; + GlobalTensor outputVar_global; + TBuf tmpLocalBuf; + + uint32_t m_rnLength; + uint32_t m_abLength; + uint32_t m_abComputeLength; + float m_nRec; + uint32_t bshLength; + bool inplace; + uint32_t tmpMinBytes = 0; + uint32_t tmpMaxBytes = 0; +}; + +template +__aicore__ void main_WelfordUpdate_test(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, GM_ADDR outMeanGm, + GM_ADDR outVarGm, int32_t rnLength, int32_t abLength, int32_t abComputeLength, float nRec) +{ + KernelWelfordUpdate op; + op.Init(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, rnLength, abLength, 
abComputeLength, nRec); + op.Process(); +} + +struct WelfordUpdateTestParams { + int64_t rnLength; + int64_t abLength; + int64_t abComputeLength; + float nRec; + uint32_t TypeSizeT; + uint32_t TypeSizeU; + void (*calFunc)(uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int32_t, int32_t, int32_t, float); +}; + +class WelfordUpdateTestSuite : public testing::Test, public testing::WithParamInterface { +protected: + static void SetUpTestCase() + { + std::cout << "WelfordUpdateTestSuite SetUpTestCase" << std::endl; + } + static void TearDownTestCase() + { + std::cout << "WelfordUpdateTestSuite TearDownTestCase" << std::endl; + } + virtual void SetUp() + {} + virtual void TearDown() + {} +}; + +INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_WelfordUpdate, WelfordUpdateTestSuite, + ::testing::Values( + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 16, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 16, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + + WelfordUpdateTestParams { 1, 8, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 13, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + 
WelfordUpdateTestParams { 1, 16, 16, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test } +)); + +TEST_P(WelfordUpdateTestSuite, WelfordUpdateTestCase) +{ + auto param = GetParam(); + uint32_t srcSize = param.rnLength * param.abLength; + uint8_t srcGm[srcSize * param.TypeSizeT]{0x00}; + uint8_t inMeanGm[srcSize * param.TypeSizeU]{0x00}; + uint8_t inVarGm[srcSize * param.TypeSizeU]{0x00}; + uint8_t outMeanGm[srcSize * param.TypeSizeU]{0x00}; + uint8_t outVarGm[srcSize * param.TypeSizeU]{0x00}; + param.calFunc(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, param.rnLength, param.abLength, param.abComputeLength, + param.nRec); + for (int32_t i = 0; i < srcSize; i++) { + EXPECT_EQ(outMeanGm[i], 0x00); + EXPECT_EQ(outVarGm[i], 0x00); + } +} + diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 4568f444de0ceaa2320289fbc44f066e51e8ebb3..936a7180416dc940153de54abffa8d53f8aeb6f4 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -3473,6 +3473,38 @@ TEST_F(TestTiling, tiling_compute_error) EXPECT_EQ(ret, -1); } +TEST_F(TestTiling, TestWelfordUpdateTiling) +{ + std::vector shapeDims1d = {1, 128}; + auto shape1d = ge::Shape(shapeDims1d); + uint32_t maxsize = 0; + uint32_t minsize = 0; + uint32_t dtypesizeT = 2; // half类型 + uint32_t dtypesizeU = 4; // float类型 + bool isReuseSource = false; + GetWelfordUpdateMaxMinTmpSize(shape1d, dtypesizeT, dtypesizeU, isReuseSource, false, maxsize, minsize); + EXPECT_EQ(minsize, 3 * 256); + EXPECT_EQ(maxsize, 2 * 3 * 256); + + std::vector shapeDims2d = {1, 72}; + auto shape2d = ge::Shape(shapeDims2d); + dtypesizeT = 4; // float类型 + dtypesizeU = 4; // float类型 + isReuseSource = false; + 
GetWelfordUpdateMaxMinTmpSize(shape2d, dtypesizeT, dtypesizeU, isReuseSource, false, maxsize, minsize); + EXPECT_EQ(minsize, 2 * 256); + EXPECT_EQ(maxsize, 2 * 2 * 256); + + std::vector shapeDims3d = {1, 256}; + auto shape3d = ge::Shape(shapeDims3d); + dtypesizeT = 4; // float类型 + dtypesizeU = 4; // float类型 + isReuseSource = true; + GetWelfordUpdateMaxMinTmpSize(shape3d, dtypesizeT, dtypesizeU, isReuseSource, false, maxsize, minsize); + EXPECT_EQ(minsize, 1 * 256); + EXPECT_EQ(maxsize, 4 * 1 * 256); +} + TEST_F(TestTiling, TestNZFp32UnalignedK) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, diff --git a/tests/utils/init_global_memory/test_operator_init_global_memory.cpp b/tests/utils/init_global_memory/test_operator_init_global_memory.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c3cdd892e730b6199eb1e5729b69497fb391f95 --- /dev/null +++ b/tests/utils/init_global_memory/test_operator_init_global_memory.cpp @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "kernel_operator.h" + +using namespace std; +using namespace AscendC; + +class TEST_INIT_GLOBAL_MEMORY : public testing::Test { +protected: + void SetUp() + { + AscendC::SetGCoreType(2); + } + void TearDown() + { + AscendC::SetGCoreType(0); + } +}; + +template +void main_init_global_memory_demo(__gm__ uint8_t *__restrict__ dst_gm, const uint64_t dataSize) +{ + TPipe tpipe; + GlobalTensor dst_global; + dst_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(dst_gm), dataSize); + InitGlobalMemory(dst_global, dataSize, (T)10); + pipe_barrier(PIPE_ALL); +} +#define VEC_INIT_GLOBAL_MEMORY_TESTCASE(DATASIZE, DATA_TYPE) \ + TEST_F(TEST_INIT_GLOBAL_MEMORY, INIT_GLOBAL_MEMORY##_##DATASIZE##_##DATA_TYPE##_##Case) \ + { \ + uint8_t output_gm[DATASIZE * sizeof(DATA_TYPE)] = {0}; \ + main_init_global_memory_demo(output_gm, DATASIZE); \ + for (uint32_t i = 0; i < DATASIZE; i++) { \ + EXPECT_EQ(output_gm[i], 0x00); \ + } \ + } + +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, half); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, half); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, float); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, float); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, uint16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, uint16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, int16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, int16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, uint32_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, uint32_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, int32_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, int32_t); \ No newline at end of file