diff --git a/atvc/README.md b/atvc/README.md index 4fe55d6c4f8b6d1c04ac826a75964e7dc767a419..c7f47b655b1899d7a8a1e852235597b695bb6c6e 100644 --- a/atvc/README.md +++ b/atvc/README.md @@ -56,8 +56,8 @@ git clone https://gitee.com/ascend/ascendc-api-adv.git - 执行add用例 ```bash -$ cd ./atvc/tests -$ bash run_test.sh add +$ cd ./atvc/examples +$ bash run_examples.sh add ... Generate golden data successfully. ... @@ -74,18 +74,17 @@ Accuracy verification passed. # 样例介绍 | 样例名 | 描述 | 类型| | ------------------------------------------------------------ | ------------------------------------------------------------ |------------------------------------------------------------ | -| [add](./examples/add/add.cpp) | 使用ATVC的Elementwise模板实现Add算子以及调用样例 | 直调算子 | -| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例 | 直调算子 | -| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | 输入带标量的自定义Elementwise类算子以及调用样例 | 直调算子 | -| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例 | 直调算子 | -| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例 | 直调算子 | -| [tanh_grad](./examples/tanh_grad/tanh_grad.cpp) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例 | 直调算子 | -| [ops_aclnn](./examples/ops_aclnn) | 使用ATVC基于自定义工程算子的实现以及调用样例 | 自定义工程算子 | -| [ops_pytorch](./examples/ops_pytorch) | 使用ATVC基于[pytorch](https://gitee.com/ascend/pytorch)算子的实现以及调用样例 | pytorch算子 |:wq -| [add_with_broadcast](./examples/add_with_broadcast) |使用ATVC的Elementwise和Broadcast组合模板实现Add算子以及调用样例 | 直调算子 | - -更多算子类型介绍和如何选取模板参见参阅[快速入门](./docs/1_quick_start.md),其中add、sinh_custom、add_with_scalar、reduce_sum、broadcast_to、tanh_grad、add_with_broadcast是ATVC的直调样例,ops_aclnn为基于ATVC对接aclnn工程的算子目录,ops_pytorch为基于ATVC对接pytorch工程的算子目录。其中,ops_aclnn和ops_pytorch样例需要进入到example路径下按照README.md描述执行。 - +| [add](./examples/add/add.cpp) | 使用ATVC的Elementwise模板实现Add算子以及调用样例 | Kernel直调 | +| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例 | Kernel直调 | +| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | 输入带标量的自定义Elementwise类算子以及调用样例 | Kernel直调 | +| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例 | Kernel直调 | +| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例 | Kernel直调 | +| [tanh_grad](./examples/tanh_grad/tanh_grad.cpp) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例 | Kernel直调 | +| [ops_aclnn](./examples/ops_aclnn) | 使用ATVC基于自定义工程算子的实现以及调用样例 | 单算子API调用 | +| [ops_pytorch](./examples/ops_pytorch) | 使用ATVC开发自定义算子,并实现从[PyTorch](https://gitee.com/ascend/pytorch)框架调用的样例 | PyTorch框架调用 | +| [add_with_broadcast](./examples/add_with_broadcast) |使用ATVC的Elementwise和Broadcast组合模板实现Add算子以及调用样例 | Kernel直调 | + +更多算子类型介绍和如何选取模板参见参阅[快速入门](./docs/01_quick_start.md),其中add、sinh_custom、add_with_scalar、reduce_sum、broadcast_to、tanh_grad、add_with_broadcast是ATVC的直调样例,ops_aclnn为基于ATVC对接aclnn工程的算子目录,ops_pytorch为基于ATVC对接PyTorch工程的算子目录。其中,ops_aclnn和ops_pytorch样例需要进入到example路径下按照README.md描述执行。 # 支持场景 diff --git a/atvc/docs/01_quick_start.md b/atvc/docs/01_quick_start.md index 51c24a860f7cf3a2edd3113d526045d6843be990..066a14b4bf2f51cad52d95f0fff4fe8a4303133d 100644 --- a/atvc/docs/01_quick_start.md +++ b/atvc/docs/01_quick_start.md @@ -114,23 +114,23 @@ AddCustom<<>>(aDevice, bDevice, cDevice ## 算子编译&执行 完成算子代码编写后,调用以下命令编译代码并执行: ```bash -cd ./atvc/tests/ -bash run_test.sh add +$ cd ./atvc/examples +$ bash run_examples.sh add ``` 其他样例执行命令如下: 
```bash
-bash run_test.sh sinh_custom        # 执行sinh_custom样例
-bash run_test.sh reduce_sum         # 执行reduce_sum样例
-bash run_test.sh add_with_scalar    # 执行add_with_scalar样例
-bash run_test.sh broadcast_to       # 执行broadcast样例
+bash run_examples.sh sinh_custom    # 执行sinh_custom样例
+bash run_examples.sh reduce_sum     # 执行reduce_sum样例
+bash run_examples.sh add_with_scalar    # 执行add_with_scalar样例
+bash run_examples.sh broadcast_to       # 执行broadcast_to样例
```

## 完整样例
完整代码样例请参照[examples/add/add.cpp](../examples/add/add.cpp) # 模板选择 -用户根据待开发的Vector算子定义特征,选择以下三种匹配的模板及其配套的tiling算法,若自定义算子不在当前模板库的范围内,建议使用基本Ascend C API 手写算子。 +用户根据待开发的Vector算子定义特征,选择以下三种匹配的模板及其配套的tiling算法,若自定义算子不在当前模板库的范围内,建议使用Ascend C API 手写算子。 ## Elementwise类算子 Elementwise类算子通常是指对张量进行元素级别的操作的函数或方法,包括但不限于加、减、乘、除及指数、对数、三角函数等数学函数。这类算子的特点是会逐元素进行计算操作,而不会改变输入数据的形状。常见的Elementwise算子有Add、Sub、Exp、Log、Sin、Sqrt等。 ## Reduce类算子 diff --git a/atvc/docs/02_developer_guide.md b/atvc/docs/02_developer_guide.md index c24e416bcf26965c4914e6b598ff423e78b2597d..a7de01196286dcfa65ff95cf8890c06aef6ccaf8 100644 --- a/atvc/docs/02_developer_guide.md +++ b/atvc/docs/02_developer_guide.md @@ -297,184 +297,7 @@ if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { ### 2.1.6 完整样例 -通过ATVC框架实现的完整SinhCustom算子定义&调用[代码](../examples/sinh_custom/sinh_custom.cpp)如下: - -```cpp -#include -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "elewise/elewise_host.h" -#include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -// 描述算子的输入输出以及临时计算资源 -using SinhOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; - -// 传入编译态参数ATVC::OpTraits -template -// 开发自定义函数名/类名 -struct SinhComputeFunc { - // DataType模板参数,根据实际数据类型个数填写 - template - // 重载operator公有接口,提供给`ATVC::Kernel::EleWiseOpTemplate`调用 - __aicore__ inline void operator()(AscendC::LocalTensor x, - AscendC::LocalTensor y, - AscendC::LocalTensor tempBuffer1, - AscendC::LocalTensor tempBuffer2) - { - // 开发调用AscendC Api自行实现计算仿函数 - uint32_t tiledCnt = y.GetSize(); // 进行单次基块计算的元素个数 - AscendC::Muls(tempBuffer1, x, static_cast(-1), tiledCnt); // tempBuffer1 = -1 * x - AscendC::Exp(tempBuffer1, tempBuffer1, tiledCnt); // tempbuffer1 = exp(-x) - AscendC::Exp(tempBuffer2, x, tiledCnt); // tempbuffer2 = exp(x) - AscendC::Sub(y, tempBuffer2, tempBuffer1, tiledCnt); // y = exp(x) - exp(-x) - AscendC::Muls(y, y, static_cast(0.5), tiledCnt); // y = (e^(x) - e^(-x)) / 2 - } -}; - -void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden) -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 10.0f); - - for (int i = 0; i < eleNum; ++i) { - inputX[i] = dis(gen); - golden[i] = std::sinh(inputX[i]); - } -} - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! 
The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} -} - -/* - * 该函数为SinhCustom算子核函数入口 - * x Device上的gm地址,指向SinhCustom算子第一个输入 - * y Device上的gm地址,指向SinhCustom算子第一个输出 - * param 指向运行态ATVC::EleWiseParam数据 -*/ -template -__global__ __aicore__ void SinhCustom(GM_ADDR x, GM_ADDR y, ATVC::EleWiseParam param) -{ - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // 控制算子执行时只启动Vector核 - auto op = ATVC::Kernel::EleWiseOpTemplate>(); - op.Run(x, y, ¶m); // 按照输入、输出、param的顺序传入Run函数中;OpTraits内部的ATVC::OpTemps将由EleWiseOpTemplate内部申请资源,开发无需关注 -} - -int main() -{ - // init data - int32_t eleNum = 8 * 2048; - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::vector inputX(eleNum); - std::vector golden(eleNum); - InitializeData(eleNum, inputX, golden); - - ATVC::EleWiseParam param; - - // 计算输入为8*2048个float元素的sinh算子的运行态参数param - if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { - printf("Elewise tiling error."); - return -1; - }; - // 初始化Acl资源与数据 - aclrtContext context; - aclrtStream stream = nullptr; - int32_t deviceId = 0; - InitializeACL(context, stream, deviceId); - - uint8_t *yHost; - uint8_t *xDevice; - uint8_t *yDevice; - uint8_t *paramDevice; - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); - CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - // 将tiling计算的运行时参数EleWiseParam param传到Device侧 - auto elementParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, - reinterpret_cast(¶m), elementParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - // 调用自定义的Kernel API, <<<>>>的BlockNum参数可通过param的TilingData获取 - SinhCustom<<>>(xDevice, yDevice, paramDevice); - - CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + eleNum); - - // 释放资源 - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(paramDevice)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CleanACL(stream, deviceId); - - if (!VerifyResults(golden, outputY)) { - return -1; - } - printf("Accuracy verification passed.\n"); - return 0; -} -``` +通过ATVC框架实现的完整SinhCustom算子定义&调用,完整样例代码见链接[样例代码](../examples/sinh_custom/sinh_custom.cpp) ## 2.2 Reduce算子开发 ATVC框架提供的Reduce算子模板类的模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示;开发自定义的模块用蓝色表示): @@ -546,10 +369,10 @@ Reduce计算模板类将在数据计算阶段被`ReduceOpTemplate`算子模板 - 该模板类在实例化时固定传入ATVC::OpTraits类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 - 开发必须完成以下公有API的内部实现: - 1. 计算单数据基块的Reduce结果 `\_\_aicore\_\_ inline void Compute(...)` - 2. 计算单UB内不同数据基块的计算结果 `\_\_aicore\_\_ inline void UpdateCache(...)` - 3. 
计算多核之间&同一核内的多次UB结果 `\_\_aicore\_\_ inline void ReduceBetweenUB(...)` - 4. 返回非对齐场景不参与计算的尾部数据的填充值 `\_\_aicore\_\_ inline U GetPaddingValue()` + 1. 计算单数据基块的Reduce结果 `__aicore__ inline void Compute(...)` + 2. 计算单UB内不同数据基块的计算结果 `__aicore__ inline void UpdateCache(...)` + 3. 计算多核之间&同一核内的多次UB结果 `__aicore__ inline void ReduceBetweenUB(...)` + 4. 返回非对齐场景不参与计算的尾部数据的填充值 `__aicore__ inline U GetPaddingValue()` ### 2.2.3 内置Reduce算子模板 @@ -765,203 +588,7 @@ using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "reduce/reduce_host.h" -#include "reduce/reduce_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -// ReduceSum算子的描述:一个输入,一个输出,类型均为float -using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} -} - -/* - * 该函数为ReduceCustom算子核函数入口 - * x Device上的gm地址,指向ReduceCustom算子第一个输入 - * y Device上的gm地址,指向ReduceCustom算子第一个输出 - * reduceParam 指向运行态ATVC::ReduceParam数据 -*/ -template -__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) -{ - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 - // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 - auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); - op.Run(x, y, &reduceParam); -} - -// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 -template -void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) -{ - // 申请临时空间workspace,并将其与ReduceTilingData一同传到Device侧 - uint8_t *paramDevice; - uint8_t *workspaceDevice; - CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - param.workspaceAddr = reinterpret_cast(workspaceDevice); - auto reduceParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, reduceParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, reduceParamSize, - reinterpret_cast(¶m), reduceParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 - if (policy == ATVC::REDUCE_POLICY0) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY1) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY2) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY3) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY4) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY5) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY6) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY7) { - ReduceCustom<<>>(x, y, paramDevice); - } else if 
(policy == ATVC::REDUCE_POLICY8) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY9) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY10) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY11) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY12) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY13) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY14) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY15) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY16) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY17) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY18) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY19) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY20) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY21) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY22) { - ReduceCustom<<>>(x, y, paramDevice); - } else { - printf("[ERROR] Cannot find any matched policy.\n"); - } - // 流同步后释放申请的param内存 - CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(workspaceDevice)); - CHECK_ACL(aclrtFree(paramDevice)); -} - -int32_t main(int32_t argc, char* argv[]) -{ - int32_t eleNum = 8 * 1024; - int32_t outEleNum = 1 * 1024; - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(outEleNum) * sizeof(float); - std::vector dim{0}; // 对第0轴执行reduce操作 - std::vector shape{8, 1024}; // 测试输入shape - std::vector inputX(eleNum, 1.0f); - std::vector golden(outEleNum, 8.0f); - printf("Generate golden data successfully.\n"); - // 初始化Acl资源 - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - uint8_t *yHost; - uint8_t *xDevice; - uint8_t *yDevice; - - CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 - // Host侧调用Tiling API完成相关运行态参数的运算 - if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m)) { - printf("Reduce tiling error.\n"); - return -1; - }; - - // 调用Adapter调度接口,完成核函数的模板调用 - ReduceOpAdapter(xDevice, yDevice, param, policy, stream); - - CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); - - // 释放Acl资源 - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - - if (!VerifyResults(golden, outputY)) { - return -1; - } - - printf("Accuracy 
verification passed.\n"); - return 0; -} -``` +通过ATVC框架实现的完整ReduceSum算子定义&调用,完整样例代码见链接[样例代码](../examples/reduce_sum/reduce_sum.cpp) ## 2.3 Broadcast算子开发 ATVC框架提供的Broadcast算子模板类的模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示;开发自定义的模块用蓝色表示): @@ -1242,145 +869,7 @@ int32_t main(int32_t argc, char* argv[]) ### 2.3.6 完整样例 -通过ATVC框架实现的完整BroadcastCustom算子[样例代码](../examples/broadcast_to/broadcast_to.cpp) -```cpp -#include -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "atvc.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} -} - -// BroadcastTo算子的描述:一个输入,一个输出,类型均为float -using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; - -/* - * 该函数为BroadcastCustom算子核函数入口 - * x Device上的gm地址,指向BroadcastTo算子第一个输入 - * y Device上的gm地址,指向BroadcastTo算子第一个输出 - * broadcastParam 指向运行态ATVC::BroadcastParam数据 -*/ -template -__global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam broadcastParam) -{ - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // 设置算子执行时只启动Vector核 - // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 - auto op = ATVC::Kernel::BroadcastOpTemplate, Policy>(); - op.Run(x, y, &broadcastParam); -} - - -// 负责Broadcast类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 -template -void BroadcastOpAdapter(uint8_t* x, uint8_t* y, ATVC::BroadcastParam ¶m, ATVC::BroadcastPolicy &policy, aclrtStream& stream) -{ - // 申请临时空间workspace,并将其与BroadcastTilingData一同传到Device侧 - uint8_t *paramDevice; - uint8_t *workspaceDevice; - CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - param.workspaceAddr = reinterpret_cast(workspaceDevice); - auto broadcastParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, broadcastParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, broadcastParamSize, reinterpret_cast(¶m), broadcastParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); - // 将tiling api计算出的BroadcastPolicy转化为编译态参数并实例化相应的核函数 - if (policy == ATVC::BROADCAST_POLICY0) { - BroadcastCustom<<>>(x, y, paramDevice); - }else if (policy == ATVC::BROADCAST_POLICY1) { - BroadcastCustom<<>>(x, y, paramDevice); - } else { - printf("[ERROR] Cannot find any matched policy.\n"); - } - // 流同步后释放申请的param内存 - CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(workspaceDevice)); - CHECK_ACL(aclrtFree(paramDevice)); -} - -int32_t main(int32_t argc, char* argv[]) -{ - int32_t eleNum = 1 * 1024; - int32_t outEleNum = 8 * 1024; - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(outEleNum) * sizeof(float); - std::vector shapeIn{1, 1024}; // 测试输入shape - std::vector shapeOut{8, 1024}; // 测试输入shape - std::vector inputX(eleNum, 1.0f); - std::vector golden(outEleNum, 1.0f); - printf("Generate golden data successfully.\n"); - // 初始化Acl资源 - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - uint8_t *yHost; - uint8_t *xDevice; - uint8_t *yDevice; - - 
CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - ATVC::BroadcastParam param; // Broadcast运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::BroadcastPolicy policy = {-1, -1, -1}; // Broadcast运行态参数,负责映射最适合的Broadcast模板实现 - // Host侧调用Tiling API完成相关运行态参数的运算 - if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, ¶m)) { - printf("Broadcast tiling error.\n"); - return -1; - }; - - // 调用Adapter调度接口,完成核函数的模板调用 - BroadcastOpAdapter(xDevice, yDevice, param, policy, stream); - - CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); - - // 释放Acl资源 - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - - for (int32_t i = 0; i < outEleNum; i++) { - if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! The expected value of element in index [%d] is %f, but actual value is %f.\n", i, golden[i], outputY[i]); - return -1; - } - } - printf("Accuracy verification passed.\n"); - return 0; -} -``` +通过ATVC框架实现的完整BroadcastCustom算子定义&调用,完整样例代码见链接[样例代码](../examples/broadcast_to/broadcast_to.cpp) ## 2.4 组合算子开发 ATVC框架支持Broadcast与Elementwise组合的算子通过扩展BroadcastOpTemplate的模板参数对用户提供接口,开发者可以根据算子实际需求来定制组合,框架支持以下组合:Elementwise + Broadcast、Broadcast + Elementwise、Elementwise + Broadcast + Elementwise。下面以Broadcast与Elementwise组合为例进行详细讲解。 @@ -1600,7 +1089,6 @@ ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, & ### 2.4.6 完整样例 参考sample [add_with_broadcast](../examples/add_with_broadcast/README.md) - # 3 ATVC的调试调优功能 为了用户在使用ATVC进行算子开发时能快速进行精度调试和性能调优,ATVC支持多种调试调优能力。 ## 3.1 OpTraits校验接口 @@ -1622,6 +1110,13 @@ enum class TemplateType { BROADCAST, // Broadcast模板的校验类型 }; ``` +DebugCheck主要校验项如下: +| 模板类型 | OpTraits校验项 | +| ----------- | --------------- | +| ELE_WISE | 输入输出非空 | +| REDUCE | 输入输出个数均为1, 输入输出数据类型相同 | +| BROADCAST | 输入输出个数均为1, 输入输出数据类型相同 | + 接口使用示例: ```cpp using AddOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; @@ -1630,13 +1125,13 @@ ATVC::Host::DebugCheck(); ``` 完整的DebugCheck调用接口样例可参考tanh_grad算子[样例代码](../examples/tanh_grad/tanh_grad.cpp)。 ## 3.2 使用调试调优模式运行算子 -样例执行脚本run_test.sh支持可选入参`--run-mode`进行不同调试调优运行模式的选择。 +样例执行脚本run_examples.sh支持可选入参`--run-mode`进行不同调试调优运行模式的选择。 当前支持`debug_print`和`profiling`两种模式。 - `--run-mode=debug_print`:DFX信息打印模式,打开kernel侧的模板内置关键节点的信息打印和异常退出时的打印功能。 - `--run-mode=profiling`:Profiling性能采集模式,运行时打开profiling性能数据采集功能。 - 未设置`--run-mode`:默认模式,正常上板,无kernel侧的dfx信息打印, 未开启profiling性能采集功能。 ## 3.2.1 DFX信息打印模式 -通过运行run_test.sh脚本时加上可选参数`--run-mode=debug_print`打开本功能。 +通过运行run_examples.sh脚本时加上可选参数`--run-mode=debug_print`打开本功能。 DFX信息打印格式按照 [日志级别(`ERROR`/`INFO`)]:[`ATVC`][`Module`](可选:[`CopyIn`/`CopyOut`等])的标准进行打印。 - 日志级别: ERROR是异常打印信息,INFO是模板内部重要信息打印 - `ATVC`: 标识是ATVC模板库内置的DFX信息打印 @@ -1652,12 +1147,12 @@ __aicore__ inline void DebugPrintf(__gm__ const char* fmt, Args&&... 
args); } } // 调用示例 -ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Input Count can not be 0!\n"); -ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n", curCoreStartCnt_ + offsetCnt_, calcCnt_); +ATVC::Kernel::DebugPrintf("[ERROR]:[ATVC][EleWise] Input Count can not be 0!\n"); +ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n", curCoreStartCnt_ + offsetCnt_, calcCnt_); ``` ## 3.2.2 开启Profiling性能调优功能 -通过运行run_test.sh脚本时加上可选参数`--run-mode=profiling`打开本功能。 +通过运行run_examples.sh脚本时加上可选参数`--run-mode=profiling`打开本功能。 为了增加Profiling采集性能数据的稳定性,建议用户在开启profiling时,运行时重复多次调用kernel,可实现一次性采集多次上板的性能数据,消除抖动。 ```cpp TanhGrad<<>>(dyDevice, yDevice, zDevice, paramDevice); @@ -1698,7 +1193,8 @@ struct EleWiseTilingData { | ----------- | -------------- | ----------- | ----------- |---| | singleCoreBaseLine | uint32_t | 单核数据量基线 | [256, 128 * 1024] | 512| | ubSizeLimitThreshold | float | UB内存使用上限,决定了basicBlock最大值 | [0.5, 0.96] | 0.95 | -| splitDataShape | uint32_t[3]| 单核内数据量的3个分段节点,表示数据量分为4段| {node_0, node_1, node_2} | {1024, 32*1024, 64*1024}| +| nBufferNum | uint32_t | 用于并行流水的buffer数量 | [1, 2] | 2 | +| splitDataShape | uint32_t[3]| 单核内数据量的3个分段节点,表示数据量分为4段| {node_0, node_1, node_2} | {1024, 32 * 1024, 64 * 1024}| | dataSplitFactor | uint32_t[4]| 单核内4个数据段的切分系数, 决定不同数据段的切分基本块的大小| {factor_0, factor_1, factor_2, factor_3} 均需在范围[1, 32]| {4, 4, 8, 6}| | rsvLiveCnt | uint32_t| 预留的空间大小为rsvLiveCnt * (inputBuffer + outputBuffer)|[0, 1]| 0| @@ -1775,7 +1271,7 @@ struct ReduceTilingData { | ----------- | -------------- | ----------- | ----------- |---| | basicBlock | uint32_t | Reduce 基本块内存大小 | 不能超过UB内存的1/3, 192K内存 建议在48K-54K之间设置 | 54 * 1024| | maxInnerA | uint32_t |AR切轴内A轴的最大数据量 | [128, 256] | 128 | -| balanceThreshHold | double| 多核均衡的阈值水平, 阈值越高,切分后每个核处理的数据量越均衡 | [0.85, 0.95]| 0.95 | +| balanceThreshHold | double| 多核均衡的阈值水平, 阈值越高,切分后每个核处理的数据量越均衡 | [0.8, 0.95]| 0.85 | 对应的超参`ReduceTilingHyperParam`结构定义如下: ```cpp diff --git a/atvc/docs/03_code_organization.md b/atvc/docs/03_code_organization.md index 47641b0cb98d098b19a918a1145b5d546ade92a2..f6f60e5f7fa51e794c4cf2fd48fb4f98c714c570 100644 --- a/atvc/docs/03_code_organization.md +++ b/atvc/docs/03_code_organization.md @@ -32,24 +32,37 @@ include/ examples文件夹下提供了算子代码样例,包含算子实现的源码文件和测试用例配置和执行脚本。 ``` examples +├── run_examples.sh // 执行脚本 ├── add // EleWise Add算子样例 │ ├── README.md │ └── add.cpp ├── add_with_scalar // EleWise + Scalar算子样例 │ ├── README.md │ └── add_with_scalar.cpp +├── add_with_broadcast // BroadcastTo + EleWise算子样例 +│ ├── README.md +│ └── add_with_scalar.cpp ├── broadcast_to // BroadcastTo算子样例 │ ├── README.md │ └── broadcast_to.cpp ├── reduce_sum // ReduceSum算子样例 │ ├── README.md │ └── reduce_sum.cpp -└── sinh_custom // SinhCustom算子样例 - ├── README.md - └── sinh_custom.cpp -└── tanh_grad // Tah_Grad算子样例 - ├── README.md - └── tanh_grad.cpp +├── sinh_custom // SinhCustom算子样例 +│ ├── README.md +│ └── sinh_custom.cpp +├── tanh_grad // TanhGrad算子样例 +│ ├── README.md +│ └── tanh_grad.cpp +├── ops_aclnn // 单算子API调用样例 +| ├── README.md +│ ├── add +│ └── reduce_sum +├── ops_pytorch // PyTorch框架调用样例 +│ ├── README.md +│ ├── add +│ └── reduce_sum +└── common // 算子样例公共接口 ``` ## 3. 
docs diff --git a/atvc/docs/images/broadcast_fusion_dataflow.png b/atvc/docs/images/broadcast_fusion_dataflow.png new file mode 100644 index 0000000000000000000000000000000000000000..7785bedf67ea053ddac9b1f636c5768e583dd4f3 Binary files /dev/null and b/atvc/docs/images/broadcast_fusion_dataflow.png differ diff --git a/atvc/examples/add/README.md b/atvc/examples/add/README.md index ae40bdae244485e50549b70a9482384d907926fe..532af72d4c8ae159fd862f46e6b092e36acab23e 100644 --- a/atvc/examples/add/README.md +++ b/atvc/examples/add/README.md @@ -38,8 +38,8 @@ Add算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh add +$ cd ./atvc/examples +$ bash run_examples.sh add ... Generate golden data successfully. ... diff --git a/atvc/examples/add/add.cpp b/atvc/examples/add/add.cpp index a3f1e65443f1c0bd3551f573f1cd49df4063fd9b..863413a7be47e47c7e30099afaa90d1b2d9ab3ca 100644 --- a/atvc/examples/add/add.cpp +++ b/atvc/examples/add/add.cpp @@ -17,27 +17,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &inputY, std::vector &golden) { std::random_device rd; @@ -51,37 +33,6 @@ void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice) { CHECK_ACL(aclrtFree(xDevice)); diff --git a/atvc/examples/add_with_broadcast/README.md b/atvc/examples/add_with_broadcast/README.md index f49500d66f3d7b495549cdb9d09bb3530860e097..cb3620c084db824358dddd3df0df5250665e846f 100644 --- a/atvc/examples/add_with_broadcast/README.md +++ b/atvc/examples/add_with_broadcast/README.md @@ -38,8 +38,8 @@ Add算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh add_with_broadcast +$ cd ./atvc/examples +$ bash run_examples.sh add_with_broadcast ... Generate golden data successfully. ... 
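为便于理解上述直调样例的组织方式,这里补充一个精简的Host侧主流程示意(非样例原文,main函数体仅作演示;CHECK_ACL、IsClose、VerifyResults、InitializeACL、CleanACL等公共接口来自后文新增的examples/common/example_common.h,标准库头文件按各样例的做法由使用方先行包含),省略了Device内存申请、Tiling计算与核函数调用等与具体算子相关的步骤:

```cpp
// 精简示意:抽取公共头文件example_common.h后,各直调样例的Host侧骨架
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <vector>
#include "acl/acl.h"
#include "example_common.h"   // CHECK_ACL / IsClose / VerifyResults / InitializeACL / CleanACL

int main()
{
    // aclInit、绑定Device、创建Context与Stream(实现见example_common.h)
    aclrtContext context;
    aclrtStream stream = nullptr;
    int32_t deviceId = 0;
    InitializeACL(context, stream, deviceId);

    std::vector<float> golden;   // CPU参考结果,由各样例自行生成
    std::vector<float> output;   // 核函数计算结果拷回Host后填充

    // ……申请Device内存、计算Tiling、<<<>>>调用核函数、把结果拷回Host(各样例不同,此处省略)……

    // 销毁Stream/Context、复位Device并结束acl;随后按相对/绝对误差逐元素比对结果
    CleanACL(context, stream, deviceId);
    return VerifyResults(golden, output) ? 0 : -1;
}
```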
diff --git a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp index 2a58d49b93040a9091e931da49242e21ad4cbb77..5ae9b3a2f6f940ff9d19abd8b0ff5675580dfd36 100644 --- a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp +++ b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp @@ -15,29 +15,12 @@ #include #include #include -#include "acl/acl.h" #include "broadcast/broadcast_host.h" +#include "example_common.h" +#include "acl/acl.h" #include "add_with_broadcast.h" -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - // AddWithBroadcast算子的描述:两个输入,一个输出,类型均为float using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -64,21 +47,6 @@ void BroadcastOpAdapter(uint8_t* x, uint8_t* y, uint8_t* z, ATVC::BroadcastParam CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtFree(workspaceDevice)); } - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} } @@ -92,7 +60,7 @@ int32_t main(int32_t argc, char* argv[]) size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(outEleNum) * sizeof(float); std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(static_cast(rd())); std::uniform_real_distribution disX(1.0f, 9.0f); std::uniform_real_distribution disY(1.0f, 9.0f); diff --git a/atvc/examples/add_with_scalar/README.md b/atvc/examples/add_with_scalar/README.md index 9dc434099cc48dd67227de4eed73386da187f88b..9c5164006010f246c49872570c3ecc57bb188d15 100644 --- a/atvc/examples/add_with_scalar/README.md +++ b/atvc/examples/add_with_scalar/README.md @@ -40,8 +40,8 @@ ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh add_with_scalar +$ cd ./atvc/examples +$ bash run_examples.sh add_with_scalar ... Generate golden data successfully. ... 
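结合前文开发指南中Elementwise模板的用法,这里给出一个带标量参与计算的仿函数与核函数示意(非add_with_scalar样例原文:AddWithScalarCompute、AddWithScalarKernel等命名以及“标量取编译期常量OFFSET”均为演示性假设,OpTraits的模板实参按前文文字说明补全;样例中标量的实际传入方式请以add_with_scalar.cpp为准):

```cpp
// 仅为示意:在EleWise计算仿函数中引入标量,完成 z = x + y + OFFSET
#include "elewise/elewise_host.h"
#include "elewise/elewise_device.h"

// 两个float输入、一个float输出
using AddScalarTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;

template <typename Traits>
struct AddWithScalarCompute {
    template <typename DataType>
    __aicore__ inline void operator()(AscendC::LocalTensor<DataType> x, AscendC::LocalTensor<DataType> y,
                                      AscendC::LocalTensor<DataType> z)
    {
        constexpr DataType OFFSET = static_cast<DataType>(2.0);  // 演示用标量
        uint32_t tiledCnt = z.GetSize();          // 单次基块计算的元素个数
        AscendC::Add(z, x, y, tiledCnt);          // z = x + y
        AscendC::Adds(z, z, OFFSET, tiledCnt);    // z = z + OFFSET(矢量加标量)
    }
};

// 核函数入口:写法与前文AddCustom一致,把仿函数交给EleWiseOpTemplate完成搬运与调度
template <typename Traits>
__global__ __aicore__ void AddWithScalarKernel(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param)
{
    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);  // 只启动Vector核
    auto op = ATVC::Kernel::EleWiseOpTemplate<AddWithScalarCompute<Traits>>();
    op.Run(x, y, z, &param);                         // 按输入、输出、param的顺序传入
}
```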
diff --git a/atvc/examples/add_with_scalar/add_with_scalar.cpp b/atvc/examples/add_with_scalar/add_with_scalar.cpp index 24c4707032a33538bc727257ea340ec574cb51a3..d56bc6bdddb06dccee9a599ea32bfa25a55bc63a 100644 --- a/atvc/examples/add_with_scalar/add_with_scalar.cpp +++ b/atvc/examples/add_with_scalar/add_with_scalar.cpp @@ -17,27 +17,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - using OP_TRAITS = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; template @@ -73,36 +55,6 @@ void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice) { CHECK_ACL(aclrtFree(xDevice)); @@ -176,7 +128,7 @@ int main() std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); CleanUp(zHost, xDevice, yDevice, zDevice); - CleanACL(stream, deviceId); + CleanACL(context, stream, deviceId); if (!VerifyResults(golden, outputZ)) { return -1; diff --git a/atvc/examples/broadcast_to/README.md b/atvc/examples/broadcast_to/README.md index 2699cb07d468b0e041680394b22e069207f2e18a..cfb7c0c8d9caaedd860a99cc03efc26dbac3e8e7 100644 --- a/atvc/examples/broadcast_to/README.md +++ b/atvc/examples/broadcast_to/README.md @@ -37,8 +37,8 @@ BroadcastTo算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh broadcast_to +$ cd ./atvc/examples +$ bash run_examples.sh broadcast_to ... Generate golden data successfully. ... 
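下面再给出Broadcast类样例Host侧的Tiling计算与Policy分派示意(非broadcast_to样例原文:LaunchBroadcastTo为演示性命名,blockNum由调用方从Tiling结果中取得,CalcBroadcastTiling与BroadcastCustom的模板实参均按前文文字说明补全,shape元素类型以样例源码为准),思路与前文开发指南2.3节的BroadcastOpAdapter一致:

```cpp
// 精简示意:Host侧完成Tiling计算,把得到的BroadcastPolicy转成编译期参数去实例化核函数
#include <cstdint>
#include <cstdio>
#include <vector>
#include "acl/acl.h"
#include "broadcast/broadcast_host.h"
#include "example_common.h"   // CHECK_ACL

// 一个float输入、一个float输出
using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;

void LaunchBroadcastTo(uint8_t *xDevice, uint8_t *yDevice, std::vector<int64_t> &shapeIn,
                       std::vector<int64_t> &shapeOut, uint32_t blockNum, aclrtStream stream)
{
    ATVC::BroadcastParam param;                   // 运行态参数:TilingData与workspace信息
    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // 由Tiling API回填的最优模板策略
    if (!ATVC::Host::CalcBroadcastTiling<BroadcastOpTraits>(shapeIn, shapeOut, &policy, &param)) {
        printf("Broadcast tiling error.\n");
        return;
    }
    // 申请workspace并把param拷贝到Device,与broadcast_to.cpp中的做法一致
    uint8_t *workspaceDevice = nullptr;
    uint8_t *paramDevice = nullptr;
    CHECK_ACL(aclrtMalloc(reinterpret_cast<void **>(&workspaceDevice), param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    param.workspaceAddr = reinterpret_cast<decltype(param.workspaceAddr)>(workspaceDevice);
    CHECK_ACL(aclrtMalloc(reinterpret_cast<void **>(&paramDevice), sizeof(param), ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMemcpy(paramDevice, sizeof(param), reinterpret_cast<void *>(&param), sizeof(param),
                          ACL_MEMCPY_HOST_TO_DEVICE));
    // 将运行期选出的Policy映射为编译期模板实参(BroadcastCustom为broadcast_to.cpp中定义的核函数)
    if (policy == ATVC::BROADCAST_POLICY0) {
        BroadcastCustom<BroadcastOpTraits, ATVC::BROADCAST_POLICY0><<<blockNum, nullptr, stream>>>(xDevice, yDevice, paramDevice);
    } else if (policy == ATVC::BROADCAST_POLICY1) {
        BroadcastCustom<BroadcastOpTraits, ATVC::BROADCAST_POLICY1><<<blockNum, nullptr, stream>>>(xDevice, yDevice, paramDevice);
    } else {
        printf("[ERROR] Cannot find any matched policy.\n");
    }
    // 流同步后释放临时内存
    CHECK_ACL(aclrtSynchronizeStream(stream));
    CHECK_ACL(aclrtFree(workspaceDevice));
    CHECK_ACL(aclrtFree(paramDevice));
}
```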
diff --git a/atvc/examples/broadcast_to/broadcast_to.cpp b/atvc/examples/broadcast_to/broadcast_to.cpp index c7a4b16baeac5dfc80ced66e199ea5cb52c5c56b..7bd28495b8e3d4d0f3acb54d6c8b638ca31e97b4 100644 --- a/atvc/examples/broadcast_to/broadcast_to.cpp +++ b/atvc/examples/broadcast_to/broadcast_to.cpp @@ -18,57 +18,9 @@ #include "acl/acl.h" #include "broadcast/broadcast_host.h" #include "broadcast/broadcast_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) { CHECK_ACL(aclrtFree(xDevice)); @@ -136,7 +88,7 @@ int32_t main(int32_t argc, char* argv[]) aclrtContext context; int32_t deviceId = 0; aclrtStream stream = nullptr; - InitializeACL(stream, context, deviceId); + InitializeACL(context, stream, deviceId); uint8_t *yHost; uint8_t *xDevice; uint8_t *yDevice; @@ -163,7 +115,7 @@ int32_t main(int32_t argc, char* argv[]) // 释放Acl资源 CleanUp(xDevice, yDevice, yHost); - CleanACL(stream, context, deviceId); + CleanACL(context, stream, deviceId); for (int32_t i = 0; i < outEleNum; i++) { if (!IsClose(golden[i], outputY[i])) { diff --git a/atvc/examples/common/example_common.h b/atvc/examples/common/example_common.h new file mode 100644 index 0000000000000000000000000000000000000000..b10155e55bdb5814c22321e7113bea457796b155 --- /dev/null +++ b/atvc/examples/common/example_common.h @@ -0,0 +1,67 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+
+#ifndef ATVC_EXAMPLE_COMMON_H
+#define ATVC_EXAMPLE_COMMON_H
+#include "acl/acl.h"
+
+namespace {
+#define CHECK_ACL(x) \
+    do { \
+        aclError __ret = x; \
+        if (__ret != ACL_ERROR_NONE) { \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        } \
+    } while (0)
+
+static constexpr float REL_TOL = 1e-3f;
+static constexpr float ABS_TOL = 1e-5f;
+
+bool IsClose(float a, float b)
+{
+    const float eps = 1e-40f;
+    float diff = std::abs(a - b);
+    return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps));
+}
+
+bool VerifyResults(const std::vector<float> &golden, const std::vector<float> &output)
+{
+    for (int32_t i = 0; i < golden.size(); i++) {
+        if (!IsClose(golden[i], output[i])) {
+            printf("[ERROR]: Accuracy verification failed! The expected value of element "
+                   "in index [%d] is %f, but actual value is %f.\n",
+                   i,
+                   golden[i],
+                   output[i]);
+            return false;
+        }
+    }
+    return true;
+}
+
+void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId)
+{
+    CHECK_ACL(aclInit(nullptr));
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    CHECK_ACL(aclrtCreateContext(&context, deviceId));
+    CHECK_ACL(aclrtCreateStream(&stream));
+}
+
+void CleanACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId)
+{
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtDestroyContext(context));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+}
+} // namespace
+
+#endif
\ No newline at end of file
diff --git a/atvc/examples/ops_aclnn/README.md b/atvc/examples/ops_aclnn/README.md
index 802586c91f6cb52d69430f79563c2e3463c46a4e..f317fdaad7b312cd44a4a17a779ba2c402ae98a3 100644
--- a/atvc/examples/ops_aclnn/README.md
+++ b/atvc/examples/ops_aclnn/README.md
@@ -1,43 +1,11 @@
## 概述
-使用ATVC对接ACLNN工程简单的示例,适合初学者。
-
-## 自定义算子样例说明
-样例通过Ascend C编程语言实现了ATVC框架对接自定义算子工程,并按照算子调用方式分别给出了对应的端到端实现。
+使用ATVC框架开发自定义算子,并实现单算子API调用的样例。
## 算子开发样例
| 目录名称 | 功能描述 |
| ------------------------------------------------------------ | ---------------------------------------------------- |
-| [add](./add) | 基于ATVC框架的Add自定义Vector算子及AclNNInvocation调用样例 |
-| [reduce_sum](./reduce_sum) | 基于ATVC框架的reduce_sum自定义Vector算子及AclNNInvocation调用样例 |
-
-## 快速上手
- - 快速执行example用例,更详细的流程请参阅[add算子](../add/README.md)。
-
-- 下载ATVC代码及环境配置,参考[快速入门](../../../docs/01_quick_start.md)。
-
-- 导入ATVC环境变量
-    ```bash
-    # 如果不导入,默认使用./atvc/include路径
-    export ATVC_PATH=${atvc}/include
-    ```
-
- - 执行add用例
-
-    ```bash
-    # 基于ATVC编译自定义Add算子
-    $ cd ./atvc/examples/ops_aclnn/add
-    # 以910B1为例,运行命令如下:
-    $ bash install.sh -v Ascend910B1
-    # 安装custom包
-    $ cd CustomOp/build_out
-    $ ./custom_opp*.run
-    # 样例运行
-    $ cd ../../AclNNInvocationNaive
-    $ bash run.sh
-    ...
-    test pass
-    ```
+| [add](./add) | 使用ATVC框架开发自定义算子Add,并实现单算子API调用的样例。 |
+| [reduce_sum](./reduce_sum) | 使用ATVC框架开发自定义算子ReduceSum,并实现单算子API调用的样例。 |

## 基于ATVC框架支持自定义算子
### 步骤1. 生成自定义工程基础目录及文件
@@ -296,4 +264,4 @@
### 步骤5. 
调用执行算子工程 - 算子文件编写完成,参考[aclnn调用AddCustom算子工程(代码简化)](https://gitee.com/ascend/samples/blob/master/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocationNaive/README.md)进行编译验证 \ No newline at end of file + 算子文件编写完成,参考[aclnn调用AddCustom算子工程(代码简化)](https://gitee.com/ascend/samples/blob/master/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocationNaive/README.md)进行编译验证。 \ No newline at end of file diff --git a/atvc/examples/ops_aclnn/add/README.md b/atvc/examples/ops_aclnn/add/README.md index 42020ecffbbf614de35adac72434339bc0d7492b..d2909e71e9d1339e71cf75b35eb30eaeda1cba7f 100644 --- a/atvc/examples/ops_aclnn/add/README.md +++ b/atvc/examples/ops_aclnn/add/README.md @@ -29,10 +29,6 @@ z = x + y 核函数名add_custom -## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 算子工程介绍 其中,算子工程目录AddCustom包含算子的实现文件,如下所示: ``` @@ -61,6 +57,11 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 ### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` - 切换到msOpGen脚本install.sh所在目录 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 diff --git a/atvc/examples/ops_aclnn/reduce_sum/README.md b/atvc/examples/ops_aclnn/reduce_sum/README.md index 0ae318021bc6f2a8f55d60ec27f640b480f52b47..04a21d4955a9c204200cd257191ad405bebe1933 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/README.md +++ b/atvc/examples/ops_aclnn/reduce_sum/README.md @@ -26,10 +26,6 @@ ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结 核函数名reduce_sum_custom -## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 算子工程介绍 其中,算子工程目录ReduceSumCustom包含算子的实现文件,如下所示: ``` @@ -58,6 +54,11 @@ CANN软件包中提供了工程创建工具msOpGen,ReduceSumCustom算子工程 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 ### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` - 切换到msOpGen脚本install.sh所在目录 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 diff --git a/atvc/examples/ops_pytorch/README.md b/atvc/examples/ops_pytorch/README.md index 5f85b795bcd0b3014831f618424ceef9ca28cc22..20e1299dbd881a4bb632f3da2a504fee4bbfd848 100644 --- a/atvc/examples/ops_pytorch/README.md +++ b/atvc/examples/ops_pytorch/README.md @@ -1,8 +1,5 @@ - ## 概述 -使用ATVC对接pytorch工程简单的示例,适合初学者。 - -## pytorch算子样例说明 -样例通过Ascend C编程语言实现了ATVC框架对接pytorch算子,并按照算子调用方式分别给出了对应的端到端实现。 +## 概述 +使用ATVC开发自定义算子,并实现从PyTorch框架调用的样例。 ## 算子开发样例 | 目录名称 | 功能描述 | @@ -10,183 +7,50 @@ | [add](./add) | 基于ATVC框架的Add自定义Vector算子 | | [reduce_sum](./reduce_sum) | 基于ATVC框架的reduce_sum自定义Vector算子 | -## 快速上手 - - 快速执行example用例,更详细的流程请参阅[add算子](./add/README.md)。 - -- 下载ATVC代码及环境配置,参考[快速入门](../../../docs/01_quick_start.md)。 - -- 导入ATVC环境变量 - ```bash - # 如果不导入,默认使用./atvc/include路径 - export ATVC_PATH=${atvc}/include - ``` - - - 执行add用例 - ```bash - # 基于ATVC编译pytorch Add算子 - $ cd ./atvc/examples/ops_pytorch/add - $ bash run.sh - ... - OK - ``` - -## 基于pytorch算子对接ATVC框架 - -### 步骤1. 定义算子描述,参考[add_custom_impl.h](./add/add_custom_impl.h) - - 首先通过ATVC提供的`ATVC::OpTraits`模板结构体来描述Add算子的输入输出信息,定义如下: -```cpp - // Add算子中有两个输入,一个输出。类型均为float - using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; -``` - -### 步骤2. 
实现算子计算逻辑,参考[add_custom_impl.h](./add/add_custom_impl.h) - - 用户需要通过AscendC API来搭建Add算子的核心计算逻辑,在ATVC框架中,这类算子的核心计算逻辑是通过定义一个结构体的仿函数来实现。它需要`ATVC::OpTraits`作为固定模板参数,并重载`operator()`来被提供的Kernel层算子模板类调用。 +## 基于PyTorch算子对接ATVC框架 + 不同的算子类型可参考[快速入门](../../docs/01_quick_start.md)中的模版选择模版进行选择,用户在此处通过`<<<>>>`的方式调用核函数,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 + + - 引入头文件。需要注意的是,需要保护对应核函数调用接口声明所在的头文件{kernel_name}_impl.h,kernel_name为算子的核函数名称。 ```cpp - // 头文件引入 - #include "elewise/elewise_host.h" - #include "elewise/elewise_device.h" - - // 传入编译态参数ATVC::OpTraits - template - struct AddComputeFunc { - /* - 函数说明: z = x + y - 参数说明: - x : 参与运算的输入 - y : 参与运算的输入 - z : 参与运算的输出 - */ - template - // 重载operator,提供给算子模板类调用 - __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { - AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过c.GetSize()获取单次计算的元素数量 - } - }; -``` -### 步骤3. 实现核函数,参考[add_custom_impl.h](./add/add_custom_impl.h) - - ATVC提供的`ATVC::Kernel::EleWiseOpTemplate`算子模板类实现了核内的数据搬运、资源申请和计算调度功能。它将计算仿函数作为模板参数传入来完成构造实例化,用户可通过调用`ATVC::Kernel::EleWiseOpTemplate`算子模板类的`Run(Args&&... args)`接口完成算子的功能计算,完成完整核函数的实现。 - -```cpp - /* - * 该函数为Add算子核函数入口 - * a Device上的gm地址,指向Add算子第一个输入 - * b Device上的gm地址,指向Add算子第二个输入 - * c Device上的gm地址,指向Add算子第一个输出 - * param ATVC::EleWiseParam数据 - */ - template - __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, ATVC::EleWiseParam param) - { - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); - - auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类,并指明param的数据类型 - op.Run(x, y, z, ¶m); // 按照输入、输出、param的顺序传入Run函数 - } -``` - - ### 步骤4. 编写torch入口[pytorch_ascendc_extension.cpp](./add/pytorch_ascendc_extension.cpp) - - 不同的算子类型可参考[快速入门](../../docs/01_quick_start.md)中的模版选择模版进行选择,用户在此处通过`<<<>>>`的方式调用核函数,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 - -``` cpp // 头文件引入 #include #include "torch_npu/csrc/core/npu/NPUStream.h" #include "add_custom_impl.h" - +``` +- 应用程序框架编写,需要注意的是,本样例输入x,y的内存是在python调用脚本[run_op.py](./add/run_op.py)中分配的。 +```cpp namespace ascendc_elewise_ops { - at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) - { - auto stream = c10_npu::getCurrentNPUStream().stream(false); - at::Tensor z = at::empty_like(x); - // 创建ATVC框架[elewise]所需要的数据,算子数据的长度 - int32_t totalLength = 1; - for (int32_t size : x.sizes()) { - totalLength *= size; - } - // 声明运行态参数param - ATVC::EleWiseParam param; - // Host侧调用Tiling API完成相关运行态参数的运算 - (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); - // 调用核函数 - AddCustom<<>>( - (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); - - return z; + at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) { } - // 加载算子模版 - TORCH_LIBRARY(ascendc_ops, m) - { - // torch的自定义算子接口 - m.def("add", &ascendc_elewise_ops::op_add_custom); } - } -``` -### 步骤5. 
编写测试用例,参考[run_op.py](./add/run_op.py) -```python - # 导入torch torch所需依赖 - import torch - import torch_npu - # 导入测试用例依赖 - from torch_npu.testing.testcase import TestCase, run_tests - torch.npu.config.allow_internal_format = False - # 加载bishengcc编译出的二进制文件 - torch.ops.load_library('./libascendc_pytorch.so') - - # 测试用例编写 - class TestAscendCOps(TestCase): - # 测试用例 - def test_add_custom_ops_float(self): - length = [8, 2048] - # 生成随机数 - x = torch.rand(length, device='cpu', dtype=torch.float32) - y = torch.rand(length, device='cpu', dtype=torch.float32) - # 调用torch的自定义算子接口 - npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) - cpuout = torch.add(x, y) - self.assertRtolEqual(npuout, cpuout) - - if __name__ == '__main__': - run_tests() ``` -### 步骤6. 基于atvc框架pytorch的编译和测试脚本[run.sh](./add/run.sh) -```bash - # 获取torch、torch_npu、python的lib和include路径和atvc的路径 - torch_location=... - torch_npu_location=... - python_include=... - python_lib=... - atvc_path=... - - # 使用bishengcc进行编译pytorch算子 - bishengcc pytorch_ascendc_extension.cpp \ - -arch Ascend910B1 \ - -I${torch_location}/include \ - -I${torch_location}/include/torch/csrc/api/include \ - -I${python_include} \ - -I${atvc_path} \ - -I${torch_npu_location}/include \ - -L${torch_location}/lib \ - -L${torch_npu_location}/lib \ - -L${python_lib} \ - -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ - -o libascendc_pytorch.so \ - -shared - - # 执行测试用例 - python3 run_op.py -``` - -### 步骤7. 算子编译&执行 -```bash - # 基于ATVC编译pytorch Add算子 - $ cd ./atvc/examples/ops_pytorch/add - $ bash run.sh - ... - OK +- NPU侧运行验证。通过`<<<>>>`的方式调用核函数完成指定的运算。 +```cpp + // 运行资源申请,通过c10_npu::getCurrentNPUStream()的函数获取当前NPU上的流 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + // 分配Device侧输出内存 + at::Tensor z = at::empty_like(x); + // totalLength是算子输入的元素个数 + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + // 声明运行态参数param + ATVC::EleWiseParam param; + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 使用<<<>>方式调用核函数完成指定的运算 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + // 将Device上的运算结果拷贝回Host并释放申请的资源 + return z; ``` + - 定义PyTorch算子的调用接口。 +``` cpp + // 加载算子模版 + TORCH_LIBRARY(ascendc_ops, m) { // 模块名ascendc_ops,模块对象m + m.def("add", &ascendc_elewise_ops::op_add_custom); // 将函数add和PyTorch进行绑定 + } + +``` diff --git a/atvc/examples/ops_pytorch/add/README.md b/atvc/examples/ops_pytorch/add/README.md index 39366061aee1a8aa7cc0495815ab77278adad311..8cb593d4f92a50a3a3cb2c18f4838767785faac1 100644 --- a/atvc/examples/ops_pytorch/add/README.md +++ b/atvc/examples/ops_pytorch/add/README.md @@ -1,12 +1,12 @@ ## 概述 -本样例基于AddCustom算子工程,介绍了基于ATVC的单算子工程、单算子调用。 +本样例基于AddCustom算子工程,介绍了基于ATVC的PyTorch工程及调用。 ## 目录结构介绍 ``` ├── add -│ ├── add_custom_impl.h // 通过pytroch调用的方式调用Add算子 -│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 -│ ├── run_op.py // pytorch的测试用例 +│ ├── add_custom_impl.h // 通过PyTorch调用的方式调用Add算子 +│ ├── pytorch_ascendc_extension.cpp // PyTorch调用入口 +│ ├── run_op.py // PyTorch的测试用例 │ └── run.sh // 脚本,编译需要的二进制文件,并测试 ``` @@ -20,86 +20,53 @@ z = x + y 算子类型(OpType)Add 算子输入nameshapedata typeformat -x8 * 2048float,intND -y8 * 2048float,intND +x8 * 2048int32_t、floatND +y8 * 2048int32_t、floatND -算子输出z8 * 2048float,intND +算子输出z8 * 2048int32_t、floatND 核函数名AddCustom -## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 编译运行样例算子 -针对pytorch算子,编译运行包含如下步骤: -- 完成算子pytorch入口和impl文件的实现; -- 编译pytorch算子的二进制文件; -- 调用执行pytorch算子; 
+针对PyTorch算子,编译运行包含如下步骤: +- 完成算子PyTorch入口和impl文件的实现; +- 编译PyTorch算子的二进制文件; +- 调用执行PyTorch算子; 详细操作如下所示。 ### 1. 获取源码包及环境配置 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 -### 2. 安装pytorch环境 +### 2. 安装PyTorch环境 参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 -### 3. 基于ATVC编写pytorch算子的实现 - - 编写kernel侧函数,参考[add_custom_impl.h](./add_custom_impl.h) - ```cpp - // 引入头文件 - #include "elewise/elewise_host.h" - #include "elewise/elewise_device.h" +### 3. 基于ATVC编写PyTorch算子的实现 + - 算子kernel侧实现 + + 编写kernel侧函数,完成指定的运算。参考[add_custom_impl.h](./add_custom_impl.h)和[开发指南](../../../docs/02_developer_guide.md)完成核函数的实现。 - // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下 - using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; - using AddOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; - - // 实现算子计算逻辑 - // 传入编译态参数ATVC::OpTraits - template - struct AddComputeFunc { - /* - 函数说明: z = x + y - 参数说明: - x : 参与运算的输入 - y : 参与运算的输入 - z : 参与运算的输出 - */ - template - // 重载operator,提供给算子模板类调用 - __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { - AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过z.GetSize()获取单次计算的元素数量 - } - }; - - //实现核函数 - template - __global__ __aicore__ void AddCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param) - { - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); - // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 - auto op = ATVC::Kernel::EleWiseOpTemplate>(); - op.Run(x, y, z, ¶m); - } - ``` - - 编写pytorch入口函数,并调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) + - 编写PyTorch入口函数,并通过`<<<>>>`调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) ```cpp at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) { + // 运行资源申请,通过c10_npu::getCurrentNPUStream()的函数获取当前NPU上的流 auto stream = c10_npu::getCurrentNPUStream().stream(false); + // 分配Device侧输出内存 + at::Tensor z = at::empty_like(x); ATVC::EleWiseParam param; int32_t totalLength = 1; for (int32_t size : x.sizes()) { totalLength *= size; } (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 使用<<<>>方式调用核函数完成指定的运算 AddCustom<<>>( (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + return z; } ``` - - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) ```py # 引入头文件 @@ -107,16 +74,18 @@ z = x + y import torch_npu import numpy as np from torch_npu.testing.testcase import TestCase, run_tests - # 加载二进制 + # 加载二进制文件 torch.npu.config.allow_internal_format = False torch.ops.load_library('./libascendc_pytorch.so') class TestAscendCOps(TestCase): # 测试用例 def test_add_custom_ops_float(self): + # 分配Host侧输入内存,并进行数据的初始化 length = [8, 2048] x = torch.rand(length, device='cpu', dtype=torch.float32) y = torch.rand(length, device='cpu', dtype=torch.float32) + # 分配Device侧内存,并将数据从Host上拷贝到Device上 npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) cpuout = torch.add(x, y) self.assertRtolEqual(npuout, cpuout) @@ -125,9 +94,46 @@ z = x + y if __name__ == '__main__': run_tests() ``` + - 编译和测试脚本,参考[run.sh](./run.sh) + + ```sh + # 获取torch、torch_npu、python的lib和include路径和atvc的路径 + torch_location=... + torch_npu_location=... + python_include=... + python_lib=... 
+ if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) + else + atvc_path=$ATVC_PATH + fi + + # 使用bishengcc进行编译PyTorch算子 + bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared -### 4. 基于ATVC编写pytorch算子的调用验证 - - 调用脚本,生成pytorch算子,并运行测试用例 + # 执行测试用例 + python3 run_op.py + ``` + +### 4. 基于ATVC编写PyTorch算子的调用验证 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` + - 调用脚本,生成PyTorch算子,并运行测试用例 ```bash $ cd ./atvc/examples/ops_pytorch/add $ bash run.sh diff --git a/atvc/examples/ops_pytorch/reduce_sum/README.md b/atvc/examples/ops_pytorch/reduce_sum/README.md index 23175eb3de04e4f1a31ecab53173f9c59999663a..d154ffb5cdf69abe77c83bdd57c7c832e08b0a39 100644 --- a/atvc/examples/ops_pytorch/reduce_sum/README.md +++ b/atvc/examples/ops_pytorch/reduce_sum/README.md @@ -1,86 +1,58 @@ ## 概述 -本样例基于ReduceSum算子,介绍了基于ATVC的单算子工程、单算子调用。 +本样例基于ReduceSum算子,介绍了基于ATVC的PyTorch工程及调用。 ## 目录结构介绍 ``` ├── reduce_sum -│ ├── reduce_sum_impl.h // 通过pytroch调用的方式调用ReduceSum算子 -│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 -│ ├── run_op.py // pytorch的测试用例 +│ ├── reduce_sum_impl.h // 通过PyTorch调用的方式调用ReduceSum算子 +│ ├── pytorch_ascendc_extension.cpp // PyTorch调用入口 +│ ├── run_op.py // PyTorch的测试用例 │ └── run.sh // 脚本,编译需要的二进制文件,并测试 ``` ## 算子描述 -Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: -``` -z = x + y -``` +ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + ## 算子规格描述 - + - +
<table>
<tr><td>算子类型(OpType)</td><td colspan="4">ReduceSum</td></tr>
<tr><td>算子输入</td><td>name</td><td>shape</td><td>data type</td><td>format</td></tr>
-<tr><td></td><td>x</td><td>8 * 2048</td><td>float,int</td><td>ND</td></tr>
+<tr><td></td><td>x</td><td>8 * 2048</td><td>int32_t、float</td><td>ND</td></tr>
-<tr><td>算子输出</td><td>y</td><td>8 * 2048</td><td>float,int</td><td>ND</td></tr>
+<tr><td>算子输出</td><td>y</td><td>8 * 2048</td><td>int32_t、float</td><td>ND</td></tr>
<tr><td>核函数名</td><td colspan="4">ReduceSumCustom</td></tr>
</table>
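+
+下面给出一个最小的PyTorch参考示例(仅用于说明计算语义,非本样例代码),展示对第0轴做规约累加的预期结果,可作为CPU侧的对比基准:
+
+```python
+import torch
+
+x = torch.rand(8, 2048, dtype=torch.float32)
+# 对第0轴累加,得到与ReduceSum算子对应的CPU参考结果
+golden = torch.sum(x, dim=(0,))
+```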
-## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 编译运行样例算子 -针对pytorch算子,编译运行包含如下步骤: -- 完成算子pytorch入口和impl文件的实现; -- 编译pytorch算子的二进制文件; -- 调用执行pytorch算子; +针对PyTorch算子,编译运行包含如下步骤: +- 完成算子PyTorch入口和impl文件的实现; +- 编译PyTorch算子的二进制文件; +- 调用执行PyTorch算子; 详细操作如下所示。 ### 1. 获取源码包及环境配置 -编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 -### 2. 安装pytorch环境 -参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 + 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 +### 2. 安装PyTorch环境 + 参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 -### 3. 基于ATVC编写pytorch算子的实现 - - 编写kernel侧函数,参考[reduce_sum_impl.h](./reduce_sum_impl.h) - ```cpp - // 引入头文件 - #include "reduce/reduce_host.h" - #include "reduce/reduce_device.h" - - // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下: - using ReduceOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; - using ReduceOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; - - /* - * 该函数为ReduceSumCustom算子核函数入口 - * x Device上的gm地址,指向Add算子第一个输入 - * y Device上的gm地址,指向Add算子第一个输出 - * reduceParam ATVC::ReduceParam - */ - template - __global__ __aicore__ void ReduceSumCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::ReduceParam reduceParam) - { - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 - // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 - // ATVC::ReduceParam* 为tiling的类型 - auto op = ATVC::Kernel::ReduceOpTemplate, - Policy>(); - op.Run(x, y, &reduceParam); - } - ``` - - 编写pytorch入口函数,并调用kernel侧函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) +### 3. 基于ATVC编写PyTorch算子的实现 + - 算子kernel侧实现 + + 编写kernel侧函数,完成指定的运算。参考[reduce_sum_impl.h](./reduce_sum_impl.h)和[开发指南](../../../docs/02_developer_guide.md)完成核函数的实现。 + + - 编写PyTorch入口函数,并通过`<<<>>>`调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) ```cpp - // pytorch 入口函数 at::Tensor op_reduce_sum(const at::Tensor &x, const std::vector &dim) - { + { std::vector shapeIn; std::vector shapeOut; - ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReduceParam param; + // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::ReducePolicy policy = {-1, -1, -1}; for (int32_t size : x.sizes()) { shapeIn.push_back(size); shapeOut.push_back(size); @@ -89,6 +61,7 @@ z = x + y shapeOut[i] = 1; } auto options = torch::TensorOptions().dtype(x.scalar_type()).device(x.device()); + // 分配Device侧输出内存 at::Tensor y = at::empty(shapeOut, options); if (x.scalar_type() == at::kFloat) { // Host侧调用Tiling API完成相关运行态参数的运算 @@ -109,6 +82,7 @@ z = x + y // 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 void ReduceOpAdapter(uint8_t *x, uint8_t *y, ATVC::ReduceParam param, ATVC::ReducePolicy &policy) { + // 运行资源申请,通过c10_npu::getCurrentNPUStream()的函数获取当前NPU上的流 auto stream = c10_npu::getCurrentNPUStream().stream(false); // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 @@ -121,7 +95,7 @@ z = x + y } // namespace reduce } // namespace AscendC ``` - - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + - 编写python调用函数,并调用PyTorch入口函数,参考[run_op.py](./run_op.py) ```python # 引入头文件 @@ -136,8 +110,10 @@ z = x + y class TestAscendCOps(TestCase): # 测试用例 def test_reduce_sum_ops_float(self): + # 分配Host侧输入内存,并进行数据的初始化 length = [8, 2048] x = torch.rand(length, device='cpu', dtype=torch.float32) + # 分配Device侧内存,并将数据从Host上拷贝到Device上 npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) cpuout = torch.sum(x, (0,)) self.assertRtolEqual(npuout.reshape(cpuout.shape), 
cpuout) @@ -146,9 +122,46 @@ z = x + y if __name__ == '__main__': run_tests() ``` + - 编译和测试脚本,参考[run.sh](./run.sh) -### 4. 基于ATVC编写pytorch算子的调用验证 - - 调用脚本,生成pytorch算子,并运行测试用例 + ```sh + # 获取torch、torch_npu、python的lib和include路径和atvc的路径 + torch_location=... + torch_npu_location=... + python_include=... + python_lib=... + if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) + else + atvc_path=$ATVC_PATH + fi + + # 使用bishengcc进行编译PyTorch算子 + bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + + # 执行测试用例 + python3 run_op.py + ``` + +### 4. 基于ATVC编写PyTorch算子的调用验证 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` + - 调用脚本,生成PyTorch算子,并运行测试用例 ```bash $ cd ./atvc/examples/ops_pytorch/reduce_sum $ bash run.sh diff --git a/atvc/examples/reduce_sum/README.md b/atvc/examples/reduce_sum/README.md index 85f555c302e8176cc2bfa9a64133fa21ab417459..ca4cc75534f7951d0be23ca6e115babd31aa91ef 100644 --- a/atvc/examples/reduce_sum/README.md +++ b/atvc/examples/reduce_sum/README.md @@ -37,8 +37,8 @@ ReduceSum算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh reduce_sum +$ cd ./atvc/examples +$ bash run_examples.sh reduce_sum ... Generate golden data successfully. ... diff --git a/atvc/examples/reduce_sum/reduce_sum.cpp b/atvc/examples/reduce_sum/reduce_sum.cpp index cf1fe4cf5203a5a4130f73a4bb7163735ac378c2..d00164b051fd832c3ec557bee6a4c0796cf45cc6 100644 --- a/atvc/examples/reduce_sum/reduce_sum.cpp +++ b/atvc/examples/reduce_sum/reduce_sum.cpp @@ -18,53 +18,12 @@ #include "acl/acl.h" #include "reduce/reduce_host.h" #include "reduce/reduce_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - // ReduceSum算子的描述:一个输入,一个输出,类型均为float using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! 
The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) { CHECK_ACL(aclrtFree(xDevice)); diff --git a/atvc/tests/run_test.sh b/atvc/examples/run_examples.sh similarity index 84% rename from atvc/tests/run_test.sh rename to atvc/examples/run_examples.sh index 925f8de5d23ea4510703a4f44581779c3dd98fcd..e5a6282fd368fefa57a9a9e8a0c1f89ff4b02106 100644 --- a/atvc/tests/run_test.sh +++ b/atvc/examples/run_examples.sh @@ -23,9 +23,9 @@ else fi ATVC_HOME_DIR=$CURRENT_DIR/../ -TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples|xargs) +TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples | grep -v '^run_examples.sh$' | grep -v '^ops_*' | xargs) if [ $# -lt 1 ]; then - echo "This script requires an input as the test case name. Execution example: 'bash run_test.sh [$TEST_CASE_LIST]'" + echo "This script requires an input as the test case name. Execution example: 'bash run_examples.sh [$TEST_CASE_LIST]'" exit 1 fi TEST_NAME=$1 @@ -46,16 +46,16 @@ function compile_operator(){ cd $ATVC_HOME_DIR/examples/$TEST_NAME if [ -z "$RUN_MODE" ]; then echo "Executing with npu mode" - ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME --include-path ${ATVC_HOME_DIR}/include + ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common elif [ "$RUN_MODE" = "debug_print" ]; then echo "Executing with debug_print mode" - ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME --include-path ${ATVC_HOME_DIR}/include -DATVC_DEBUG_MODE=1 + ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -DATVC_DEBUG_MODE=1 elif [ "$RUN_MODE" = "profiling" ]; then echo "Executing with profiling mode" - ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME --include-path ${ATVC_HOME_DIR}/include -DATVC_DEBUG_MODE=2 + ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -DATVC_DEBUG_MODE=2 else echo "--npu-mode is an optional parameter and can be left unset. If set, the value must be debug_print or profiling." - echo "Execution example: 'bash run_test.sh $TEST_NAME --run-mode=debug_print'" + echo "Execution example: 'bash run_examples.sh $TEST_NAME --run-mode=debug_print'" exit 1 fi } @@ -81,6 +81,6 @@ if [[ " $TEST_CASE_LIST " == *" ${TEST_NAME} "* ]]; then fi cd ${ATVC_HOME_DIR} else - echo "Error: Cannot find '$TEST_NAME' in ${ATVC_HOME_DIR}examples. Execution example: 'bash run_test.sh [$TEST_CASE_LIST]'" + echo "Error: Cannot find '$TEST_NAME' in ${ATVC_HOME_DIR}examples. Execution example: 'bash run_examples.sh [$TEST_CASE_LIST]'" exit 1 fi \ No newline at end of file diff --git a/atvc/examples/sinh_custom/README.md b/atvc/examples/sinh_custom/README.md index ec7f8a45713ff32f9cef327769257d5dbaae3d56..4c007f46568343801abcbb550b2e97adfc040799 100644 --- a/atvc/examples/sinh_custom/README.md +++ b/atvc/examples/sinh_custom/README.md @@ -37,8 +37,8 @@ SinhCustom算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh sinh_custom +$ cd ./atvc/examples +$ bash run_examples.sh sinh_custom ... 
Generate golden data successfully. ... diff --git a/atvc/examples/sinh_custom/sinh_custom.cpp b/atvc/examples/sinh_custom/sinh_custom.cpp index b8a18a08b849b7805574b5d5eabc27d7c6fa6c42..d8e0914e03cc23e8169a5413b4a65d3eca0238b8 100644 --- a/atvc/examples/sinh_custom/sinh_custom.cpp +++ b/atvc/examples/sinh_custom/sinh_custom.cpp @@ -18,27 +18,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - // 描述算子的输入输出以及临时计算资源 using SinhOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -75,36 +57,6 @@ void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} } /* @@ -166,7 +118,7 @@ int main() CHECK_ACL(aclrtFree(yDevice)); CHECK_ACL(aclrtFreeHost(yHost)); - CleanACL(stream, deviceId); + CleanACL(context, stream, deviceId); if (!VerifyResults(golden, outputY)) { return -1; diff --git a/atvc/examples/tanh_grad/README.md b/atvc/examples/tanh_grad/README.md index 5136b68b090b646c08912bbc2d3f24e91d3f9674..147de3cf108c767a189e18e5d29277ec591140b9 100644 --- a/atvc/examples/tanh_grad/README.md +++ b/atvc/examples/tanh_grad/README.md @@ -33,10 +33,10 @@ Tanh算子规格: ## 算子基本功能验证 -在代码仓目录下执行: +执行命令如下: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh tanh_grad +$ cd ./atvc/examples +$ bash run_examples.sh tanh_grad ... Accuracy verification passed. ``` @@ -45,10 +45,10 @@ Accuracy verification passed. 样例提供的主要调试调优方式如下: - 使用`ATVC::Host::EleWiseTilingHyperParam`构建超参对`ATVC::Host::CalcEleWiseTiling()`接口实现Tiling调优 - 使用`--run-mode=debug_print`进行DFX信息打印: -在代码仓目录下执行: +执行命令如下: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh tanh_grad --run-mode=debug_print +$ cd ./atvc/examples +$ bash run_examples.sh tanh_grad --run-mode=debug_print ... [INFO]:[ATVC][EleWise]Start to run Template Fuction. ... @@ -62,15 +62,15 @@ Accuracy verification passed. ``` - 使用`--run-mode=profiling`开启Profiling,获取性能数据: -在代码仓目录下执行: +执行命令如下: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh tanh_grad --run-mode=profiling +$ cd ./atvc/examples +$ bash run_examples.sh tanh_grad --run-mode=profiling ... [INFO] Start Profiling ... ... [INFO] Process profiling data complete, Data is saved in /xxx_path -Accuracy verification passed. +... 
``` -更多详细的调试调优介绍参考[ATVC开发指南](../../docs/2_developer_guide.md)的`ATVC的调试调优功能`章节 \ No newline at end of file +更多详细的调试调优介绍参考[ATVC开发指南](../../docs/02_developer_guide.md)的`ATVC的调试调优功能`章节 \ No newline at end of file diff --git a/atvc/examples/tanh_grad/tanh_grad.cpp b/atvc/examples/tanh_grad/tanh_grad.cpp index 7f8693d6050f1355ef641e09bbcbaa7120a8548e..c444909fa1c31e95c378933c718a7c2adae80aab 100644 --- a/atvc/examples/tanh_grad/tanh_grad.cpp +++ b/atvc/examples/tanh_grad/tanh_grad.cpp @@ -17,19 +17,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - struct MemoryPtrs { uint8_t *zHost; uint8_t *dyDevice; @@ -38,14 +28,6 @@ struct MemoryPtrs { uint8_t *paramDevice; }; -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - void InitializeData(int32_t eleNum, std::vector &inputDy, std::vector &inputY, std::vector &golden) { std::random_device rd; @@ -62,36 +44,6 @@ void InitializeData(int32_t eleNum, std::vector &inputDy, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&zHost, uint8_t *&dyDevice, uint8_t *&yDevice, uint8_t *&zDevice) { CHECK_ACL(aclrtFree(dyDevice)); diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h index 8aa2755f5e13c14939e7966fdf4fe4ea66736fd6..1ec0d6b1c285545dbe596d01a93ea52e5072215f 100644 --- a/atvc/include/broadcast/broadcast_op_template.h +++ b/atvc/include/broadcast/broadcast_op_template.h @@ -369,7 +369,7 @@ private: return; } AscendC::DataCopyPad(input, srcGlobal_[copyInOffset], copyInParams, padParams); - ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast][CopyIn] Offset is %u, block len is %u " + ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast][CopyIn] Offset is %u, block len is %u " "block count is %u.\n", copyInOffset, copyInParams.blockLen, copyInParams.blockCount); } @@ -402,7 +402,7 @@ private: return; } AscendC::DataCopyPad(dstGlobal_[copyOutOffset], output, copyOutParams); - ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast][CopyOut] Offset is %u, block len is %u block count is %u.\n", + ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast][CopyOut] Offset is %u, block len is %u block count is %u.\n", copyOutOffset, copyOutParams.blockLen, copyOutParams.blockCount); } diff --git a/atvc/include/reduce/common/reduce_common.h 
b/atvc/include/reduce/common/reduce_common.h index 44ceb71e17fcaa9aa12a1cb83d4f433f49d441ca..480c04d2bfa53a537cc15a400d3d84265716ecb1 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -25,8 +25,8 @@ enum ShapeDim { DIM_7, DIM_8, DIM_9, - DIM_REDUCE, // Reduce轴 - DIM_BROADCAST // Broadcast轴 + DIM_REDUCE, // Reduce axis + DIM_BROADCAST // Broadcast axis }; namespace AR_PATTERN { diff --git a/atvc/include/reduce/reduce_host.h b/atvc/include/reduce/reduce_host.h index 0ab6bc9532c6f43f27e5f5d99b65e10cfd9c9260..93348539ae7f76c006edbe9d216d271fda8ffa4d 100644 --- a/atvc/include/reduce/reduce_host.h +++ b/atvc/include/reduce/reduce_host.h @@ -20,10 +20,10 @@ namespace ATVC { namespace Host { -/** - * @brief Validate the legitimacy of reduce tiling hyper param - * @param [in] hyperParam, reduce tiling hyper param - * @return bool result, return true if the hyper param is valid, otherwise return false. +/*! + * \brief Validate the legitimacy of reduce tiling hyper param + * \param [in] hyperParam, reduce tiling hyper param + * \return bool result, return true if the hyper param is valid, otherwise return false. */ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) { @@ -31,6 +31,7 @@ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) constexpr uint32_t MIN_BASE_LINE = 48 * 1024U; constexpr float MAX_INNER_A = 256; constexpr float MIN_INNER_A = 128; + constexpr float EPSILON = 1e-6f; constexpr float MIN_THRESH_HOLD = 0.8f; constexpr float MAX_THRESH_HOLD = 0.95f; if(hyperParam.basicBlock > MAX_BASE_LINE || hyperParam.basicBlock < MIN_BASE_LINE) { @@ -43,8 +44,8 @@ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) "maxInnerA(%u) must be in [128, 256].\n", hyperParam.maxInnerA); return false; } - if(hyperParam.balanceThreshHold > MAX_THRESH_HOLD || hyperParam.balanceThreshHold < MIN_THRESH_HOLD) { - printf("[ERROR]: [ATVC][EleWise] Tiling hyperParam is invalid: nBufferNum(%f) must be in [0.8, 0.96].\n", + if(hyperParam.balanceThreshHold > MAX_THRESH_HOLD + EPSILON || hyperParam.balanceThreshHold + EPSILON < MIN_THRESH_HOLD) { + printf("[ERROR]: [ATVC][Reduce] Tiling hyperParam is invalid: balanceThreshHold(%f) must be in [0.8, 0.95].\n", hyperParam.balanceThreshHold); return false; } @@ -52,13 +53,13 @@ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) return true; } -/** - * @brief 计算Reduce的TilingData和策略参数 - * @param inputShape 输入张量的形状。 - * @param reduceDim 需要进行Reduce操作的具体维度。 - * @param policy 输出参数。 - * @param param 输出参数。 - * @return bool 返回true表示计算成功,false表示失败。 +/*! + * \brief Calculate the TilingData and policy parameters for Reduce. + * \param[in] inputShape, shape of the tensor. + * \param[in] reduceDim, The dim that requires a Reduce operation. + * \param[out] policy, static policy of Reduce Template + * \param[out] param, dynamic param of Reduce Template + * \return bool Return true to indicate calculation success, false to indicate failure. */ template bool CalcReduceTiling(std::vector inputShape, diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 1c77be6f25c579b15500a54feb65c9d5f5629c28..fe466a0999aa0b16dad341a00b6d50fe00134ebe 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -30,6 +30,12 @@ namespace ATVC { namespace Kernel { +/*! + * ReduceOpTemplate Generic Reduce operator template. 
+ * Reduce operators usually refer to operators that perform reduction operations on elements in tensors, + * such as summation and averaging. They can specify several dimensions for reduction calculations, + * or reduce all elements to a scalar. + */ template class ReduceOpTemplate { @@ -49,19 +55,25 @@ public: constexpr static uint32_t PROMOTE_BUF_SIZE = KernelUtils::GetComputeCount(); AscendC::LocalTensor tempBuf_; AscendC::LocalTensor computeRes_; - // 算子开发者传入的计算对象 + // The calculation object passed in by user ReduceCompute compute_; public: __aicore__ inline ReduceOpTemplate() {} - // 按照输入、输出、ReduceParam、其他标量的顺序传入 - // 内部根据ReduceParam进行数据调度并调用ReduceOpTemplate完成计算后搬出到GM + /*! + * \brief The input order is: input tensor, output tensor, ReduceParam, Other scalars. + * Internally schedule data based on ReduceParam and call ReduceOpTemplate to complete + * the calculation before moving it out to GM. + * \param[in] x, GM address of the input tensor. + * \param[in] y, GM address of the output tensor. + * \param[in] param, tiling data and policy. + * \return void. + */ template __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param) { ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Start to run template function.\n"); param_ = param; - // 完成入参校验打印 KernelUtils::PrintParam(param_); if (!KernelUtils::CheckParam(param_)) { return; @@ -72,6 +84,11 @@ public: } public: + /*! + * \brief Initialise all pipes, queues and buffers. + * \param[in] workspace, GM address for the workspace buffer. + * \param[in] args, GM addresses for input/output tensors. + */ template __aicore__ inline void Init(GM_ADDR workspace, Args... args) { @@ -90,18 +107,32 @@ public: pipe_->InitBuffer(tempUbQue_, BLOCK_SIZE_BYTE); } + /*! + * \brief Allocate a tensor from the internal buffer pool. + * \tparam IsInput, true – tensor will be used as input (read-only) + * false – tensor will be used as output (read-write) + * \tparam needDup, if true the buffer is duplicated (for double-buffering). + * \param[in] tensor, LocalTensor reference that receives the allocation. + */ template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { bufPool_.AllocTensor(tensor); } + /*! + * \brief Release a tensor back to the buffer pool. + * \param[in] tensor, LocalTensor to free. + */ template __aicore__ inline void FreeTensorAux(const AscendC::LocalTensor& tensor) { bufPool_.FreeTensor(tensor); } + /*! + * \brief Execute the reduction schedule. + */ template __aicore__ inline void Process(Args... args) { @@ -118,7 +149,7 @@ public: SchTypeA op(this, input_, output_, &(this->param_)->tilingData); op.Process(args...); } else { - // 完成第一阶段的Reduce + // Complete the first phase of Reduce using SchTypeR = KernelUtils::Reduce::ReduceBlockAux< ATVC::ReduceTilingData, &SCH_LOOP_INFO, std::remove_reference_t, DataType, @@ -127,10 +158,10 @@ public: op.Process(args...); bufPool_.ResetEvent(); - // 全核同步 + // Full nuclear synchronization AscendC::SyncAll(); - // 完成第二阶段的Reduce + // Complete the second phase of Reduce bufPool_.template ResetInputSize(3); constexpr static ReduceSchLoopInfo groupSchLoopInfo = KernelUtils::Reduce::GetGroupSchLoopInfo(); ATVC::ReduceTilingData groupTiling; @@ -143,6 +174,11 @@ public: } } + /*! + * \brief Populate tiling data for the second (group) reduction phase. + * \param[in] groupTiling, Tiling structure to be filled. + * \return void. 
+ */ __aicore__ inline void SetGroupTiling(ATVC::ReduceTilingData& groupTiling) { groupTiling.ubFactorA = ELEMENT_ONE_REPEAT_COMPUTE; @@ -162,6 +198,15 @@ public: static_cast(64)); // 按照64核计算,需要tiling传 } + /*! + * \brief Copy input tensor to UB with optional padding. + * \tparam isPadding, true – perform padding using PreCompute::GetPaddingValue + * false – no padding + * \param[in] src, GlobalTensor source in GM. + * \param[in] view, View descriptor describing the copy geometry. + * \param[in] shape, Shape descriptor (modified when padding). + * \param[in] ubTensor, LocalTensor destination in UB. + */ template __aicore__ inline void CopyInAux(const AscendC::GlobalTensor &src, ViewDescT &view, ShapeDescT &shape, @@ -199,8 +244,9 @@ public: static_cast(view.axis[CONST6].srcStride)}; int32_t total = 1; - for (int32_t i = 0; i < CONST6; ++i) + for (int32_t i = 0; i < CONST6; ++i) { total *= repeats[i]; + } for (int32_t idx = 0; idx < total; ++idx) { int32_t tmp = idx; @@ -216,6 +262,11 @@ public: } } + /*! + * \brief Copy input tensor directly to output when no reduction is required. + * \param void. + * \return void. + */ __aicore__ inline void CopyInput2Output() { uint32_t shapeSize = 1; @@ -318,7 +369,7 @@ protected: } private: - ATVC::ReduceParam* param_; // CalcReduceTiling API计算出的运行态参数 + ATVC::ReduceParam* param_; // The runtime parameters calculated by CalcReduceTiling API AscendC::TPipe* pipe_; AscendC::TBuf<> oriVecQue_; AscendC::TBuf<> tempResQue_; diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index 36c71a8c43c03d82b41181431802cd08114d49eb..08a24bf08d4101cf99dcf6a3aebe1b77b4e89e68 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -31,9 +31,14 @@ struct ReduceARParam { } namespace ATVC { -// OpTraits: 算子描述的ATVC::OpTraits结构体 +/*! + * ReduceSumCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ template -// 计算模板,不感知数据从GM到UB上的搬运 class ReduceSumCompute { public: // 从OpTraits中萃取算子输入描述信息 @@ -42,22 +47,34 @@ public: using PrompteDtype = typename KernelUtils::GetPromoteType::T; __aicore__ inline ReduceSumCompute() {} + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, True when UB alignment introduced invalid lanes. + * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. + * \param[out] dst, Destination tensor (length == dimA) + * \param[in] src, Source tensor (length == dimA * dimR) + */ template __aicore__ inline void Compute(KernelUtils::Shape<2> &shape, const AscendC::LocalTensor &dst, const AscendC::LocalTensor &src) { - // AR场景,硬件限制,R轴需要做UB上32B对齐,对齐方式有2种: - // 1. 高性能对齐(补充元素值不确定), 后续累加计算只能计算实际有效的元素个数 - // 2. 补0对齐(补值是由用户实现的GetPaddingValue()接口决定的) + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. 
Alignment with zero padding (padding value is determined by the GetPaddingValue() interface
+        //    implemented by the user)
        if (std::is_same::value) {
-            if constexpr (needMask) { // 1. 高性能对齐模式
-                // MainR(int64_t dimR, bool isAR): 框架提供的计算R轴二分长度(元素个数), dimR为原始的元素个数
+            if constexpr (needMask) { // 1. High performance alignment mode
+                // MainR(int64_t dimR, bool isAR): framework helper that computes the binary-split length
+                // of the R axis (in elements), where dimR is the original number of elements
                int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true);
                ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen);
            } else {
-                // MainR:框架提供的计算R轴二分长度(元素个数),dimR为补齐后的元素个数
+                // MainR: framework helper that computes the binary-split length of the R axis (in elements),
+                // where dimR is the number of elements after padding
                int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true);
                ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]);
            }
@@ -68,6 +85,14 @@ public:
        }
    }

+    /*!
+     * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis.
+     * \param[out] dst, Output tensor (length == dimA)
+     * \param[in] src, Input tensor (length == dimR * dimA), already resident in UB
+     * \param[in] dimA, Length of the non-reduced axis (A)
+     * \param[in] dimR, Length of the reduced axis (R)
+     * \param[in] mainR, Largest power-of-two ≤ dimR (computed by the caller)
+     */
    __aicore__ inline void ReduceRA(const AscendC::LocalTensor &dst,
                                    const AscendC::LocalTensor &src,
                                    uint16_t dimA,
@@ -77,43 +102,45 @@
        uint32_t mainNum = dimA * mainR;
        constexpr uint32_t dtypeSize = sizeof(PrompteDtype);
        uint32_t tailNum = totalNum - mainNum;
-        // add mask最大值为256 bytes 且要满足32bytes对齐
+        // The Add mask covers at most 256 bytes and must be 32-byte aligned
        constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32;
-        // 处理tail
        uint16_t repeatTimes = tailNum / maskAddNum;
        uint16_t repeatNum = repeatTimes * maskAddNum;
        uint16_t repTailNum = tailNum - repeatNum;
-        uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; // 不同迭代间同一datablock步长
+        // Same data block step size between different iterations
+        uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
        // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
        AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride);
        if (repeatTimes > 0) {
            AscendC::Add(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams);
        }
        if (repTailNum > 0) {
-            repStride = dtypeSize * repTailNum / UB_ALIGN_32; // 不同迭代间同一datablock步长
+            // Same data block step size between different iterations
+            repStride = dtypeSize * repTailNum / UB_ALIGN_32;
            repeatParams.dstRepStride = repStride;
            repeatParams.src0RepStride = repStride;
            repeatParams.src1RepStride = repStride;
            AscendC::Add(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams);
        }
        AscendC::PipeBarrier();
-        // 二分主体
        uint16_t loopRNum = mainR;
        while (loopRNum > 1) {
            loopRNum = loopRNum >> 1;
-            mainNum = loopRNum * dimA; // LoopR的前半部分数据量
+            mainNum = loopRNum * dimA; // Element count of the first half of the loopR range
            repeatTimes = mainNum / maskAddNum;
            repeatNum = repeatTimes * maskAddNum;
            repTailNum = mainNum - repeatNum;
            if (repeatTimes > 0) {
-                repStride = dtypeSize * maskAddNum / UB_ALIGN_32; // 不同迭代间同一datablock步长
+                // Same data block step size between different iterations
+                repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
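+                // Keep dst/src0/src1 repeat strides identical so every repeat advances by one maskAddNum-element block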
repeatParams.dstRepStride = repStride; repeatParams.src0RepStride = repStride; repeatParams.src1RepStride = repStride; AscendC::Add(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); } if (repTailNum > 0) { - repStride = dtypeSize * repTailNum / UB_ALIGN_32; // 不同迭代间同一datablock步长 + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; repeatParams.dstRepStride = repStride; repeatParams.src0RepStride = repStride; repeatParams.src1RepStride = repStride; @@ -124,6 +151,15 @@ public: AscendC::DataCopy(dst, src, dimA); } + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, Output tensor (length == dimA) + * \param[in] srcTensor, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Padded length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ original R length + * \param[in] oriBurstLen, Original (un-padded) R length used to compute tail + */ __aicore__ inline void ReduceAR(const AscendC::LocalTensor &dstTensor, const AscendC::LocalTensor &srcTensor, uint16_t dimA, @@ -150,13 +186,12 @@ public: PerformInitialAdd(srcTensor, param); } - // 二分计算 param.loopRNum = mainR; while (param.loopRNum > maskAddRNum) { - param.loopRNum = param.loopRNum / 2; // 除2二分 + param.loopRNum = param.loopRNum / 2U; PerformBinaryReduction(srcTensor, param); } - if (param.loopRNum == 0) { // small shape, 直接reduce + if (param.loopRNum == 0) { // small shape, directly reduce param.loopRNum = tailR; } PerformFinalReduction(dstTensor, srcTensor, param); @@ -170,17 +205,16 @@ public: int64_t dimA = Pattern::TailA ? shape.value[1] : shape.value[0]; int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; - // count A轴的大小 * VL uint16_t outerLoopTimes = OpsUtils::CeilDiv( static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); uint16_t innerLoopTimes = cacheID; uint32_t outerLoopStride = element_one_repeat; - uint32_t innerLoopStride = stride; // cacahe的每一个idex的块的大小, A轴的大小 + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis AscendC::LocalTensor dstTensor = tempBuf; AscendC::LocalTensor srcTensor = computeRes; uint32_t cah = cacheID * stride; - for (uint16_t i = 0; i < outerLoopTimes; ++i) { // outerLoopTimes是dimA的大小 + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA uint32_t srcIdx = i * outerLoopStride; for (uint16_t j = 0; j < innerLoopTimes; ++j) { AscendC::Add(srcTensor[srcIdx], srcTensor[srcIdx], @@ -192,6 +226,13 @@ public: } } + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, Left operand (in-place result). + * \param[in] ubTensorRight, Right operand (read-only). + * \param[in] calCount, Number of elements to reduce. + */ __aicore__ inline void ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, const AscendC::LocalTensor &ubTensorRight, @@ -200,10 +241,18 @@ public: Add(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); } + /*! + * \brief Return the value used for padding when UB alignment is required. + * For SUM-reduction the neutral element is 0. + * \tparam U, Scalar type identical to DataType or PromoteDataType. 
+ * \return The padding value (always 0). + */ template - __aicore__ inline U GetPaddingValue() // 设置框架内每一次搬入UB的数据对齐补充的值 + __aicore__ inline U GetPaddingValue() { - U paddingValue = 0; // 由于ReduceSum是累加R轴数据,补齐的元素值设为0,才能保证累加的结果不受影响 + // Due to the fact that ReduceSum accumulates R-axis data, the values of the supplemented elements + // are set to 0 to ensure that the accumulated result is not affected + U paddingValue = 0; return paddingValue; } @@ -216,7 +265,7 @@ private: AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); if (param.repStride > UB_ALIGN_255) { - for (uint16_t i = 0; i < param.dimMax; i += param.mainR) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); } } else { @@ -268,7 +317,7 @@ private: if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; - // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + // WholeReduceSum repeat-time limit is 255; split dimA into chunks for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; AscendC::WholeReduceSum( @@ -277,15 +326,14 @@ private: AscendC::PipeBarrier(); } else if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { - // 尽量二分add到最后32bytes - // int32 -> float 都是4字,一把cast 用CAST_NONE + // Cast to float for higher-precision accumulation AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); AscendC::PipeBarrier(); uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; - // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + // WholeReduceSum repeat-time limit is 255; split dimA into chunks for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; AscendC::WholeReduceSum( diff --git a/atvc/include/reduce/tiling/reduce_tiling.h b/atvc/include/reduce/tiling/reduce_tiling.h index ba9673558bbb104cb3929c5eab6449ca8da077c9..bcded4d2f83151dbf0a0dacc2bcb42372fcab759 100644 --- a/atvc/include/reduce/tiling/reduce_tiling.h +++ b/atvc/include/reduce/tiling/reduce_tiling.h @@ -21,32 +21,50 @@ #include "tiling_common.h" namespace OpTiling { +/*! + * ReduceOpTiling: High-level tiling engine for Ascend Reduce kernels. + * Computes cache-friendly UB blocks, multi-core split factors, loop counts and required workspace size. + * Writes everything into the user-supplied `ReduceParam` and `ReducePolicy` structures so the runtime + * can launch the kernel. + */ class ReduceOpTiling { public: -ReduceOpTiling(ReduceTilingInputParam& inputParam, - ATVC::ReducePolicy* policy, ATVC::ReduceParam* param, ATVC::Host::ReduceTilingHyperParam& hyperParam) - : param_(param), policy_(policy), opInput_(inputParam), hyperParam_(hyperParam) { + /*! + * \brief constructor that binds the engine to user-supplied output structures and input descriptor. 
+ * \param[in] inputParam, compile-time description of the Reduce op + * \param[out] policy, static policy of Reduce Template + * \param[out] param, dynamic param of Reduce Template + */ + ReduceOpTiling(ReduceTilingInputParam &inputParam, ATVC::ReducePolicy *policy, ATVC::ReduceParam *param, + ATVC::Host::ReduceTilingHyperParam &hyperParam) + : param_(param), policy_(policy), opInput_(inputParam), hyperParam_(hyperParam) + { compileInfo_ = ATVC::GetOpCompileInfo(); }; -int32_t Run() -{ - MakeWrapDim(opInput_.reduceShape, opInput_.reduceDim); - if (IsAxesValid(opInput_.reduceShape, opInput_.reduceDim) == -1) { - return -1; - } - std::vector newShape(ATVC::MAX_DIM, 1); - int32_t newShapeSize = 0; - EliminateOne(opInput_.reduceShape, opInput_.reduceDim, newShape, newShapeSize); - MergeAxis(opInput_.reduceDim, newShape, newShapeSize); - if (!DoTiling(newShape, newShapeSize)) { - printf("Do tiling failed!\n"); - return -1; + /*! + * \brief Orchestrates the full tiling workflow; returns 0 on success, ‑1 otherwise. + * \param void + * \return 0 success, 1 failure + */ + int32_t Run() + { + MakeWrapDim(opInput_.reduceShape, opInput_.reduceDim); + if (IsAxesValid(opInput_.reduceShape, opInput_.reduceDim) == -1) { + return -1; + } + std::vector newShape(ATVC::MAX_DIM, 1); + int32_t newShapeSize = 0; + EliminateOne(opInput_.reduceShape, opInput_.reduceDim, newShape, newShapeSize); + MergeAxis(opInput_.reduceDim, newShape, newShapeSize); + if (!DoTiling(newShape, newShapeSize)) { + printf("Do tiling failed!\n"); + return -1; + } + CalcWorkSpace(); + printf("ReduceOpTiling Run success!\n"); + return 0; } - CalcWorkSpace(); - printf("ReduceOpTiling Run success!\n"); - return 0; -} private: template @@ -111,9 +129,9 @@ void CalcWorkSpace() void EliminateOne(const std::vector& oriShape, std::vector& axes, std::vector& shape, int32_t& shapeSize) { - int32_t dstIdx = 1; // shape中第一个数给了1, 跳过第一个数 + int32_t dstIdx = 1; // The first number in the shape is given as 1, so the first number is skipped. for (size_t i = 0; i < axes.size(); i++) { - // 前面补了一维,所有的axes需要加1 + // The front dimension is filled, and all axes need to be increased by 1 axes[i] = axes[i] + 1; } int32_t eraseNum = 0; @@ -150,7 +168,7 @@ void MergeAxis(std::vector& axes, std::vector& shape, int32_t } s *= shape[j]; if (isRAxis1) { - // 连续的R轴, 需要擦除后续R轴的索引 + // Continuous R-axis, need to erase the index of subsequent R-axis axes.erase(iter1); } } @@ -171,7 +189,7 @@ bool DoTiling(std::vector& shape, int32_t shapeSize) switch (shapeSize) { case ATVC::CONST1: return ComputeTiling(shape); - param_->tilingData.coreNum = 1; // A 場景:不用reduce 只用1個core copydata + param_->tilingData.coreNum = 1; // Scenario A: No reduce, only one core copydata case ATVC::CONST2: return ComputeTiling(shape); case ATVC::CONST3: @@ -424,7 +442,7 @@ void ComputeUnitA(const std::vector& shape) const bool isPatternA = (Pattern::ID == ATVC::ReducePattern::PATTERN_A); const uint64_t patternABlockSize = basicBlock_ / ge::GetSizeByDataType(opInput_.inputDtype); uint64_t maxInnerA = isPatternA ? patternABlockSize : maxCacheA; - uint64_t stepLen = Pattern::ID == ATVC::ReducePattern::PATTERN_A ? A_STEP_LEN : 1; // 纯A的步长为4, 减少循环次数 + uint64_t stepLen = Pattern::ID == ATVC::ReducePattern::PATTERN_A ? A_STEP_LEN : 1; bool basicSplitA = (axisInCacheLine + (Pattern::FirstA ? 
1 : 0)) % ATVC::CONST2; uint64_t bBlockNum = basicBlock_ / ge::GetSizeByDataType(opInput_.inputDtype); uint64_t step = 1; diff --git a/atvc/include/reduce/tiling/tiling_common.h b/atvc/include/reduce/tiling/tiling_common.h index 7226dfb27b9ccbe52aaf31adf11edb25e3d52485..6d5c6dcfde5565aad549b175f371bfa2304861d6 100644 --- a/atvc/include/reduce/tiling/tiling_common.h +++ b/atvc/include/reduce/tiling/tiling_common.h @@ -9,12 +9,12 @@ * See LICENSE in the root of the software repository for the full text of the License. */ - #ifndef ATVC_TILING_COMMON_H +#ifndef ATVC_TILING_COMMON_H #define ATVC_TILING_COMMON_H #include "reduce/common/patterns.h" namespace OpTiling { -constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce +constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce constexpr static int32_t A_STEP_LEN = 4; struct ReduceTilingUnit { @@ -44,12 +44,13 @@ struct ReduceTilingInputParam { std::vector reduceShape = {}; ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED; - ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_, - ge::DataType inputDtype_, ge::DataType promoteDtpye_): - reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_){} + ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_, ge::DataType inputDtype_, + ge::DataType promoteDtpye_) + : reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_) + {} }; -void MakeWrapDim(const std::vector& shape, std::vector& axes) +void MakeWrapDim(const std::vector &shape, std::vector &axes) { // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0 size_t shapeSize = shape.size(); @@ -70,7 +71,7 @@ bool IsAxisA(int32_t idx) } } -int32_t IsAxesValid(const std::vector& shape, const std::vector& axes) +int32_t IsAxesValid(const std::vector &shape, const std::vector &axes) { size_t shapeSize = shape.size(); size_t axesSize = axes.size(); @@ -89,7 +90,7 @@ int32_t IsAxesValid(const std::vector& shape, const std::vector -bool IsEmtpyTensor(const std::vector& shape) +bool IsEmtpyTensor(const std::vector &shape) { for (int32_t i = 0; i < Pattern::Dim; i++) { if (shape[i] == 0) { @@ -99,16 +100,18 @@ bool IsEmtpyTensor(const std::vector& shape) return false; } -}; // namespace OpTiling +}; // namespace OpTiling namespace ATVC { namespace Host { // Hyper param for reduce tiling. struct ReduceTilingHyperParam { - uint32_t basicBlock = 48 * 1024; // 设置Reduce基本块内存大小, 一般不能超过内存的1/3, 建议在[48k-54k]之间设置 - uint32_t maxInnerA = 128; // [128, 256] - double balanceThreshHold = 0.85; // 多核均衡的阈值水平 [0.8-0.95] + // Set the basic block memory size for Reduce, generally not exceeding 1/3 of the memory. 
It is recommended to set
+    // it between [48k-54k]
+    uint32_t basicBlock = 48 * 1024;
+    uint32_t maxInnerA = 128; // [128, 256]
+    double balanceThreshHold = 0.85; // Threshold for multi-core load balancing [0.8-0.95]
};
-}
-}
-#endif // ATVC_TILING_COMMON_H
\ No newline at end of file
+} // namespace Host
+} // namespace ATVC
+#endif // ATVC_TILING_COMMON_H
\ No newline at end of file
diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h
index 2ffce9b6ded9180384392a8efb78661f3de96bda..a68a169fa900a1aaf21d907b525ab95973d3d6a7 100644
--- a/atvc/include/reduce/utils/reduce_block_aux.h
+++ b/atvc/include/reduce/utils/reduce_block_aux.h
@@ -75,7 +75,7 @@ public:
    struct {
        uint64_t start = 0;
-        uint64_t stride = 1; // 拷贝步长
+        uint64_t stride = 1; // Copy step size
    } iterAddr_[DIM];
    const DataType* tiling_;
@@ -127,7 +127,8 @@ public:
        int32_t rAxisIdx = LoopInfo->loopRCount - 1;
        int32_t rAxis = LoopInfo->loopRAxis[rAxisIdx];
-        loopRAxisStep_ = OpsUtils::CeilDiv(tiling_->shape[rAxis], tiling_->ubFactorR); // 切分轴Rfactor的个数
+        // Number of ubFactorR-sized steps along the split R axis
+        loopRAxisStep_ = OpsUtils::CeilDiv(tiling_->shape[rAxis], tiling_->ubFactorR);
        if constexpr (LoopInfo->loopACount > 0) {
            int32_t aAxisIdx = LoopInfo->loopACount - 1;
@@ -143,12 +144,12 @@ public:
    __aicore__ inline void Process(Args... args)
    {
        SetLoopRange();
-        // 构造UB内轴index数组
-        // 1、尾轴
-        // 2、非循环轴
-        // 3、核外A循环轴最内轴且UbFactorA > 1
-        // 4、核外R循环轴最内轴且UbFactorR > 1
-        // 5、核内R循环轴最内轴且UbFactorR > 1
+        // Construct the axis index array used inside UB:
+        // 1. Tail axis
+        // 2. Non-loop axes
+        // 3. Innermost axis of the outside-core A loop when UbFactorA > 1
+        // 4. Innermost axis of the outside-core R loop when UbFactorR > 1
+        // 5. Innermost axis of the in-core R loop when UbFactorR > 1
        if constexpr (LoopInfo->loopRCount == 0) {
            rCount_ = tiling_->factorRCntPerCore;
        } else {
@@ -232,7 +233,7 @@ public:
            SetEvent(AscendC::HardEvent::MTE2_V);
            computeTensor = ubTensor;
        } else {
-            // AllocComputeTensorAux 的index 外部不需要感知
+            // The index used by AllocComputeTensorAux does not need to be visible to the caller
            op_->ReduceOp::template AllocTensorAux(computeTensor);
            CopyIn(view, shape, ubTensor);
            SetEvent(AscendC::HardEvent::MTE2_V);
@@ -257,7 +258,8 @@ public:
            PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight);
            ComputeMerge(shape, computeLeft, computeRight, args...);
-            // fp32 tensorLeft和computeLeft是同一个tensor,fp16 computeLeft在free时不用index
+            // For FP32, tensorLeft and computeLeft are the same tensor;
+            // for FP16, computeLeft does not need an index when freed
            op_->ReduceOp::template FreeTensorAux(computeRight);
            op_->compute_.template UpdateCache(i, shape, op_->tempBuf_, op_->computeRes_);
        }
@@ -282,7 +284,7 @@ public:
        if constexpr (LoopInnerRIdx != 0) {
            constexpr auto axis = LoopInfo->loopInnerRAxis[LoopInnerRIdx - 1];
            if constexpr (LoopInnerRIdx == LoopInfo->loopInnerRCount) {
-                // 最内层循环
+                // innermost loop
                auto cur = basicBlockIdx % this->loopRAxisStep_;
                this->iterAddr_[axis].start = cur * this->ubFactorR_;
                this->iterAddr_[axis].stride = tiling_->shape[axis] - this->iterAddr_[axis].start;
@@ -304,7 +306,7 @@ public:
        if constexpr (LoopAIdx != 0) {
            constexpr auto axis = LoopInfo->loopAAxis[LoopAIdx - 1];
            if constexpr (LoopAIdx == LoopInfo->loopACount) {
-                // 切分轴
+                // Splitting axis
                auto cur = step % this->loopAAxisStep_;
                this->iterAddr_[axis].start = cur * this->ubFactorA_;
                this->iterAddr_[axis].stride = tiling_->shape[axis] - this->iterAddr_[axis].start;
@@ -365,8 +367,8 @@
public: for (int32_t i = 0; i < DIM; i++) { addrOffset += iterAddr_[i].start * tiling_->stride[i]; } - constexpr static auto burstLenAxis = DIM - 1; // 获取第一个循环轴 - view.addr = addrOffset; // 搬运地址 + constexpr static auto burstLenAxis = DIM - 1; // Get the first loop axis + view.addr = addrOffset; // Address to be moved view.burstLen = GetBurstLen(iterAddr_, tiling_); view.axisSize = 0; if constexpr (burstLenAxis > 0) { @@ -441,7 +443,7 @@ public: template __aicore__ inline void CopyIn(U& view, V& shape, const AscendC::LocalTensor& ubTensor) { - // 计算在UB的 View + // Calculate the View in UB op_->ReduceOp::template CopyInAux(this->input_[0], view, shape, ubTensor); } @@ -456,7 +458,7 @@ public: __aicore__ inline void ComputeMerge(V& shape, const AscendC::LocalTensor& ubTensorLeft, const AscendC::LocalTensor& ubTensorRight, Args... args) { - // Ub间Reduce + // Reduce between UB op_->compute_.ReduceBetweenUB(ubTensorLeft, ubTensorRight, shape.value[0] * shape.value[1]); op_->ReduceOp::template FreeTensorAux(ubTensorLeft); op_->ReduceOp::template ComputeAux(shape, ubTensorRight, args...); diff --git a/atvc/include/reduce/utils/reduce_block_aux_util.h b/atvc/include/reduce/utils/reduce_block_aux_util.h index bfffed420deef8fabd861d2daae34aca764a8325..52499e51dd7300e811c286b115bb84e9fc1f54fa 100644 --- a/atvc/include/reduce/utils/reduce_block_aux_util.h +++ b/atvc/include/reduce/utils/reduce_block_aux_util.h @@ -160,7 +160,7 @@ __aicore__ inline bool IsAxisA(int32_t idx) { template __aicore__ inline int32_t GetInnerA(T& iterAddr) { - // 从后往前遍历到A的最内切分轴 + // Traverse from back to front to the innermost dividing axis of A. int32_t startAxisA = 0; if constexpr (TailA) { startAxisA = Dim - 1;