From cf8713ff44ccf7211b904176dba8d2d572fa8cde Mon Sep 17 00:00:00 2001
From: qixingkai <qixingkai@huawei.com>
Date: Fri, 26 Sep 2025 12:00:56 +0800
Subject: [PATCH] fix atvc developer_guide;fix $ symbol

---
 atvc/README.md                                | 106 +-
 atvc/docs/01_quick_start.md                   |  71 +-
 atvc/docs/02_developer_guide.md               | 979 ++++++++++--------
 atvc/docs/03_code_organization.md             |  64 +-
 atvc/examples/README.md                       |  12 +
 atvc/examples/add/README.md                   |  26 +-
 atvc/examples/add_with_broadcast/README.md    |  29 +-
 atvc/examples/add_with_scalar/README.md       |  27 +-
 atvc/examples/broadcast_to/README.md          |  27 +-
 atvc/examples/ops_aclnn/README.md             |  44 +-
 .../ops_aclnn/add/AclNNInvocationNaive/run.sh |   1 -
 .../add/AddCustom/op_kernel/CMakeLists.txt    |   2 +-
 atvc/examples/ops_aclnn/add/README.md         |   2 +-
 atvc/examples/ops_aclnn/add/install.sh        |   1 -
 .../reduce_sum/AclNNInvocationNaive/run.sh    |   1 -
 atvc/examples/ops_aclnn/reduce_sum/README.md  |   4 +-
 .../ReduceSumCustom/op_kernel/CMakeLists.txt  |   2 +-
 atvc/examples/ops_aclnn/reduce_sum/install.sh |   1 -
 atvc/examples/ops_pytorch/README.md           |  22 +-
 atvc/examples/ops_pytorch/add/README.md       |   6 +-
 atvc/examples/ops_pytorch/add/run.sh          |  25 +-
 .../examples/ops_pytorch/reduce_sum/README.md |   6 +-
 atvc/examples/ops_pytorch/reduce_sum/run.sh   |  23 +-
 atvc/examples/reduce_sum/README.md            |  27 +-
 atvc/examples/run_examples.sh                 |  33 +-
 atvc/examples/sinh_custom/README.md           |  27 +-
 atvc/examples/tanh_grad/README.md             |  47 +-
 .../include/broadcast/broadcast_op_template.h |  15 +-
 .../broadcast/tiling/broadcast_tiling.h       |   6 +-
 .../broadcast/utils/broadcast_buf_pool.h      |  17 +-
 atvc/include/broadcast/utils/broadcast_util.h |  34 +-
 atvc/include/common/atvc_op_check.h           |   3 +-
 atvc/include/common/compile_info.h            |   2 +-
 atvc/include/common/kernel_utils.h            |   5 +
 atvc/include/elewise/elewise_op_template.h    |  23 +-
 atvc/include/elewise/utils/elewise_util.h     |  14 +-
 atvc/include/reduce/reduce_op_template.h      |  20 +-
 atvc/include/reduce/tiling/reduce_tiling.h    |   8 +-
 atvc/include/reduce/utils/reduce_buf_pool.h   |   6 +-
 atvc/include/reduce/utils/reduce_util.h       |  26 +-
 40 files changed, 903 insertions(+), 891 deletions(-)
 create mode 100644 atvc/examples/README.md
diff --git a/atvc/README.md b/atvc/README.md
index ef05c565..ad7b29a5 100644
--- a/atvc/README.md
+++ b/atvc/README.md
@@ -1,6 +1,7 @@
 # ATVC
+## 概述
 ATVC(Ascend C Template for Vector Compute)是一个用Ascend C API搭建的C++模板头文件集合，旨在帮助用户快速开发Ascend C典型Vector算子。它将Ascend C Vector算子开发流程中的计算实现解耦成可自定义的模块, 内部封装实现了kernel数据搬入搬出等底层通用操作及通用tiling计算，实现了高效的算子开发模式。
-相比传统Ascend C算子开发方式，利用ATVC搭建的Vector算子可做到开发效率提升3-5倍。用户只需选择匹配的模板并完成核心计算逻辑就完成算子kernel侧开发，atvc还内置了每个模板库对应的通用tiling计算实现，可省去用户手写tiling的开发量就能达到不错的性能表现，极大提升算子开发效率。
+相比传统Ascend C算子开发方式，利用ATVC搭建的Vector算子可做到开发效率提升3-5倍。用户只需选择匹配的模板并完成核心计算逻辑就完成算子kernel侧开发，ATVC还内置了每个模板库对应的通用tiling计算实现，可省去用户手写tiling的开发量就能达到不错的性能表现，极大提升算子开发效率。
 
 ![atvc_user_case.png](./docs/images/atvc_user_case.png)<br>
 
@@ -8,83 +9,56 @@ ATVC(Ascend C Template for Vector Compute)是一个用Ascend C API搭建的C++
 请参阅[快速入门](./docs/01_quick_start.md)以快速了解ATVC的Add算子搭建流程。
 请参阅[开发者文档](./docs/02_developer_guide.md)以获取ATVC框架各模板与API的使用细节，完成自定义Elementwise类算子以及Reduce类算子开发。
 
-# 环境要求
+
+## 支持的产品型号
 - 硬件型号支持  
-Atlas 800I A2推理服务器
-- 配套软件  
-CANN开发套件包Ascend-cann-toolkit\_\<cann_version>\_linux\-\<arch>.run，并设置相关环境变量
-cmake >= 3.16.0
+Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件。
 
-# 工程目录
-ATVC工程结构可归纳成以下主要组件，更详细的文件结构介绍请参阅[Code Organization](./docs/03_code_organization.md)：
 
+## 目录结构说明
+本代码仓目录结构如下：
 ``` 
 ├── docs        // 文档介绍
 ├── examples    // ATVC使用样例
 ├── include     // ATVC提供的头文件集合,用户使用前需将其置入其他工程的包含路径下
 └── README.md   // 综述
 ```
-# 快速上手
+[Developer Guide](./docs/02_developer_guide.md)给出了ATVC框架各模板与API的使用细节。
 
-快速执行example用例，更详细的流程请参阅[快速入门](./docs/01_quick_start.md)。
+[Code Organization](./docs/03_code_organization.md)给出了模板库代码的组织结构。
 
-- 下载ATVC代码
-```bash
-git clone https://gitee.com/ascend/ascendc-api-adv.git
-```
+[examples](./examples/)给出了使用ATVC模板库开发Vector算子的样例。
 
-- 配置环境变量
-  - 默认路径，root用户安装
-    ```bash
-    source /usr/local/Ascend/ascend-toolkit/set_env.sh
-    source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
-    ```
-  - 默认路径，非root用户安装
-    ```bash
-    source $HOME/Ascend/ascend-toolkit/set_env.sh
-    source $HOME/Ascend/ascend-toolkit/latest/bin/setenv.bash
-    ```
-  - 指定路径安装
-    ```bash
-    source ${install_path}/ascend-toolkit/set_env.sh
-    source ${install_path}/ascend-toolkit/latest/bin/setenv.bash
-    ```
-
-
-- 执行add用例
-```bash
-cd ./atvc/examples
-bash run_examples.sh add
-```
+## 环境准备
+
+参考[ascendc-api-adv仓通用环境准备章节](../../README.md)，完成源码下载和CANN软件包及相关依赖的安装。
 
+## ATVC模板库算子调试方式
+- ATVC是一个头文件集合，只需要包含头文件目录即可使用ATVC模板能力进行算子开发。
+- [样例集合](../atvc/examples/)包含了多种模板、多种调用场景的算子样例，ops_aclnn和ops_pytorch展示了基于单算子API调用和PyTorch框架调用的算子样例，其他的均为Kernel直调场景下的样例。单算子API调用和PyTorch框架调用算子样例编译调试步骤详见对应样例路径下的README.md文档。
 
-# 已支持的模版
-| Vector模版类型                                           |
-| ------------------------------------------------------------ |
-| Elementwise模板 |
-| Reduce模板 |
-| Broadcast模板 |
-# 样例介绍
-| 样例名                                                         | 描述                                                         | 类型|
-| ------------------------------------------------------------ | ------------------------------------------------------------ |------------------------------------------------------------ |
-| [add](./examples/add/add.cpp) | 使用ATVC的Elementwise模板实现Add算子以及调用样例               | Kernel直调 |
-| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例               | Kernel直调 |
-| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | 输入带标量的自定义Elementwise类算子以及调用样例              | Kernel直调 |
-| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例               | Kernel直调 |
-| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例               | Kernel直调 |
-| [tanh_grad](./examples/tanh_grad/tanh_grad.cpp) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例              | Kernel直调 |
-| [ops_aclnn](./examples/ops_aclnn) | 使用ATVC基于自定义工程算子的实现以及调用样例               | 单算子API调用 |
-| [ops_pytorch](./examples/ops_pytorch) | 使用ATVC开发自定义算子，并实现从[PyTorch](https://gitee.com/ascend/pytorch)框架调用的样例                 | PyTorch框架调用 |
-| [add_with_broadcast](./examples/add_with_broadcast) |使用ATVC的Elementwise和Broadcast组合模板实现Add算子以及调用样例                  | Kernel直调 |
-
-更多算子类型介绍和如何选取模板参见参阅[快速入门](./docs/01_quick_start.md)，其中add、sinh_custom、add_with_scalar、reduce_sum、broadcast_to、tanh_grad、add_with_broadcast是ATVC的直调样例，ops_aclnn为基于ATVC对接aclnn工程的算子目录，ops_pytorch为基于ATVC对接PyTorch工程的算子目录。其中，ops_aclnn和ops_pytorch样例需要进入到example路径下按照README.md描述执行。
-
-
-# 支持场景
-
-| 算子模板                                                         | 数据类型                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| Elementwise | int32_t、float               |
-| Reduce | int32_t、float               |
-| Broadcast | int32_t、float               |
+### Kernel直调算子样例本地编译调试
+- Kernel直调算子样例可通过执行脚本快速发起算子编译与运行，运行命令如下所示， 其{op_name}是实际运行算子路径：
+```bash
+cd ./ops_templates/atvc/examples
+bash run_examples.sh {op_name}
+```
+- 支持上板运行打开profiling获取性能数据， 运行命令为：bash run_examples.sh {op_name} --run-mode=profiling。
+- 支持上板打印ATVC模板库提供的DFX信息，运行命令为：bash run_examples.sh {op_name} --run-mode=debug_print
+
+## 模板选择
+ATVC支持的模板和数据类型如下：
+| 算子模板        | 数据类型       | 规格限制说明 |
+| -------------------- | ---------------- | ---------- |
+| Elementwise | int32_t、float               |  |
+| Reduce | int32_t、float               | 当前只支持4维以内的reduce计算 |
+| Broadcast | int32_t、float            | 当前只支持2维以内对齐场景的计算 |
+
+### Elementwise类算子
+Elementwise类算子通常是指对张量进行元素级别的操作的函数或方法，包括但不限于加、减、乘、除及指数、对数、三角函数等数学函数。这类算子的特点是会逐元素进行计算操作，而不会改变输入数据的形状。常见的Elementwise算子有Add、Sub、Exp、Log、Sin、Sqrt等。
+### Reduce类算子
+Reduce类算子通常是指对张量中的元素进行归约操作的算子，通常用来求和、求平均值等操作，可指定某几个维度进行归约计算，也可以将所有元素归约计算为一个标量。常见的Reduce类算子有ReduceSum(求和)、ReduceMean(求平均值)、ReduceProdcut(累乘)、ReduceMax(求最大值)、ReduceMin(求最小值)、ReduceAny(or操作)、ReduceAll(and操作)。
+### Broadcast类算子
+Broadcast算子用于在张量形状不一致时实现张量间的逐元素运算。
+设张量 A 的 shape 为 (1, 5)，张量 B 的 shape 为 (3, 5)。为完成 C = A + B，首先需依据广播规则将 A 由 (1, 5) 扩展至 (3, 5)。该过程通过在长度为 1 的维度上复制数据，使两个张量的形状对齐，从而支持逐元素相加运算。
 
diff --git a/atvc/docs/01_quick_start.md b/atvc/docs/01_quick_start.md
index 28b12d4f..31a59aee 100644
--- a/atvc/docs/01_quick_start.md
+++ b/atvc/docs/01_quick_start.md
@@ -1,46 +1,27 @@
 # 快速入门
-这篇文档帮助你体验ATVC开发Add算子的整个流程。<br>
-
-# 环境要求
-ATVC对软硬件运行环境有如下要求：
-- 硬件型号
-    Atlas 800I A2推理服务器
-- 软件要求  
-Ascend-cann-toolkit 开发者套件包<br>
-cmake ≥ 3.16 
-
-# 环境准备
-- 安装CANN开发套件包。以下为root用户默认路径安装演示。
-```bash
-chmod +x Ascend-cann-toolkit_<version>_linux-<arch>.run
-./Ascend-cann-toolkit_<version>_linux-<arch>.run --install
-```
-- 设置CANN环境变量
-
-```bash
-# root用户安装（默认路径）
-source /usr/local/Ascend/ascend-toolkit/set_env.sh
-source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
-
-```
+这篇文档帮助你体验ATVC开发Add算子的整个流程，帮助开发者快速熟悉使用ATVC模板库开发算子的基本步骤。完整样例请参考[add examples](../examples/add/add.cpp)。<br>
 
+## 环境准备
+- 硬件型号支持  
+快速入门样例仅支持Atlas A2训练系列产品、Atlas 800I A2推理产品、A200I A2 Box 异构组件。
+- 配套软件  
+参考[ascendc-api-adv仓通用环境准备章节](../../../README.md)，完成源码下载和CANN软件包及相关依赖的安装。
 - 下载代码
-
 ```bash
 git clone https://gitee.com/ascend/ascendc-api-adv.git
 ```
 
-# 使用ATVC开发Add算子
-
+## 算子实现
 本示例将展示如何基于ATVC提供的模板以及接口快速搭建Add算子，示例内展示了ATVC框架下区别于传统Ascend C Add的实现代码。<br>
-## 定义算子描述
+
+### 定义算子描述
 首先通过ATVC提供的[ATVC::OpTraits](../include/common/atvc_opdef.h)模板结构体来描述Add算子的输入输出信息，定义如下：
 ```cpp
 // Add算子中有两个输入，一个输出。类型均为float
 using ADD_OPTRAITS = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
 ```
 
-## 实现算子计算逻辑
+### 实现算子计算逻辑
 用户需要通过Ascend C API来搭建Add算子的核心计算逻辑，在ATVC框架中，这类算子的核心计算逻辑是通过定义一个结构体的仿函数来实现。它需要`ATVC::OpTraits`作为固定模板参数，并重载`operator()`来被提供的Kernel层算子模板类调用。<br> 
 Add算子的计算仿函数定义如下：
 ```cpp
@@ -66,7 +47,7 @@ struct AddComputeFunc {
 
 
 ## 实现核函数
-ATVC提供的`ATVC::Kernel::EleWiseOpTemplate`算子模板类实现了核内的数据搬运、资源申请和计算调度功能。它将计算仿函数作为模板参数传入来完成构造实例化，用户可通过调用`ATVC::Kernel::EleWiseOpTemplate`算子模板类的`Run(Args&&... args)`接口完成算子的功能计算，完成完整核函数的实现。
+ATVC提供的[ATVC::Kernel::EleWiseOpTemplate](../include/elewise/elewise_op_template.h)算子模板类实现了核内的数据搬运、资源申请和计算调度功能。它将计算仿函数作为模板参数传入来完成构造实例化，用户可通过调用`ATVC::Kernel::EleWiseOpTemplate`算子模板类的`Run(Args&&... args)`接口完成算子的功能计算，完成完整核函数的实现。
 在`examples/add`用例中，算子核函数的形式参数除了输入输出之外，还需额外传入`GM_ADDR param`的形参。该参数包含算子模板类进行数据搬运数据的必要参数，由`ATVC::Host::CalcEleWiseTiling` API计算得出。
 <br>
 完整的`AddCustom`核函数定义如下：
@@ -108,37 +89,15 @@ uint32_t blockNum = param.tilingData.blockNum;
 // bDevice  Device上的gm地址，指向Add算子第二个输入
 // cDevice  Device上的gm地址，指向Add算子第一个输出
 // param    Device上的gm地址，指向运行态ATVC::EleWiseParam数据
-AddCustom<ADD_OPTRAITS><<<blockNum, nullptr, stream>>>(aDevice, bDevice, cDevice, paramDevice);
+AddCustom<ADD_OPTRAITS><<<blockNum, nullptr, stream>>>(aDevice, bDevice, cDevice, param);
 ```
+相比于通用的算子调用，ATVC内部封装实现了kernel数据搬入搬出等底层通用操作及通用tiling计算，实现了高效的算子开发模式。
 
 ## 算子编译&执行
 完成算子代码编写后，调用以下命令编译代码并执行：
 ```bash
-cd ./atvc/examples
+cd ./ops_templates/atvc/examples
 bash run_examples.sh add
 ```
 
-其他样例执行命令如下：
-```bash
-bash run_examples.sh sinh_custom      # 执行sinh_custom样例
-bash run_examples.sh reduce_sum       # 执行reduce_sum样例
-bash run_examples.sh add_with_scalar  # 执行add_with_scalar样例
-bash run_examples.sh broadcast_to     # 执行broadcast样例
-```
-
-## 完整样例<br>
-完整代码样例请参照[examples/add/add.cpp](../examples/add/add.cpp)
-
-# 模板选择
-用户根据待开发的Vector算子定义特征，选择以下三种匹配的模板及其配套的tiling算法，若自定义算子不在当前模板库的范围内，建议使用Ascend C API 手写算子。
-## Elementwise类算子
-Elementwise类算子通常是指对张量进行元素级别的操作的函数或方法，包括但不限于加、减、乘、除及指数、对数、三角函数等数学函数。这类算子的特点是会逐元素进行计算操作，而不会改变输入数据的形状。常见的Elementwise算子有Add、Sub、Exp、Log、Sin、Sqrt等。
-## Reduce类算子
-Reduce类算子通常是指对张量中的元素进行归约操作的算子，通常用来求和、求平均值等操作，可指定某几个维度进行归约计算，也可以将所有元素归约计算为一个标量。常见的Reduce类算子有ReduceSum(求和)、ReduceMean(求平均值)、ReduceProdcut(累乘)、ReduceMax(求最大值)、ReduceMin(求最小值)、ReduceAny(or操作)、ReduceAll(and操作)。
-## Broadcast
-Broadcast 算子用于在张量形状不一致时实现张量间的逐元素运算。
-设张量 A 的 shape 为 (1, 5)，张量 B 的 shape 为 (3, 5)。为完成 C = A + B，首先需依据广播规则将 A 由 (1, 5) 扩展至 (3, 5)。该过程通过在长度为 1 的维度上复制数据，使两个张量的形状对齐，从而支持逐元素相加运算。
-
-# Copyright
-
-Copyright (c) 2025 Huawei Technologies Co., Ltd.
\ No newline at end of file
+至此，您已完成Add算子开发的学习，可以参考[ATVC开发指南](02_developer_guide.md)了解更多ATVC模板的使用指导，可以参考[examples目录](../examples/)了解更多算子样例。
diff --git a/atvc/docs/02_developer_guide.md b/atvc/docs/02_developer_guide.md
index 50e33291..6caff012 100644
--- a/atvc/docs/02_developer_guide.md
+++ b/atvc/docs/02_developer_guide.md
@@ -10,11 +10,11 @@ ATVC将Vector算子开发流程中的可定制化模块抽象出了Host层和Ker
 
 <br> 
 
-## 1.1 公共数据结构
+# 2 公共数据结构
 我们将对ATVC核函数定义以及调用涉及的三个公共数据概念：算子原型的编译态参数`OpTraits`, Tiling计算的运行态参数`Param`, 模板策略的编译态参数`Policy` 分别进行介绍。
 
-### 1.1.1 OpTraits
-ATVC框架参考C++模板元编程的`type_list`实现，推出了`ATVC::OpInputs`、`ATVC::OpOutputs`、`ATVC::OpTemps`的模板结构体分别用于描述算子的计算输入、计算输出、计算过程的临时资源，支持C++基础类型作为不定长模板参数传入。它们三者组成了整个ATVC框架编译态参数`OpTraits`。`ATVC::OpTraits`的完整数据定义如下<br>
+## 2.1 OpTraits
+ATVC框架参考C++模板元编程的`type_list`实现，推出了`ATVC::OpInputs`、`ATVC::OpOutputs`、`ATVC::OpTemps`的模板结构体分别用于描述算子的计算输入、计算输出、计算过程的临时资源，支持C++基础类型作为不定长模板参数传入。它们三者组成了整个ATVC框架编译态参数`OpTraits`。`ATVC::OpTraits`的完整数据定义如下：<br>
 ```cpp
 // atvc_opdef.h
 namespace ATVC {
@@ -61,10 +61,10 @@ using AddOpTraits = ATVC::OpTraits<AddInputs, AddOutputs, AddTemps>; // Add算
 ```
 
 
-### 1.1.2 Param
+## 2.2 Param
 ATVC框架提供了`ATVC::EleWiseParam`、`ATVC::ReduceParam`、`ATVC::BroadcastParam` 三个结构体来描述算子内部调度的Tiling数据和其他资源变量。Param 作为Host侧Tiling API的输出，它将传入ATVC框架的Kernel层算子模板，并在运行时指导算子内部模块完成数据的循环搬运与调度计算。<br>
 
-以下为ElementWise类算子的`ATVC::EleWiseParam`参与计算的伪代码，详细使用流程请参考本文档的 [2.1.5 Host层API](#215-host层api)：
+以下为ElementWise类算子的`ATVC::EleWiseParam`参与计算的伪代码，详细使用流程请参考本文档的 [3.1.2.3 Policy与Param的计算与传递](#3123-policy与param的计算与传递)：
 ```cpp
 // 声明运行态参数param
 ATVC::ElewiseParam param;
@@ -80,10 +80,10 @@ aclrtMemcpy(paramDevice, sizeof(param), reinterpret_cast<uint8_t*>(&param), size
 EleWiseKernel<<<param.tilingData.blockNum, nullptr, stream>>>(x, y, z, paramDevice);
 ```
 
-### 1.1.3 Policy
+## 2.3 Policy
 编译态参数`Policy`(`ATVC::ReducePolicy`, `ATVC::BroadcastPolicy`)是ATVC框架里Kernel层对部分算子模板的拓展描述，它对应算子模板类在不同场景的实例化实现。它由Tiling API计算出，并在策略分派API(`ATVC::Host::ReduceAdapter`)里将运行态的Policy结果转化为模板参数并调用该场景下的最佳模板实现来完成高效的数据计算。<br>
 
-以下为Reduce算子开发场景中`ATVC::ReducePolicy`参与计算的伪代码，详细过程请参考<b>2.2.5 Host层API</b>：
+以下为Reduce算子开发场景中`ATVC::ReducePolicy`参与计算的伪代码，详细过程请参考[3.2.2.3 Policy与Param的计算与传递](#3223-policy与param的计算与传递)：
 ```cpp
 // 声明policy和param变量
 ATVC::ReducePolicy policy = {-1, -1, -1};
@@ -106,25 +106,60 @@ __global__ __aicore__ ReduceKernel(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam param
 ```
 
 
-
-# 2 利用ATVC完成算子开发
-## 2.1 Elementwise算子开发
+# 3 利用ATVC完成算子开发
+## 3.1 Elementwise算子开发
+### 3.1.1 Elementwise模板概述
 ATVC框架提供的Elementwise算子模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示；开发自定义的模块用蓝色表示)：
 ![elewise_dataflow.png](images/elewise_dataflow.png)
-
 不同计算原理的Elementwise算子在Kernel内部的数据搬运模块并无区别，因此Elementwise的数据交互不涉及Policy的不同Kernel模板实现。
 
-### 2.1.1 Components
-根据Elementwise算子在框架内部的交互场景，ATVC提供如下的接口以及模板类帮助开发搭建自定义Ascend C的ElementWise算子：
+根据Elementwise算子在框架内部的交互场景，ATVC提供如下的接口以及模板类帮助开发搭建自定义Ascend C的Elementwise算子：
 ![elewise_components.png](images/elewise_components.png)
 自定义Elementwise算子需按照以下顺序完成模块之间的组装：
-1. 定义计算模板
-2. 将计算模板类传入Kernel层算子模板完成核函数功能实现；
-3. 定义Kernel层算子入口API，内部实例化计算模板类；
+1. 定义计算模板。
+2. 将计算模板类传入Kernel层算子模板完成核函数功能实现。
+3. 定义Kernel层算子入口API，内部实例化计算模板类。
 
-下面将以Sinh算子 $y = \frac{\exp(x) - \exp(-x)}{2}$ 的实现为样例，按照组成Kernel的顺序介绍Elementwise算子开发的流程。
 
-### 2.1.2 计算模板
+### 3.1.2 Elementwise算子开发步骤
+下面将以Sinh算子 $y = \frac{\exp(x) - \exp(-x)}{2}$ 的实现为样例，按照组成Kernel的顺序介绍Elementwise算子开发的关键步骤进行介绍。通过ATVC框架实现的完整SinhCustom算子样例代码见[样例代码](../examples/sinh_custom/sinh_custom.cpp)：
+```cpp
+// 描述算子的输入输出以及临时计算资源
+using SinhOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float, float>>;
+template<class Traits>
+// 开发自定义函数名/类名
+struct SinhComputeFunc {
+    // DataType模板参数，根据实际数据类型个数填写
+    template<typename T, typename U>
+    // 重载operator公有接口，提供给`ATVC::Kernel::EleWiseOpTemplate`调用
+    __aicore__ inline void operator()(AscendC::LocalTensor<T> x,
+                                      AscendC::LocalTensor<T> y,
+                                      AscendC::LocalTensor<U> tempBuffer1,
+                                      AscendC::LocalTensor<U> tempBuffer2)
+    {
+        // 开发调用AscendC Api自行实现计算仿函数
+        uint32_t tiledCnt = y.GetSize(); // 进行单次基块计算的元素个数
+        AscendC::Muls(tempBuffer1, x, static_cast<T>(-1), tiledCnt); // tempBuffer1 = -1 * x
+        AscendC::Exp(tempBuffer1, tempBuffer1, tiledCnt); // tempbuffer1 = exp(-x)
+        AscendC::Exp(tempBuffer2, x, tiledCnt);  // tempbuffer2 = exp(x)
+        AscendC::Sub(y, tempBuffer2, tempBuffer1, tiledCnt); // y = exp(x) - exp(-x)
+        AscendC::Muls(y, y, static_cast<T>(0.5), tiledCnt); // y = (e^(x) - e^(-x)) / 2 
+    }
+};
+...
+template<class OpTraits>
+__global__ __aicore__ void SinhCustom(GM_ADDR x, GM_ADDR y, ATVC::EleWiseParam param)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // 控制算子执行时只启动Vector核
+    auto op = ATVC::Kernel::EleWiseOpTemplate<SinhComputeFunc<OpTraits>>();
+    op.Run(x, y, &param);          // 按照输入、输出、param的顺序传入Run函数中；OpTraits内部的ATVC::OpTemps将由EleWiseOpTemplate内部申请资源，开发无需关注
+}
+...
+// 调用自定义的Kernel API, <<<>>>的BlockNum参数可通过param的TilingData获取
+SinhCustom<SinhOpTraits><<<param.tilingData.blockNum, nullptr, stream>>>(xDevice, yDevice, param);
+...
+```
+#### 3.1.2.1 实现计算逻辑
 计算模板是用户必须在Elementwise 算子实现过程中完成的一类特殊模板类的定义。模板类无需关注数据如何从GM搬运到UB，只需重载`operator()`的公有接口，并在该仿函数内部实现`AscendC::LocalTensor`之间的计算逻辑。在Kernel层的组装阶段，计算模板将作为模板参数传入`ATVC::Kernel::EleWiseOpTemplate`，并在数据计算阶段被调用。下方为计算模板实现Sinh计算逻辑的代码样例：
 ```cpp
 #include "atvc.h" // 包含所有模板及API的总入口头文件
@@ -153,69 +188,14 @@ struct SinhComputeFunc {
 3. 开发定义的`operator()`仿函数的输入参数类型支持`AscendC::LocalTensor<T>`以及C++其他基础数据类型。形式参数需按照`ATVC::OpInputs<>`,`ATVC::OpOutputs<>`, `ATVC::OpTemps<>`声明的顺序填入，其他标量参数放在最后，根据用户计算场景按需传入。
 
 
-### 2.1.3 内置Elementwise算子模板
-`ATVC::Kernel::EleWiseOpTemplate`为ATVC框架提供的内置ElementWise基本算子类，它实现了一套算子数据的搬运搬出、资源分配和释放的算子流程。它需要计算模板类作为模板参数传入来完成实例化。核函数通过调用它完成整套计算逻辑：1. 资源初始化; 2.将数据从GM搬运至UB; 3.按`OpTraits`的输入、输出、临时资源描述、其他标量的顺序传入计算模板类的仿函数完成数据的基块计算; 4.将结果从UB搬出至GM。
-
-下方为`ATVC::Kernel::EleWiseOpTemplate`模板类的外部接口介绍，完整模板类定义请参考[`atvc/include/elewise/elewise_op_template.h`](../include/elewise/elewise_op_template.h)。
-```cpp
-/*!
- * \brief EleWiseOpTemplate provides templates for element level operations on tensors,
- * including but not limited to addition, subtraction, multiplication, division, as well as
- * mathematical functions such as exponentiation, logarithm, trigonometric functions, etc.
- * The characteristic of this type of operator is that it performs calculation operations
- * element by element without changing the shape of the input data.
- */
-template <class EleWiseCompute>
-class EleWiseOpTemplate {
-public:
-    __aicore__ inline EleWiseOpTemplate() {};
-
-    /*!
-     * \brief The external running interface of EleWiseOpTemplate mainly completes resource initialization, 
-     *        data migration, calculation scheduling, and data migration operations
-     * \param src, GM pointer for input data
-     * \param dst, Gm pointer for outputting data
-     * \param broadcastParam, Dynamic parameters of broadcast, including tiling data, workspace, etc
-     */
-    template <typename... Args>
-    __aicore__ inline void Run(Args&&... args)
-    {
-        //
-        // 完成变长参数的解析和数据调度计算
-        //
-    }
-};
-```
-
-
-`ATVC::Kernel::EleWiseOpTemplate`在核函数实现中的调用样例如下：
-```cpp
-#include "atvc.h"
-// SinhOpTraits 为ATVC对自定义Sinh算子的数据类型描述
-using SinhOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float, float>>;
-//核函数内部调用
-// ...
-{
-// 将计算模板类模板定义作为模板参数传入
-ATVC::Kernel::EleWiseOpTemplate<SinhComputeFunc<SinhOpTraits>> elewiseTemplate;
-// 调用EleWiseOpTemplate的Run接口传入输入x， 输出y，Host::CalcEleWiseTiling API的输出param
-elewiseTemplate.Run(x, y, &param);
-}
-```
+#### 3.1.2.2 实例化模板
 
+在Elementwise开发流程中，用户需要自行定义核函数接口。核函数内部通过`ATVC::Kernel::EleWiseOpTemplate`完成模板实例化。
 
-### 2.1.4 核函数定义
-在Elementwise开发流程中，用户需要自行定义核函数接口。核函数内部可依赖`ATVC::Kernel::EleWiseOpTemplate`完成功能实现。
-
-基于`2.1.2`和`2.1.3`的样例代码，Kernel层的自定义核函数代码样例如下：
+Kernel层的自定义核函数代码样例如下：
 ```cpp
 #include "atvc.h"
 
-// 2.1.2 章节中的SinhComputeFunc 定义
-// template <class Traits>
-// struct SinhComputeFunc {
-// ...
-// }
 using SinhOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float, float>>;
 
 /*
@@ -236,10 +216,10 @@ __global__ __aicore__ void SinhCustom(GM_ADDR x, GM_ADDR y, ATVC::EleWiseParam p
 
 利用ATVC框架开发Elementwise算子的过程中，Kernel层的核函数定义必须遵从以下约束：
 
-1. 核函数必须预留一个GM_ADDR类型的形参用于传入`ATVC::EleWiseParam`运行态参数；
-2. 核函数内部必须加入`KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);`这段代码标注算子执行时只启动Vector核；
-3. 核函数必须初始化`ATVC::Kernel::EleWiseOpTemplate`变量并调用它的`Run(Args&&... args)`接口来实现数据的调度运算；
-4. 若模板参数OpTraits固定（如算子的输入类型不发生变动）的场景，上述的Kernel核函数的定义和调用代码可简化为:
+1. 核函数必须预留一个GM_ADDR类型的形参用于传入`ATVC::EleWiseParam`运行态参数。
+2. 核函数内部必须加入`KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);`这段代码标注算子执行时只启动Vector核。
+3. 核函数必须初始化`ATVC::Kernel::EleWiseOpTemplate`变量并调用它的`Run(Args&&... args)`接口来实现数据的调度运算。
+4. 若模板参数OpTraits固定（如算子的输入类型不发生变动）的场景，上述的Kernel核函数的定义和调用代码可简化为：
 ```cpp
 using SinhOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float, float>>;
 
@@ -251,7 +231,7 @@ extern "C" __global__ __aicore__ void SinhCustom(GM_ADDR x, GM_ADDR y, ATVC::Ele
 }
 ```
 
-### 2.1.5 Host层API
+#### 3.1.2.3 Policy与Param的计算与传递
 ATVC的Host层提供了Elementwise算子的通用Tiling算法API `ATVC::Host::CalcEleWiseTiling`，它根据算子计算原型`ATVC::OpTraits`以及数据大小计算出包含`ATVC::EleWiseTilingData`的运行态参数`ATVC::EleWiseParam`。`ATVC::EleWiseParam`在运行时将参与模板算子数据搬运从而实现较优计算。<br>`ATVC::EleWiseTilingData`和`ATVC::EleWiseParam`的数据结构定义如下：
 ```cpp
 namespace ATVC{
@@ -272,23 +252,8 @@ struct EleWiseParam {
 ```
 
 
-`ATVC::Host::CalcEleWiseTiling`函数内部提供了影响Tiling算法的超参数结构体`EleWiseTilingHyperParam`，支持开发通过修改超参值来探索更好的算子性能。该API的使用样例如下，详细代码请参考[EleWise Tiling 算法](../include/elewise/elewise_host.h)：
+`ATVC::Host::CalcEleWiseTiling`函数内部提供了影响Tiling算法的超参数结构体`EleWiseTilingHyperParam`，支持开发通过修改超参值来探索更好的算子性能。
 ```cpp
-// 传入编译态参数ATVC::OpTraits，函数内部将萃取该模板参数获取算子信息
-template <class OpTraits>
-/*!
- * \brief Calculate the operational parameters of EleWiseParam for EleWise
- * \param[in] totalCnt, The total number of elements in a single input
- * \param[out] param, Output parameters.
- * \return Return true to indicate calculation success, false to indicate failure.
- */
-template <class OpTraits>
-bool CalcEleWiseTiling(
-    int32_t totalCnt, ATVC::EleWiseParam &param, EleWiseTilingHyperParam hyperParam = EleWiseTilingHyperParam())
-{
-    // ...
-}
-
 // Host侧调用示例
 using SinhOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float, float>>;
 int32_t eleNum = 8 * 2048; // Sinh算子单个输入Tensor含有8*2048个元素
@@ -300,31 +265,151 @@ if (!ATVC::Host::CalcEleWiseTiling<SinhOpTraits>(eleNum, param)) {
     return -1;
 };
 ```
+### 3.1.3 Elementwise模板说明
+`ATVC::Kernel::EleWiseOpTemplate`为ATVC框架提供的内置Elementwise基本算子类，它实现了一套算子数据的搬运搬出、资源分配和释放的算子流程。它需要计算模板类作为模板参数传入来完成实例化。核函数通过调用它完成整套计算逻辑：1. 资源初始化; 2.将数据从GM搬运至UB; 3.按`OpTraits`的输入、输出、临时资源描述、其他标量的顺序传入计算模板类的仿函数完成数据的基块计算; 4.将结果从UB搬出至GM。
 
+下方为`ATVC::Kernel::EleWiseOpTemplate`模板类的外部接口介绍，完整模板类定义请参考[`elewise_op_template头文件`](../include/elewise/elewise_op_template.h)。
+```cpp
+/*!
+ * \brief EleWiseOpTemplate provides templates for element level operations on tensors,
+ * including but not limited to addition, subtraction, multiplication, division, as well as
+ * mathematical functions such as exponentiation, logarithm, trigonometric functions, etc.
+ * The characteristic of this type of operator is that it performs calculation operations
+ * element by element without changing the shape of the input data.
+ */
+template <class EleWiseCompute>
+class EleWiseOpTemplate {
+public:
+    __aicore__ inline EleWiseOpTemplate() {};
 
-### 2.1.6 完整样例
-通过ATVC框架实现的完整SinhCustom算子定义&调用，完整样例代码见链接[样例代码](../examples/sinh_custom/sinh_custom.cpp)
+    /*!
+     * \brief The external running interface of EleWiseOpTemplate mainly completes resource initialization, 
+     *        data migration, calculation scheduling, and data migration operations
+     * \param src, GM pointer for input data
+     * \param dst, Gm pointer for outputting data
+     * \param broadcastParam, Dynamic parameters of broadcast, including tiling data, workspace, etc
+     */
+    template <typename... Args>
+    __aicore__ inline void Run(Args&&... args)
+    {
+        //
+        // 完成变长参数的解析和数据调度计算
+        //
+    }
+};
+```
+
+
+`ATVC::Kernel::EleWiseOpTemplate`在核函数实现中的调用样例如下：
+```cpp
+#include "atvc.h"
+// SinhOpTraits 为ATVC对自定义Sinh算子的数据类型描述
+using SinhOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float, float>>;
+//核函数内部调用
+// ...
+{
+// 将计算模板类模板定义作为模板参数传入
+ATVC::Kernel::EleWiseOpTemplate<SinhComputeFunc<SinhOpTraits>> elewiseTemplate;
+// 调用EleWiseOpTemplate的Run接口传入输入x，输出y，Host::CalcEleWiseTiling API的输出param
+elewiseTemplate.Run(x, y, &param);
+}
+```
+
+
+### 3.1.4 切分策略算法说明
+下方为`ATVC::Host::CalcEleWiseTiling`函数内部计算Tiling参数的步骤，详细代码请参考[EleWise Tiling 算法](../include/elewise/elewise_host.h)：
+
+- 计算`blockNum`：计算`blockNum` = 总的元素量(`totalCnt`) / 单核数据量基线(`singleCoreBaseLine`), `blockNum`最小值为1， 最大值为平台提供的最大`vectorCore`值。
+- 计算达到UB上限的单核单输入元素个数值`ubLimitCnt`：UB上限内存大小 / 所有输入输出及temp单个元素的内存之和。
+- 计算`tiledCnt`：
+    - 计算每个核需要处理的数据元素量`avgElePerBlock = totalCnt / blockNum`。
+    - 根据`avgElePerBlock`所处的`splitDataShape`数据段，按照切分系数去切分基本块： `tiledCnt = dataSplitFactor / dataSplitFactor`。
+    - `tiledCnt`调整： 不超上限`ubLimitCnt`， 不小于下限32，且最后的`tiledCnt`要做32元素对齐。
+- 计算`tailBlockCnt`：总的基本块数量（`totalCopyCnt`）% block块的数量，即为剩余需要处理的尾块的数量。
+- 计算`numPerBlock`：计算每一个block需要处理多少个基本块，即`numPerBlock = totalCopyCnt / blockNum`。
+- 计算`tailElemCnt`：通过总元素的数量(`totalCnt`)和基本块元素的数量(`basicCnt`)，计算尾部元素的数量，即`tailElemCnt = totalCnt % basicCnt`。       
+```cpp
+template <class OpTraits>
+bool CalcEleWiseTiling(
+    int32_t totalCnt, ATVC::EleWiseParam &param, EleWiseTilingHyperParam hyperParam = EleWiseTilingHyperParam())
+{
+    int32_t basicCnt = GetEleWiseBasicCnt(hyperParam, totalCnt, blockNum, ubufLimitCnt);
+    ...
+    param.tilingData.tiledCnt = basicCnt;
+    param.totalCnt = totalCnt;
+    uint32_t totalCopyCnt = totalCnt / basicCnt;
+    param.tilingData.tailBlockCnt = (totalCopyCnt) % blockNum;
+    param.tilingData.blockNum = blockNum;
+    param.tilingData.numPerBlock = totalCopyCnt / blockNum;  // The number of basic blocks to be transported per block
+    param.tilingData.tailElemCnt = totalCnt % basicCnt;      // The number of tail block elements
+    ...
+};
+
+```
+
+
+## 3.2 Reduce算子开发
+
+
+### 3.2.1 Reduce模板概述
 
-## 2.2 Reduce算子开发
 ATVC框架提供的Reduce算子模板类的模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示；开发自定义的模块用蓝色表示)：
 ![reduce_dataflow.png](images/reduce_dataflow.png)
-Reduce模板算子内部根据计算的数据大小、shape、Reduce axis轴完成了不同计算调度的代码实现，ATVC将各种计算调度场景抽象为`ATVC::ReducePolicy`。在算子调用阶段，分派策略API可根据Tiling API计算出的`ATVC::ReducePolicy`转化为编译态参数，结合计算模板来实例化`ATVC::Kernel::ReduceOpTemplate`算子模板类。
+Reduce模板算子内部根据计算的数据大小、Shape、Reduce axis轴完成了不同计算调度的代码实现，ATVC将各种计算调度场景抽象为`ATVC::ReducePolicy`。在算子调用阶段，分派策略API可根据Tiling API计算出的`ATVC::ReducePolicy`转化为编译态参数，结合计算模板来实例化`ATVC::Kernel::ReduceOpTemplate`算子模板类。
 
-### 2.2.1 Components
 根据Reduce算子在框架内部的交互场景，ATVC提供如下的接口以及模板类帮助开发搭建自定义Reduce算子：
 ![reduce_components.png](images/reduce_components.png)
 自定义Reduce算子需按照以下顺序完成模块之间的组装：
-1. 自定义计算模板/使用框架内置计算模板
-2. 将计算模板传入Kernel层模板算子完成核函数功能实现；
-3. 定义Kernel层算子入口API，内部实例化计算模板类；
+1. 自定义计算模板/使用框架内置计算模板。
+2. 将计算模板传入Kernel层模板算子完成核函数功能实现。
+3. 定义Kernel层算子入口API，内部实例化计算模板类。
+
+
+
+### 3.2.2 Reduce算子开发步骤
+下面将以ReduceSum（对输入Tensor的特定轴上的数据做求和操作）的算子搭建为样例，按照组装顺序介绍Reduce算子类的关键步骤。通过ATVC框架实现的完整ReduceSum算子样例代码见[样例代码](../examples/reduce_sum/reduce_sum.cpp)：
+```cpp
+
+// ReduceSum算子的描述：一个输入，一个输出，类型均为float
+using ReduceOpTraits =  ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+/*
+ * 该函数为ReduceCustom算子核函数入口
+ * x                 Device上的gm地址，指向Add算子第一个输入
+ * y                 Device上的gm地址，指向Add算子第一个输出
+ * reduceParam       指向运行态ATVC::ReduceParam数据
+*/
+template<typename Traits, const auto& Policy>
+__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam)
+{
+    ...
+    // 将计算模板类模板定义作为模板参数传入，Policy由Host层的策略分派API给出
+    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceSumCompute<Traits>, Policy>();
+    op.Run(x, y, &reduceParam);
+}
+int32_t main(int32_t argc, char* argv[])
+{
+    ...
+    ATVC::ReduceParam param;    // Reduce运行态参数，包含TilingData以及临时空间的相关信息
+    ATVC::ReducePolicy policy = {-1, -1, -1};  // Reduce运行态参数，负责映射最适合的Reduce模板实现
+    ATVC::Host::ReduceTilingHyperParam hyperParam;
+    hyperParam.maxInnerA = 256;// 设置maxInnerA为256
+    // Host侧调用Tiling API完成相关运行态参数的运算
+    if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param, hyperParam=hyperParam)) {
+        printf("Reduce tiling error.\n");
+        return -1;
+    };
 
-下面将以ReduceSum（对输入tensor的特定轴上的数据做求和操作）的算子搭建为样例，按照组装顺序介绍Reduce算子类的开发流程。
+    // 调用Adapter调度接口，完成核函数的模板调用
+    ReduceOpAdapter<ReduceOpTraits>(xDevice, yDevice, param, policy, stream);
+    ...
+}
 
+```
 
-### 2.2.2 计算模板
+#### 3.2.2.1 实现计算逻辑
 Reduce类的计算模板涉及多核之间的数据结果同步以及核内分块的对齐计算，用户自定义难度较高，因此ATVC框架提供了Reduce类的内置计算模板，并实现了Reduce在单核内与多核间的计算与同步等函数接口。
 这类计算模板将作为模板参数传入`ATVC::ReduceOpTemplate`中，并在数据计算以及同步阶段被调用。
-下方为ATVC框架内置的`ATVC::ReduceSumCompute`计算模板的伪代码介绍，完成代码定义请参考[reduce_sum.h](../include/reduce/reduce_sum.h)。
+下方为ATVC框架内置的`ATVC::ReduceSumCompute`计算模板的伪代码介绍，完整代码定义请参考[reduce_sum.h](../include/reduce/reduce_sum.h)。
 ```cpp
 #ifndef ATVC_REDUCE_SUM_COMPUTE_H
 #define ATVC_REDUCE_SUM_COMPUTE_H
@@ -419,76 +504,14 @@ Reduce计算模板类将在数据计算阶段被`ReduceOpTemplate`算子模板
 
 - 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数，如`ATVC::OpTraits<ATVC::OpInputs<float>,ATVC::OpOutputs<float>`。
 - 开发必须完成以下公有API的内部实现：
-    1. 计算单数据基块的Reduce结果 `__aicore__ inline void Compute(...)`
-    2. 计算单UB内不同数据基块的计算结果 `__aicore__ inline void UpdateCache(...)`
-    3. 计算多核之间&同一核内的多次UB结果 `__aicore__ inline void ReduceBetweenUB(...)`
-    4. 返回非对齐场景不参与计算的尾部数据的填充值 `__aicore__ inline U GetPaddingValue()`
-
-
-### 2.2.3 内置Reduce算子模板
-`ATVC::Kernel::ReduceOpTemplate`是一套基本的Reduce算子类，它实现了一套算子数据的搬运搬出、资源分配和释放的流程。Kernel层的算子模板需要计算模板类作为模板参数传入来完成实例化。在调用阶段，算子类将按照固定参数顺序调用计算模板类的对应接口，完成数据的计算。
-相比Elementwise算子模板不同的是，ReduceOpTemplate内置了不同场景的Reduce实现，并在编译时通过`ATVC::ReducePolicy`类型的结构体来实现实例化。ReduceOpTemplate内部将根据模板参数决定数据将由哪类具体的模板实例计算。`ATVC::ReducePolicy`的数据定义如下：
-
-```cpp
-struct ReducePolicy {
-    int32_t patternID = -1;             // 描述Reduce轴与数据原本shape之间的关系
-    int32_t loopARCount = -1;           // 描述Reduce轴在多核之间的计算模式
-    int32_t loopInnerARCount = -1;      // 描述Reduce轴在单核之内的计算模式
-};
-```
-
-下方为`ATVC::Kernel::ReduceOpTemplate`模板类的外部接口介绍，完整模板类定义请参考[reduce模板](../include/reduce/reduce_op_template.h)。
-```cpp
-/*!
- * ReduceOpTemplate Generic Reduce operator template.
- * Reduce operators usually refer to operators that perform reduction operations on elements in tensors,
- * such as summation and averaging. They can specify several dimensions for reduction calculations,
- * or reduce all elements to a scalar.
- */
-template <class ReduceCompute, const auto& SelectReducePolicy, 
-            class PreCompute = void, class PostCompute = void>
-class ReduceOpTemplate {
-public:
-    __aicore__ inline ReduceOpTemplate() {};
-
-    /*!
-     * \brief The input order is: input tensor, output tensor, ReduceParam, Other scalars.
-     *        Internally schedule data based on ReduceParam and call ReduceOpTemplate to complete
-     *        the calculation before moving it out to GM.
-     * \param[in] x, GM address of the input tensor.
-     * \param[in] y, GM address of the output tensor.
-     * \param[in] param, tiling data and policy.
-     * \return void.
-     */
-    template<typename ...Args>
-    __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param)
-    {
-        // ...
-    }
-};
-```
-
-
-`ATVC::Kernel::ReduceOpTemplate`的调用样例如下：
-```cpp
-#include "atvc.h"
-// ReduceSum算子的描述：一个输入，一个输出，类型均为float
-using ReduceOpTraits =  ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
-
-template <typename Traits, const auto& Policy>
-__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam)
-{
-    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令，设置算子执行时只启动Vector核
-    // 将计算模板类模板定义作为模板参数传入， Policy由Host层的策略分派API给出
-    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceSumCompute<Traits>, Policy>();
-    op.Run(x, y, &reduceParam);
-}
-```
-
-### 2.2.4 核函数定义
-在ATVC提供了Reduce内部实现后，用户需要定义封装核函数接口。核函数功能可依赖`ATVC::Kernel::ReduceOpTemplate`的相关接口实现。
-
-基于`2.2.2`和`2.2.3`的样例代码，Kernel层的自定义API样例如下：
+    1. 计算单数据基块的Reduce结果 `__aicore__ inline void Compute(...)`。
+    2. 计算单UB内不同数据基块的计算结果 `__aicore__ inline void UpdateCache(...)`。
+    3. 计算多核之间&同一核内的多次UB结果 `__aicore__ inline void ReduceBetweenUB(...)`。
+    4. 返回非对齐场景不参与计算的尾部数据的填充值 `__aicore__ inline U GetPaddingValue()`。
+
+#### 3.2.2.2 实例化模板
+在Reduce开发流程中，用户需要定义封装核函数接口。核函数内部通过`ATVC::Kernel::ReduceOpTemplate`完成模板实例化。
+Kernel层的自定义API样例如下：
 ```cpp
 #include "atvc.h"
 
@@ -500,26 +523,26 @@ using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<flo
  * y                 Device上的gm地址，指向ReduceCustom算子第一个输出
  * reduceParam       指向运行态ATVC::ReduceParam数据
 */
-template <class OpTraits, ATVC::ReducePolicy& SelectPolicy>
-__global__ __aicore__ void ReduceSumCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam param)
+__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam)
 {
-    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV); // 使用了多核控制指令，设置算子执行时只启动Vector核
-    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceSumCompute<OpTraits>, SelectPolicy>();
-    op.Run(x, y, &param);
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令，设置算子执行时只启动Vector核
+    // 将计算模板类模板定义作为模板参数传入，Policy由Host层的策略分派API给出
+    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceSumCompute<Traits>, Policy>();
+    op.Run(x, y, &reduceParam);
 }
 ```
 <br>
 
 Reduce算子开发场景下，核函数定义必须遵从以下约束：
-1. 核函数须预留一个GM_ADDR类型的形参用于传入`ATVC::ReduceParam`运行态参数；
-2. 核函数须加入`KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0);`这段代码显示标注算子类型；
-3. 核函数须实例化`ATVC::Kernel::ReduceOpTemplate`变量并调用它的`Run(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam* param)`接口来实现数据的调度运算；
+1. 核函数须预留一个GM_ADDR类型的形参用于传入`ATVC::ReduceParam`运行态参数。
+2. 核函数须加入`KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0);`这段代码显示标注算子类型。
+3. 核函数须实例化`ATVC::Kernel::ReduceOpTemplate`变量并调用它的`Run(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam* param)`接口来实现数据的调度运算。
 
 
-### 2.2.5 Host层API
-#### 2.2.5.1 CalcReduceTiling
-`ATVC::Host::CalcReduceTiling`是ATVC在Host侧提供的针对Reduce类算子的通用Tiling API，它以算子计算模板`ATVC::OpTraits`作为模板参数，根据实际计算场景得出`ATVC::ReduceParam`以及对应算子模板实现的`ATVC::ReducePolicy`<br>
-`ATVC::ReduceParam`用于保存Reduce数据计算搬运的`ATVC::ReduceTilingData`以及临时空间的资源变量，它们的数据结构定义如下：
+#### 3.2.2.3 Policy与Param的计算与传递
+##### 3.2.2.3.1 CalcReduceTiling
+`ATVC::Host::CalcReduceTiling`是ATVC在Host侧提供的针对Reduce类算子的通用Tiling API，它以算子计算模板`ATVC::OpTraits`作为模板参数，根据实际计算场景得出`ATVC::ReduceParam`以及对应算子模板实现的`ATVC::ReducePolicy`。
+`ATVC::ReduceParam`用于保存Reduce数据计算搬运的，`ATVC::ReduceTilingData`以及临时空间的资源变量，它们的数据结构定义如下：
 
 ```cpp
 //指导Reduce算子内部的数据调度
@@ -584,9 +607,9 @@ int32_t main(int32_t argc, char* argv[])
 }
 ```
 
-#### 2.2.5.2 ReduceOpAdapter
+##### 3.2.2.3.2 ReduceOpAdapter
 在调用CalcReduceTiling接口获取动态参数`ATVC::ReducePolicy`后，仍需要在Kernel直调阶段将其转化为编译态参数来实例化`ReduceOpTemplate`模板完成调用。ATVC框架对`ATVC::ReducePolicy`这特殊的模板参数提供了一套运行态参数转编译态参数的策略分派机制`ReduceOpAdapter`，它将框架`CalcReduceTiling`的输出作为输入，替用户完成`ATVC::ReduceParam`资源的申请及释放的同时，将运行态的`ATVC::ReducePolicy`参数转化为模板参数完成自定义核函数的策略选择调用。
-基于2.2.4 章节中核函数的接口定义，ATVC框架只给出`ReduceOpAdapter`的示范代码，用户亦可根据实际计算场景替换其中的核函数调用：
+基于[3.2.2.2](#3222-实例化模板) 章节中核函数的接口定义，ATVC框架只给出`ReduceOpAdapter`的示范代码，用户亦可根据实际计算场景替换其中的核函数调用：
 
 ```cpp
 // 当前ATVC框架支持的Reduce类算子的不同模板参数
@@ -643,142 +666,167 @@ using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<flo
 }
 ```
 
-
-
-### 2.2.6 完整样例
-通过ATVC框架实现的完整ReduceSum算子定义&调用，完整样例代码见链接[样例代码](../examples/reduce_sum/reduce_sum.cpp)
-
-## 2.3 Broadcast算子开发
-ATVC框架提供的Broadcast算子模板类的模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示；开发自定义的模块用蓝色表示)：
-![broadcast_dataflow.png](images/broadcast_dataflow.png)
-Broadcast模板算子内部根据数据类型、输入/输出shape完成某个轴上数据扩充的功能，ATVC将各种计算调度场景抽象为`ATVC::BroadcastPolicy`。在算子调用阶段，分派策略API可根据Tiling API计算出的`ATVC::BroadcastPolicy`转化为编译态参数，结合计算模板来实例化`ATVC::Kernel::BroadcastOpTemplate`算子模板类。
-### 2.3.1 Components
-根据Broadcast算子在框架内部的交互场景，ATVC提供如下的接口以及模板类帮助开发搭建自定义Broadcast算子：
-![broadcast_components.png](images/broadcast_components.png)
-自定义Broadcast算子需按照以下顺序完成模块之间的组装：
-1. 自定义计算模板/使用框架内置计算模板
-2. 将计算模板传入Kernel层模板算子完成核函数功能实现；
-3. 定义Kernel层算子入口API，内部实例化计算模板类；
-
-下面将以BroadcastTo（对输入tensor在特定轴上做数据复制扩充操作）的算子搭建为样例，按照组装顺序介绍Broadcast类算子的开发流程。
-### 2.3.2 计算模板
-Broadcast计算模板是指Broadcast类算子在UB上实现将A轴的数据复制扩充到B轴上。在Kernel层的组装阶段，计算模板将作为模板参数传入`ATVC::Kernel::BroadcastOpTemplate`，并在数据计算阶段被调用。下方为对齐的代码样例：
-下方为ATVC框架内置的`ATVC::BroadcastCompute`计算模板的伪代码介绍，完整代码定义请参考[完整代码](../include/broadcast/broadcast_compute.h)。
-```cpp
-#ifndef ATVC_BROADCAST_COMPUTE_H
-#define ATVC_BROADCAST_COMPUTE_H
-#include "kernel_operator.h"
-#include "common/kernel_utils.h"
-#include "broadcast/common/broadcast_common.h"
-
-namespace ATVC {
-template <class OpTraits>
-class BroadcastCompute {
-public:
-    using inputDTypeList = typename OpTraits::In::types;
-    using DataType = typename ATVC::TypeListGet<inputDTypeList, 0>::Type;
-
-    template <int32_t patternID>
-    __aicore__ inline void Compute(AscendC::LocalTensor<DataType> &src, uint32_t inputOffset,
-                                   AscendC::LocalTensor<DataType> &dst, uint32_t dimA, uint32_t dimB)
-    {
-        // 实现AB、BA两个场景的broadcast计算
-    }
-};
-}
-#endif // ATVC_BROADCAST_COMPUTE_H
-```
-Broadcast计算模板类将在数据计算阶段被`BroadcastOpTemplate`算子模板调用，因此Broadcast计算模板类的实现必须遵从以下约束：
-
-- 该模板类的`Compute`接口主要实现将UB上某一段输入数据进行B轴上的复制扩充。
-- compute参数说明：
-    * `src`: 是存放UB上的输入数据;
-    * `inputOffset`: 表示本次compute需要计算的一段数据段位于`src`的起始位置偏移量;
-    * `dst`: 是存放计算后的UB上的输出数据;
-    * `dimA`: 表示此次计算需要处理的输入数据长度;
-    * `dimB`: 表示此次计算需要扩充的长度，输入为dimA, 输出数据量为dimA*dimB;
-- `Compute`需要完成AB和BA两种场景的计算
-    * AB场景的计算：输入`src`是一个shape为(dimA, 1)的Tensor,需要将数据扩充到`dst`上，dst的shape是(dimA, dimB);
-    * BA场景的计算：输入`src`是一个shape为(1, dimA)的Tensor,需要将src数据扩充到`dst`上，dst的shape是(dimB, dimA);
-
-- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数，如`ATVC::OpTraits<ATVC::OpInputs<float>,ATVC::OpOutputs<float>`。
-
-### 2.3.3 内置Broadcast算子模板
-`ATVC::Kernel::BroadcastOpTemplate`是一套基本的Broadcast算子类，它实现了一套算子数据的搬运搬出、资源分配和释放的流程。Kernel层的算子模板需要计算模板类作为模板参数传入来完成实例化。在调用阶段，Broadcast算子模板将按照固定参数顺序调用计算模板类的`Compute`接口，完成数据的计算。
-与Broadcast算子模板类似，Broadcast算子模板内置了不同场景的Broadcast实现，并在编译时通过`ATVC::BroadcastPolicy`类型的结构体来实现实例化。Broadcast算子模板内部将根据模板参数决定数据将由哪类具体的模板实例计算。`ATVC::BroadcastPolicy`的数据定义如下：
+### 3.2.3 Reduce模板说明
+`ATVC::Kernel::ReduceOpTemplate`是一套基本的Reduce算子类，它实现了一套算子数据的搬运搬出、资源分配和释放的流程。Kernel层的算子模板需要计算模板类作为模板参数传入来完成实例化。在调用阶段，算子类将按照固定参数顺序调用计算模板类的对应接口，完成数据的计算。
+相比Elementwise算子模板不同的是，ReduceOpTemplate内置了不同场景的Reduce实现，并在编译时通过`ATVC::ReducePolicy`类型的结构体来实现实例化。ReduceOpTemplate内部将根据模板参数决定数据将由哪类具体的模板实例计算。`ATVC::ReducePolicy`的数据定义如下：
 
 ```cpp
-struct BroadcastPolicy {
-    public:
-    int32_t patternID = -1;         // 描述需要Broadcast的轴信息
-    int32_t loopABCount = -1;       // 描述Broadcast在多核上的切分信息
-    int32_t loopInnerABCount = -1;  // 描述Broadcast在UB间的切分信息
+struct ReducePolicy {
+    int32_t patternID = -1;             // 描述Reduce轴与数据原本shape之间的关系
+    int32_t loopARCount = -1;           // 描述Reduce轴在多核之间的计算模式
+    int32_t loopInnerARCount = -1;      // 描述Reduce轴在单核之内的计算模式
 };
 ```
 
-下方为`ATVC::Kernel::BroadcastOpTemplate`模板类的外部接口介绍，完整模板类定义请参考[`atvc/include/broadcast/broadcast_op_template.h`](../include/broadcast/broadcast_op_template.h)。
+下方为`ATVC::Kernel::ReduceOpTemplate`模板类的外部接口介绍，完整模板类定义请参考[Reduce模板](../include/reduce/reduce_op_template.h)。
 ```cpp
-#ifndef ATVC_BROADCAST_OP_TEMPLATE_H
-#define ATVC_BROADCAST_OP_TEMPLATE_H
-#include "common/const_def.h"
-#include "broadcast/utils/broadcast_buf_pool.h"
-namespace ATVC {
-namespace Kernel {
 /*!
- * BroadcastCompute: Used to implement element wise operations between tensors when their shapes are inconsistent.
- * By copying data in a dimension of length 1, the shapes of two tensors are aligned to support element wise
- * addition operations.
-*/
-template <class BroadcastCompute, const auto& SelectBroadcastPolicy, class PreCompute = void, class PostCompute = void>
-class BroadcastOpTemplate {
+ * ReduceOpTemplate Generic Reduce operator template.
+ * Reduce operators usually refer to operators that perform reduction operations on elements in tensors,
+ * such as summation and averaging. They can specify several dimensions for reduction calculations,
+ * or reduce all elements to a scalar.
+ */
+template <class ReduceCompute, const auto& SelectReducePolicy, 
+            class PreCompute = void, class PostCompute = void>
+class ReduceOpTemplate {
 public:
-    using DataType = typename BroadcastCompute::DataType;
-    __aicore__ inline BroadcastOpTemplate() {}
+    __aicore__ inline ReduceOpTemplate() {};
 
     /*!
-     * \brief The external running interface of BroadcastOpTemplate mainly completes resource initialization, 
-     *        data migration, calculation scheduling and data migration operations
-     * \param src, GM pointer for input data
-     * \param dst, GM pointer for output data
-     * \param broadcastParam, dynamic parameters of broadcast, including tiling data, workspace, etc
+     * \brief The input order is: input tensor, output tensor, ReduceParam, Other scalars.
+     *        Internally schedule data based on ReduceParam and call ReduceOpTemplate to complete
+     *        the calculation before moving it out to GM.
+     * \param[in] x, GM address of the input tensor.
+     * \param[in] y, GM address of the output tensor.
+     * \param[in] param, tiling data and policy.
+     * \return void.
      */
     template<typename ...Args>
-    __aicore__ inline void Run(Args&&... args)
+    __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param)
     {
         // ...
     }
-
-    AscendC::GlobalTensor<DataType> srcGlobal_;
-    AscendC::GlobalTensor<DataType> dstGlobal_;
-    BroadcastCompute compute_;
-    __gm__ BroadcastParam *param_;
 };
-} // namespace Kernel
-} // namespace ATVC
-#endif // ATVC_BROADCAST_OP_TEMPLATE_H
 ```
 
 
-`ATVC::Kernel::BroadcastOpTemplate`的调用样例如下：
+`ATVC::Kernel::ReduceOpTemplate`的调用样例如下：
+```cpp
+#include "atvc.h"
+// ReduceSum算子的描述：一个输入，一个输出，类型均为float
+using ReduceOpTraits =  ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+
+template <typename Traits, const auto& Policy>
+__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令，设置算子执行时只启动Vector核
+    // 将计算模板类模板定义作为模板参数传入，Policy由Host层的策略分派API给出
+    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceSumCompute<Traits>, Policy>();
+    op.Run(x, y, &reduceParam);
+}
+```
+
+
+
+## 3.3 Broadcast算子开发
+
+### 3.3.1 Broadcast模板概述
+ATVC框架提供的Broadcast算子模板类的模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示；开发自定义的模块用蓝色表示)：
+![broadcast_dataflow.png](images/broadcast_dataflow.png)
+Broadcast模板算子内部根据数据类型、输入/输出Shape完成某个轴上数据扩充的功能，ATVC将各种计算调度场景抽象为`ATVC::BroadcastPolicy`。在算子调用阶段，分派策略API可根据Tiling API计算出的`ATVC::BroadcastPolicy`转化为编译态参数，结合计算模板来实例化`ATVC::Kernel::BroadcastOpTemplate`算子模板类。
+
+根据Broadcast算子在框架内部的交互场景，ATVC提供如下的接口以及模板类帮助开发搭建自定义Broadcast算子：
+![broadcast_components.png](images/broadcast_components.png)
+自定义Broadcast算子需按照以下顺序完成模块之间的组装：
+1. 自定义计算模板/使用框架内置计算模板。
+2. 将计算模板传入Kernel层模板算子完成核函数功能实现。
+3. 定义Kernel层算子入口API，内部实例化计算模板类。
+
+
+### 3.3.2 Broadcast算子开发步骤
+下面将以BroadcastTo（对输入Tensor在特定轴上做数据复制扩充操作）的算子搭建为样例，按照组装顺序介绍Broadcast类算子的关键步骤。通过ATVC框架实现的完整BroadcastCustom算子样例代码见[样例代码](../examples/broadcast_to/broadcast_to.cpp)：
 ```cpp
-#include "atvc.h"
 // BroadcastTo算子的描述：一个输入，一个输出，类型均为float
-using BroadcastOpTraits =  ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
 
-template <typename Traits, const auto& Policy>
+/*
+ * 该函数为BroadcastCustom算子核函数入口
+ * x                 Device上的gm地址，指向BroadcastCustom算子第一个输入
+ * y                 Device上的gm地址，指向BroadcastCustom算子第一个输出
+ * broadcastParam    指向运行态ATVC::BroadcastParam数据
+*/
+template<typename Traits, const auto& Policy>
 __global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam broadcastParam)
 {
-    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // 设置算子执行时只启动Vector核
+    ...
     // 将计算模板类模板定义作为模板参数传入，Policy由Host层的策略分派API给出
     auto op = ATVC::Kernel::BroadcastOpTemplate<ATVC::BroadcastCompute<Traits>, Policy>();
-    op.Run(x, y, &broadcastParam);
+    ATVC::BroadcastParam *param = &broadcastParam;
+    op.Run(x, y, param);
+}
+
+int32_t main(int32_t argc, char* argv[])
+{
+    ...
+    ATVC::BroadcastParam param;    // Broadcast运行态参数，包含TilingData以及临时空间的相关信息
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast运行态参数，负责映射最适合的Broadcast模板实现
+    // Host侧调用Tiling API完成相关运行态参数的运算
+    if (!ATVC::Host::CalcBroadcastTiling<BroadcastOpTraits>(shapeIn, shapeOut, &policy, &param)) {
+        printf("Broadcast tiling error.\n");
+        return -1;
+    };
+    // 调用Adapter调度接口，完成核函数的模板调用
+    BroadcastOpAdapter<BroadcastOpTraits>(xDevice, yDevice, param, policy, stream);
+    ...
+}
+
+```
+
+#### 3.3.2.1 实现计算逻辑
+Broadcast计算模板是指Broadcast类算子在UB上实现将A轴的数据复制扩充到B轴上。在Kernel层的组装阶段，计算模板将作为模板参数传入`ATVC::Kernel::BroadcastOpTemplate`，并在数据计算阶段被调用。
+下方为ATVC框架内置的`ATVC::BroadcastCompute`计算模板的伪代码介绍，完整代码定义请参考[完整代码](../include/broadcast/broadcast_compute.h)。
+```cpp
+#ifndef ATVC_BROADCAST_COMPUTE_H
+#define ATVC_BROADCAST_COMPUTE_H
+#include "kernel_operator.h"
+#include "common/kernel_utils.h"
+#include "broadcast/common/broadcast_common.h"
+
+namespace ATVC {
+template <class OpTraits>
+class BroadcastCompute {
+public:
+    using inputDTypeList = typename OpTraits::In::types;
+    using DataType = typename ATVC::TypeListGet<inputDTypeList, 0>::Type;
+
+    template <int32_t patternID>
+    __aicore__ inline void Compute(AscendC::LocalTensor<DataType> &src, uint32_t inputOffset,
+                                   AscendC::LocalTensor<DataType> &dst, uint32_t dimA, uint32_t dimB)
+    {
+        // 实现AB、BA两个场景的broadcast计算
+    }
+};
 }
+#endif // ATVC_BROADCAST_COMPUTE_H
 ```
+Broadcast计算模板类将在数据计算阶段被`BroadcastOpTemplate`算子模板调用，因此Broadcast计算模板类的实现必须遵从以下约束：
+
+- 该模板类的`Compute`接口主要实现将UB上某一段输入数据进行B轴上的复制扩充。
+- `Compute`参数说明：
+    * `src`: 是存放UB上的输入数据。
+    * `inputOffset`: 表示本次compute需要计算的一段数据段位于`src`的起始位置偏移量。
+    * `dst`: 是存放计算后的UB上的输出数据。
+    * `dimA`: 表示此次计算需要处理的输入数据长度。
+    * `dimB`: 表示此次计算需要扩充的长度，输入为dimA, 输出数据量为dimA*dimB。
+- `Compute`需要完成AB和BA两种场景的计算：
+    * AB场景的计算：输入`src`是一个Shape为(dimA, 1)的Tensor，需要将数据扩充到`dst`上，dst的Shape是(dimA, dimB)。
+    * BA场景的计算：输入`src`是一个Shape为(1, dimA)的Tensor，需要将src数据扩充到`dst`上，dst的Shape是(dimB, dimA)。
 
-### 2.3.4 核函数定义
-在ATVC提供了Broadcast内部实现后，用户需要定义封装核函数接口。核函数功能可依赖`ATVC::Kernel::BroadcastOpTemplate`的相关接口实现。
+- 该模板类在实例化时固定传入`ATVC::OpTraits`类型的结构体作为模板参数，如`ATVC::OpTraits<ATVC::OpInputs<float>,ATVC::OpOutputs<float>`。
 
-基于`2.3.2`和`2.3.3`的样例代码，Kernel层的自定义API样例如下：
+#### 3.3.2.2 实例化模板
+在ATVC提供了Broadcast内部实现后，用户需要定义封装核函数接口。核函数内部通过`ATVC::Kernel::BroadcastOpTemplate`完成模板实例化。
+Kernel层的自定义API样例如下：
 ```cpp
 #include "atvc.h"
 
@@ -803,13 +851,13 @@ __global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, ATVC::Broadcast
 <br>
 
 Broadcast算子开发场景下，核函数定义必须遵从以下约束：
-1. 核函数须预留一个GM_ADDR类型的形参用于传入`ATVC::BroadcastParam`运行态参数；
-2. 核函数须加入`KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);`这段代码显示标注算子类型；
-3. 核函数须实例化`ATVC::Kernel::BroadcastOpTemplate`变量,实例化时需传入对应的计算实现模板类`ATVC::BroadcastCompute`,并调用它的`Run(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam* broadcastParam)`接口来实现数据的调度运算；
+1. 核函数须预留一个GM_ADDR类型的形参用于传入`ATVC::BroadcastParam`运行态参数。
+2. 核函数须加入`KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);`这段代码显示标注算子类型。
+3. 核函数须实例化`ATVC::Kernel::BroadcastOpTemplate`变量,实例化时需传入对应的计算实现模板类`ATVC::BroadcastCompute`，并调用它的`Run(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam* broadcastParam)`接口来实现数据的调度运算。
 
-### 2.3.5 Host层API
-#### 2.3.5.1 CalcBroadcastTiling
-`ATVC::Host::CalcBroadcastTiling`是ATVC在Host侧提供的针对Broadcast类算子的通用Tiling API，它以`ATVC::OpTraits`作为模板参数，根据实际输入输出shape计算出对应算子模板需要的静态编译参数`ATVC::BroadcastPolicy`<br>和动态参数`ATVC::BroadcastParam`，其中`ATVC::BroadcastParam`包含了数据计算搬运的`ATVC::BroadcastTilingData`，它们的数据结构定义如下：
+#### 3.3.2.3 Policy与Param的计算与传递
+##### 3.3.2.3.1 CalcBroadcastTiling
+`ATVC::Host::CalcBroadcastTiling`是ATVC在Host侧提供的针对Broadcast类算子的通用Tiling API，它以`ATVC::OpTraits`作为模板参数，根据实际输入输出Shape计算出对应算子模板需要的静态编译参数`ATVC::BroadcastPolicy`<br>和动态参数`ATVC::BroadcastParam`，其中`ATVC::BroadcastParam`包含了数据计算搬运的`ATVC::BroadcastTilingData`，它们的数据结构定义如下：
 
 ```cpp
 //指导Broadcast算子模板内部的数据调度
@@ -879,7 +927,7 @@ using BroadcastOpTraits =  ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs
 int32_t main(int32_t argc, char* argv[])
 {
     std::vector<int64_t> shapeIn{1, 1024};    // 测试输入shape
-    std::vector<int64_t> shapeOut{8, 1024};    // 测试输入shape
+    std::vector<int64_t> shapeOut{8, 1024};    // 测试输出shape
     ATVC::BroadcastParam param;    // Broadcast运行态参数，包含TilingData以及临时空间的相关信息
     ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast运行态参数，负责映射最适合的Broadcast模板实现
     // Host侧调用Tiling API完成相关运行态参数的运算
@@ -892,9 +940,9 @@ int32_t main(int32_t argc, char* argv[])
     ...
 }
 ```
-#### 2.3.5.2 BroadcastOpAdapter
+##### 3.3.2.3.2 BroadcastOpAdapter
 在调用CalcBroadcastTiling接口获取动态参数`ATVC::BroadcastPolicy`后，仍需要在Kernel直调阶段将其转化为编译态参数来实例化`BroadcastOpTemplate`模板完成调用。ATVC框架对`ATVC::BroadcastPolicy`这特殊的模板参数提供了一套运行态参数转编译态参数的策略分派机制`BroadcastOpAdapter`，它将框架`CalcBroadcastTiling`的计算得到的输出作为输入，替用户完成`ATVC::BroadcastParam`资源的申请及释放的同时，将运行态的`ATVC::BroadcastPolicy`参数转化为模板参数完成自定义核函数的策略选择调用。
-基于2.3.3 章节中核函数的接口定义，这里只给出`BroadcastOpAdapter`的示范代码，用户亦可根据实际计算场景替换其中的核函数调用：
+基于[3.3.2.2](#3322-实例化模板)章节中核函数的接口定义，这里只给出`BroadcastOpAdapter`的示范代码，用户亦可根据实际计算场景替换其中的核函数调用：
 
 ```cpp
 // 负责Broadcast类算子的调度，选择对应的Policy最佳策略并执行Kernel函数
@@ -941,45 +989,133 @@ int32_t main(int32_t argc, char* argv[])
 }
 
 ```
+### 3.3.3 Broadcast模板说明
+`ATVC::Kernel::BroadcastOpTemplate`是一套基本的Broadcast算子类，它实现了一套算子数据的搬运搬出、资源分配和释放的流程。Kernel层的算子模板需要计算模板类作为模板参数传入来完成实例化。在调用阶段，Broadcast算子模板将按照固定参数顺序调用计算模板类的`Compute`接口，完成数据的计算。
+与Broadcast算子模板类似，Broadcast算子模板内置了不同场景的Broadcast实现，并在编译时通过`ATVC::BroadcastPolicy`类型的结构体来实现实例化。Broadcast算子模板内部将根据模板参数决定数据将由哪类具体的模板实例计算。`ATVC::BroadcastPolicy`的数据定义如下：
+
+```cpp
+struct BroadcastPolicy {
+    public:
+    int32_t patternID = -1;         // 描述需要Broadcast的轴信息
+    int32_t loopABCount = -1;       // 描述Broadcast在多核上的切分信息
+    int32_t loopInnerABCount = -1;  // 描述Broadcast在UB间的切分信息
+};
+```
+
+下方为`ATVC::Kernel::BroadcastOpTemplate`模板类的外部接口介绍，完整模板类定义请参考[`broadcast_op_template.h`](../include/broadcast/broadcast_op_template.h)。
+```cpp
+#ifndef ATVC_BROADCAST_OP_TEMPLATE_H
+#define ATVC_BROADCAST_OP_TEMPLATE_H
+#include "common/const_def.h"
+#include "broadcast/utils/broadcast_buf_pool.h"
+namespace ATVC {
+namespace Kernel {
+/*!
+ * BroadcastCompute: Used to implement element wise operations between tensors when their shapes are inconsistent.
+ * By copying data in a dimension of length 1, the shapes of two tensors are aligned to support element wise
+ * addition operations.
+*/
+template <class BroadcastCompute, const auto& SelectBroadcastPolicy, class PreCompute = void, class PostCompute = void>
+class BroadcastOpTemplate {
+public:
+    using DataType = typename BroadcastCompute::DataType;
+    __aicore__ inline BroadcastOpTemplate() {}
+
+    /*!
+     * \brief The external running interface of BroadcastOpTemplate mainly completes resource initialization, 
+     *        data migration, calculation scheduling and data migration operations
+     * \param src, GM pointer for input data
+     * \param dst, GM pointer for output data
+     * \param broadcastParam, dynamic parameters of broadcast, including tiling data, workspace, etc
+     */
+    template<typename ...Args>
+    __aicore__ inline void Run(Args&&... args)
+    {
+        // ...
+    }
+
+    AscendC::GlobalTensor<DataType> srcGlobal_;
+    AscendC::GlobalTensor<DataType> dstGlobal_;
+    BroadcastCompute compute_;
+    __gm__ BroadcastParam *param_;
+};
+} // namespace Kernel
+} // namespace ATVC
+#endif // ATVC_BROADCAST_OP_TEMPLATE_H
+```
+
 
+`ATVC::Kernel::BroadcastOpTemplate`的调用样例如下：
+```cpp
+#include "atvc.h"
+// BroadcastTo算子的描述：一个输入，一个输出，类型均为float
+using BroadcastOpTraits =  ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
 
-### 2.3.6 完整样例
-通过ATVC框架实现的完整BroadcastCustom算子定义&调用，完整样例代码见链接[样例代码](../examples/broadcast_to/broadcast_to.cpp)
+template <typename Traits, const auto& Policy>
+__global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam broadcastParam)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // 设置算子执行时只启动Vector核
+    // 将计算模板类模板定义作为模板参数传入，Policy由Host层的策略分派API给出
+    auto op = ATVC::Kernel::BroadcastOpTemplate<ATVC::BroadcastCompute<Traits>, Policy>();
+    op.Run(x, y, &broadcastParam);
+}
+```
 
-## 2.4 组合算子开发
+## 3.4 Elementwise + Broadcast组合算子开发
+### 3.4.1 组合模板概述
 ATVC框架支持Broadcast与Elementwise组合的算子通过扩展BroadcastOpTemplate的模板参数对用户提供接口，开发者可以根据算子实际需求来定制组合，框架支持以下组合：Elementwise + Broadcast、Broadcast + Elementwise、Elementwise + Broadcast + Elementwise。下面以Broadcast与Elementwise组合为例进行详细讲解。
 
 组合算子模板类的模块之间的交互如下（ATVC框架提供的模板及接口用黄色表示；开发自定义的模块用蓝色表示）：
 
 ![](images/broadcast_fusion_dataflow.png)
 
-组合算子模板内部根据计算的数据大小，shape完成了不同计算调度代码的实现，考虑到Broadcast的tiling复杂度，组合算子的计算调度场景复用Broadcast的调度策略`ATVC::BroadcastPolicy`。在算子调用阶段，分派策略API可根据Tiling API计算出的`ATVC::BroadcastPolicy`转化为编译态参数，结合定制的Elementwise和Broadcast计算模板来实例化`ATVC::Kernel::BroadcastOpTemplate`算子模板类。
-
-### 2.4.1 Components
+组合算子模板内部根据计算的数据大小，Shape完成了不同计算调度代码的实现，考虑到Broadcast的Tiling复杂度，组合算子的计算调度场景复用Broadcast的调度策略`ATVC::BroadcastPolicy`。在算子调用阶段，分派策略API可根据Tiling API计算出的`ATVC::BroadcastPolicy`转化为编译态参数，结合定制的Elementwise和Broadcast计算模板来实例化`ATVC::Kernel::BroadcastOpTemplate`算子模板类。
 根据组合算子在框架内部的交互场景，ATVC提供如下的接口及模板类帮助开发搭建自定义Broadcast与Elementwise组合算子：
 
 需按照以下顺序完成模块之间的组装：
 
-1.自定义前置或后置Elementwise计算模板
+1.自定义前置或后置Elementwise计算模板。
 
 2.自定义Broadcast计算模板/使用框架内置Broadcast计算模板，并组合Elementwise计算模板。
 
-3.将计算模板传入Kernel层模板算子完成核函数功能实现；
+3.将计算模板传入Kernel层模板算子完成核函数功能实现。
 
-4.定义Kernel层算子入口API， 内部实例化计算模板类；
+4.定义Kernel层算子入口API， 内部实例化计算模板类。
 
 下面将以Add算子（Broadcast + Elementwise, 为区别Add单算字，命名为AddWithBroadcast算子）搭建为样例，按照组装顺序介绍组合算子类的开发流程。
 
-### 2.4.2 计算模板
-组合计算模板复用已有的Elementwise计算模板（详见[2.1.2章节](#212-计算模板)）和Broadcst计算模板（参见[2.3.2章节](#232-计算模板)），具体使用方法和约束参考对应章节。
+### 3.4.2 组合算子开发步骤
+下面是用户利用Components模板实现自定义算子所需要实现的关键步骤，完整样例见[add_with_broadcast](..\examples\add_with_broadcast\add_with_broadcast.cpp) :
+```cpp
+// AddWithBroadcast算子的描述：两个输入，一个输出，类型均为float
+using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float>>;
+int32_t main(int32_t argc, char* argv[])
+{
+    ...
+    ATVC::BroadcastParam param;    // Broadcast运行态参数，包含TilingData以及临时空间的相关信息
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast运行态参数，负责映射最适合的Broadcast模板实现
+    // Host侧调用Tiling API完成相关运行态参数的运算
+    param.nBufferNum = 1;
+    if (!ATVC::Host::CalcBroadcastTiling<BroadcastOpTraits>(shapeIn, shapeOut, &policy, &param)) {
+        printf("Broadcast tiling error.\n");
+        return -1;
+    };
+    // 调用Adapter调度接口，完成核函数的模板调用
+    BroadcastOpAdapter<BroadcastOpTraits>(xDevice, yDevice, zDevice, param, policy);
+    ...
+}
+```
+
+#### 3.4.2.1 实现计算逻辑
+组合计算模板复用已有的Elementwise计算模板（详见[3.1.2章节](#312-elementwise算子开发步骤)）和Broadcast计算模板（参见[3.3.2章节](#332-broadcast算子开发步骤)），具体使用方法和约束参考对应章节。
 
 根据实际的算子诉求，构建1个或2个Elementwise计算模板，与1个Broadcast计算模板，作为模板参数传入`ATVC：：BroadcastOpTemplate`中，并在数据计算以及同步阶段被调用。
 
 存在Broadcast的组合计算模板设计多核之间的数据结果同步以及核内分块的对其计算，用户自定义难度较高，因此ATVC框架提供了Broadcast的内置计算模板，并实现了Broadcast在单核内与多核间的计算与同步等函数接口。
-#### 2.4.2.1 Elementwise计算模板
-前置或后置Elementwise计算模板除了需要满足基本计算模板的要求外，还需要定义两个额外接口SetArgs和SetParam，分别用来接受组合算子的向量参数和标量参数。
+##### 3.4.2.1.1 实现Elementwise计算模板
+前置或后置Elementwise计算模板除了需要满足基本计算模板的要求外，还需要定义两个额外接口`SetArgs`和`SetParam`，分别用来接受组合算子的向量参数和标量参数。
 
-以PostCompute为例， Elementwise计算模板定义如下：
+以`PostCompute`为例， Elementwise计算模板定义如下：
 
 ```cpp
 template<typename Traits>
@@ -1015,17 +1151,17 @@ struct PostComputeAddOfBroadcast {
 };
 ```
 
-Elementwise计算模板需要定义三个接口， 分别为：
+Elementwise计算模板需要定义三个接口，分别为：
 
-1、SetParam函数，入参为可变参数，用户调用Kernel函数传递的scaler参数经过Broadcast模板的分拣，会通过SetParam函数传递给每一个计算函数模板。
+1、`SetParam`函数，入参为可变参数，用户调用Kernel函数传递的scaler参数经过Broadcast模板的分拣，会通过`SetParam`函数传递给每一个计算函数模板。
 
-2、SetArgs函数，入参为可变参数，用户调用Kernel函数传递的向量参数经过Broadcast模板的分拣，会根据OpTrats计算每个计算模板需要的参数个数，并通过SetArgs函数按顺序传递给对应的计算模板。
+2、`SetArgs`函数，入参为可变参数，用户调用Kernel函数传递的向量参数经过Broadcast模板的分拣，会根据`OpTraits`计算每个计算模板需要的参数个数，并通过`SetArgs`函数按顺序传递给对应的计算模板。
 
-3、()函数，入参为可变参数。用户调用Kernel函数后，Broadcast模板会判断用户是否有定义PreCompute或PostCompute函数，并用PreCompute替换CopyIn操作，用PostCompute替换CopyOut操作。该函数的参数按顺序主要分为3部分：
+3、`()`函数，入参为可变参数。用户调用Kernel函数后，Broadcast模板会判断用户是否有定义`PreCompute`或`PostCompute`函数，并用`PreCompute`替换`CopyIn`操作，用`PostCompute`替换`CopyOut`操作。该函数的参数按顺序主要分为3部分：
 
-* 计算模板所需要的LocalTensor，包括输入输出和临时Buffer，内存由Broadcast模板申请，用户可以直接使用，参数的个数模板的所有参数个数-1（剩下一个参数必为Broadcast模板的输入或者输出，不需要额外申请）
+* 计算模板所需要的LocalTensor，包括输入输出和临时Buffer，内存由Broadcast模板申请，用户可以直接使用，参数的个数模板的所有参数个数-1（剩下一个参数必为Broadcast模板的输入或者输出，不需要额外申请）。
 * 单个LocalTensor，复用Broadcast模板的输入或者输出。  
-* CopyIn或CopyOut的参数，包括offset和DataCopyParam，为原来CopyIn或CopyOut操作时， 调用AscendC::DataCopy接口所传递的参数， 用户可以在完成自定义计算后，直接使用该参数做DataCopy操作。
+* `CopyIn`或`CopyOut`的参数，包括offset和DataCopyParam，为原来`CopyIn`或`CopyOut`操作时， 调用`AscendC::DataCopy`接口所传递的参数， 用户可以在完成自定义计算后，直接使用该参数做`DataCopy`操作。
   
 
 完整实现见[post_compute_add_of_broadcast.h](../examples/add_with_broadcast/post_compute_add_of_broadcast.h)。
@@ -1033,11 +1169,72 @@ Elementwise计算模板需要定义三个接口， 分别为：
 
 
 
-#### 2.4.2.2 Broadcast计算模板
+##### 3.4.2.1.2 实现Broadcast计算模板
 Broadcast计算模板在组合算子中与Broadcast单算子无任何区别，可直接复用单算子计算模板或使用内置计算模板。
 
-### 2.4.3 BroadcastOpTemplate模板
-Broadcast与Elementwise组合的算子模板以BroadcastOpTemplate为基础进行扩展,`BroadcastOpTemplate`的介绍可以参考章节[2.3.3](#233-内置broadcast算子模板)。下面为组合算子场景`ATVC::Kernel::BroadcastOpTemplate`新引入的接口或定义，以及调用计算模板函数的示意代码，完整模板定义请参考[`atvc/broadcast/broadcast_op_template.h`](../include/broadcast/broadcast_op_template.h)：
+#### 3.4.2.2 实例化模板
+在ATVC提供了Broadcast内部实现后，用户需要定义封装核函数接口。核函数内部通过`ATVC::Kernel::BroadcastOpTemplate`完成模板实例化。
+Kernel层的自定义API样例如下：
+
+```cpp
+// AddWithBroadcast算子的描述：两个输入，一个输出，类型均为float
+using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float>>;
+
+/* !
+ * \brief z = x + y, the shape of x must be able to be broadcasted to the shape of y
+ * \param [in] x, input global memory of x
+ * \param [in] y, input global memory of y
+ * \param [out] z, output global memory
+ * \param [in] broadcastParam, params of broadcast
+ */
+template <typename Traits, const auto& Policy>
+__global__ __aicore__ void AddWithBroadcastCustom(GM_ADDR x,
+                                                  GM_ADDR y,
+                                                  GM_ADDR z,
+                                                  ATVC::BroadcastParam broadcastParam)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+
+    // 1. get input and output for kernel op from host Traits
+    using KernelOpIn = typename Traits::In::types;
+    using KernelOpOut = typename Traits::Out::types;
+    using KernelOpTemp = typename Traits::Temp::types;
+
+    // 2. define input and output for broadcast
+    using BroadcastOpInput = ATVC::OpInputs<typename ATVC::TypeListGet<KernelOpIn, 0>::Type>;
+    using BroadcastOpOutput = ATVC::OpOutputs<typename ATVC::TypeListGet<KernelOpTemp, 0>::Type>;
+    using BroadcastOpTraits =  ATVC::OpTraits<BroadcastOpInput, BroadcastOpOutput>;
+
+    // 3. define input and output for post compute
+    using AddOpInput = ATVC::OpInputs<typename ATVC::TypeListGet<KernelOpTemp, 0>::Type, typename ATVC::TypeListGet<KernelOpIn, 1>::Type>;
+    using AddOpOutput = ATVC::OpOutputs<typename ATVC::TypeListGet<KernelOpOut, 0>::Type>;
+    using AddOpTraits = ATVC::OpTraits<AddOpInput, AddOpOutput>;
+    using PostCompute = PostComputeAddOfBroadcast<AddOpTraits>;
+
+    // 4. call op run
+    auto op = ATVC::Kernel::BroadcastOpTemplate<ATVC::BroadcastCompute<BroadcastOpTraits>, Policy, void, PostCompute>();
+    ATVC::BroadcastParam *param = &broadcastParam;
+    op.Run(x, y, z, param);
+}
+```
+
+在组合算子中，用户定义的`OpTraits`为组合算子的整体定义，核函数内部根据算子的组合形式，用组合算子的`OpTraits`定义来表达Elementwise和Broadcast算子的`Optraits`定义。例如，Broadcast计算函数的`OpTraits`定义就表示：Broadcast的输入是组合算子的第一个输入，Broadcast算子的输出组合算子的临时资源。
+
+#### 3.4.2.3 Policy与Param的计算与传递
+##### 3.4.2.3.1 CalcBroadcastTiling
+组合类算子的TilingAPI可以复用`ATVC::Host::CalcBroacastTiling`功能，在此框架上引入`PreCompute`和`PostCompute`对应的`Optraits`。具体信息参考[3.3.2.3.1](#33231-calcbroadcasttiling)章节。在计算基本快所需UB空间时，从`PreCompute`和`PostCompute`的`Optraits`萃取对于UB空间的需求，确保不会出现溢出的情况。简单的，如果用户不用对Tiling做扩展，可以直接用组合算子的`Optraits`来计算Tiling，不需要分别传单算子的`Optraits`。
+
+```cpp
+// AddWithBroadcast算子的描述：两个输入，一个输出，类型均为float
+using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float>>;
+ATVC::Host::CalcBroadcastTiling<BroadcastOpTraits>(shapeIn, shapeOut, &policy, &param);
+```
+
+##### 3.4.2.3.2 BroadcastAdapter
+`BroadcastAdapter`的介绍可参考章节[3.3.2.3.2](#33232-broadcastopadapter)。
+
+### 3.4.3 BroadcastOpTemplate模板说明
+Broadcast与Elementwise组合的算子模板以BroadcastOpTemplate为基础进行扩展，`BroadcastOpTemplate`的介绍可以参考章节[3.3.3](#333-broadcast模板说明)。下面为组合算子场景`ATVC::Kernel::BroadcastOpTemplate`新引入的接口或定义，以及调用计算模板函数的示意代码，完整模板定义请参考[`broadcast_op_template.h`](../include/broadcast/broadcast_op_template.h)：
 
 ```cpp
 /*!
@@ -1098,75 +1295,9 @@ class BroadcastOpTemplate {
 };
 ```
 
-### 2.4.4 核函数定义
-在ATVC提供了Broadcast内部实现后，用户需要定义封装核函数接口。核函数功能可依赖`ATVC::Kernel::BroadcastOpTemplate`的相关接口实现。
-
-基于`2.4.2`和`2.4.3`的样例代码，Kernel层的自定义API样例如下：
-
-```cpp
-// AddWithBroadcast算子的描述：两个输入，一个输出，类型均为float
-using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float>>;
-
-/* !
- * \brief z = x + y, the shape of x must be able to be broadcasted to the shape of y
- * \param [in] x, input global memory of x
- * \param [in] y, input global memory of y
- * \param [out] z, output global memory
- * \param [in] broadcastParam, params of broadcast
- */
-template <typename Traits, const auto& Policy>
-__global__ __aicore__ void AddWithBroadcastCustom(GM_ADDR x,
-                                                  GM_ADDR y,
-                                                  GM_ADDR z,
-                                                  ATVC::BroadcastParam broadcastParam)
-{
-    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
-
-    // 1. get input and output for kernel op from host Traits
-    using KernelOpIn = typename Traits::In::types;
-    using KernelOpOut = typename Traits::Out::types;
-    using KernelOpTemp = typename Traits::Temp::types;
-
-    // 2. define input and output for broadcast
-    using BroadcastOpInput = ATVC::OpInputs<typename ATVC::TypeListGet<KernelOpIn, 0>::Type>;
-    using BroadcastOpOutput = ATVC::OpOutputs<typename ATVC::TypeListGet<KernelOpTemp, 0>::Type>;
-    using BroadcastOpTraits =  ATVC::OpTraits<BroadcastOpInput, BroadcastOpOutput>;
-
-    // 3. define input and output for post compute
-    using AddOpInput = ATVC::OpInputs<typename ATVC::TypeListGet<KernelOpTemp, 0>::Type, typename ATVC::TypeListGet<KernelOpIn, 1>::Type>;
-    using AddOpOutput = ATVC::OpOutputs<typename ATVC::TypeListGet<KernelOpOut, 0>::Type>;
-    using AddOpTraits = ATVC::OpTraits<AddOpInput, AddOpOutput>;
-    using PostCompute = PostComputeAddOfBroadcast<AddOpTraits>;
-
-    // 4. call op run
-    auto op = ATVC::Kernel::BroadcastOpTemplate<ATVC::BroadcastCompute<BroadcastOpTraits>, Policy, void, PostCompute>();
-    ATVC::BroadcastParam *param = &broadcastParam;
-    op.Run(x, y, z, param);
-}
-```
-
-在组合算子中，用户定义的OpTraits为组合算子的整体定义，核函数内部根据算子的组合形式，用组合算子的OpTraits定义来表达Elementwise和Broadcast算子的Optraits定义。例如，Broadcast计算函数的OpTraits定义就表示：broadcast的输入是组合算子的第一个输入，broadcast算子的输出组合算子的临时资源。
-
-
-### 2.4.5 Host层API
-#### 2.4.5.1 CalcBroadcastTiling
-组合类算子的TilingAPI可以复用`ATVC::Host::CalcBroacastTiling`功能，在此框架上引入PreCompute和PostCompute对应的OpTraits。具体信息参考[2.2.4](#224-核函数定义)章节。在计算基本快所需UB空间时，从PreCompute和PostCompute的OpTraits萃取对于UB空间的需求，确保不会出现溢出的情况。简单的，如果用户不用对Tiling做扩展，可以直接用组合算子的OpTraits来计算Tiling，不需要分别传单算子的OpTraits。
-
-```cpp
-// AddWithBroadcast算子的描述：两个输入，一个输出，类型均为float
-using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>, ATVC::OpTemps<float>>;
-ATVC::Host::CalcBroadcastTiling<BroadcastOpTraits>(shapeIn, shapeOut, &policy, &param);
-```
-
-#### 2.4.5.2 BroadcastAdapter
-`BroadcastAdapter`的介绍可参考章节2.3.5.2。
-
-### 2.4.6 完整样例
-参考sample [add_with_broadcast](../examples/add_with_broadcast/README.md)
-
-# 3 ATVC的调试调优功能
+# 4 ATVC的调试调优功能
 为了用户在使用ATVC进行算子开发时能快速进行精度调试和性能调优，ATVC支持多种调试调优能力。
-## 3.1 OpTraits校验接口
+## 4.1 OpTraits校验接口
 用户可通过`DebugCheck()`接口校验不同模板的OpTraits功能, 接口在Host侧调用，无需额外的开关限制,接口定义如下：
 ```cpp
 namespace ATVC {
@@ -1198,18 +1329,18 @@ using AddOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs
 
 ATVC::Host::DebugCheck<AddOpTraits, ATVC::TemplateType::ELE_WISE>();
 ```
-完整的DebugCheck调用接口样例可参考tanh_grad算子[样例代码](../examples/tanh_grad/tanh_grad.cpp)。
-## 3.2 使用调试调优模式运行算子
+完整的`DebugCheck`调用接口样例可参考tanh_grad算子[样例代码](../examples/tanh_grad/tanh_grad.cpp)。
+## 4.2 使用调试调优模式运行算子
 样例执行脚本run_examples.sh支持可选入参`--run-mode`进行不同调试调优运行模式的选择。
 当前支持`debug_print`和`profiling`两种模式。
 - `--run-mode=debug_print`：DFX信息打印模式，打开kernel侧的模板内置关键节点的信息打印和异常退出时的打印功能。
 - `--run-mode=profiling`：Profiling性能采集模式，运行时打开profiling性能数据采集功能。
 - 未设置`--run-mode`：默认模式，正常上板，无kernel侧的dfx信息打印， 未开启profiling性能采集功能。
-## 3.2.1 DFX信息打印模式
+### 4.2.1 DFX信息打印模式
 通过运行run_examples.sh脚本时加上可选参数`--run-mode=debug_print`打开本功能。
-DFX信息打印格式按照 [日志级别(`ERROR`/`INFO`)]:[`ATVC`][`Module`](可选：[`CopyIn`/`CopyOut`等])的标准进行打印。
-- 日志级别： ERROR是异常打印信息，INFO是模板内部重要信息打印
-- `ATVC`： 标识是ATVC模板库内置的DFX信息打印
+DFX信息打印格式按照 [日志级别(`ERROR`/`INFO`)]:[`ATVC`][`Module`] (可选：[`CopyIn`/`CopyOut`等]) 的标准进行打印。
+- 日志级别： ERROR是异常打印信息，INFO是模板内部重要信息打印。
+- `ATVC`： 标识是ATVC模板库内置的DFX信息打印。
 - `Module`: 标识是哪个模块的信息打印，例如：`EleWise`、 `Reduce`、`Broadcast`、`Common`等模块。
 - 可选子模块： 用于部分`Module`涉及多个子模块，可选择增加子模块信息，细化DFX信息。
 模板内部提供的DFX信息打印接口定义及使用样例如下所示， 对于普通算子开发用户，无需关注该接口，只有需要修改或者扩展开发模板功能的场景，可使用该接口。
@@ -1226,7 +1357,7 @@ ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Input Count can not be 0!\n"
 ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n", curCoreStartCnt_ + offsetCnt_, calcCnt_);
 ```
 
-## 3.2.2 开启Profiling性能调优功能
+### 4.2.2 开启Profiling性能调优功能
 通过运行run_examples.sh脚本时加上可选参数`--run-mode=profiling`打开本功能。
 为了增加Profiling采集性能数据的稳定性，建议用户在开启profiling时，运行时重复多次调用kernel，可实现一次性采集多次上板的性能数据，消除抖动。
 ```cpp
@@ -1239,17 +1370,17 @@ TanhGrad<AddOpTraits><<<blockNum, nullptr, stream>>>(dyDevice, yDevice, zDevice,
 ```
 其中`ATVC_DEBUG_MODE`是`run-mode`在不同的模式下的内部宏定义的映射。`ATVC_DEBUG_MODE == 2`是`--run-mode=profiling`的内部映射，用户无需关注。
 
-## 3.3 Tiling超参调优
-### 3.3.1 ElementWise模板算子Tiling超参调优
-#### 3.3.1.1  ElementWise模板通用Tiling算法
-- 计算blockNum：计算blockNum = 总的元素量(totalCnt) / 单核数据量基线(singleCoreBaseLine), blockNum最小值为1， 最大值为平台提供的最大vectorCore值。
-- 计算达到UB上限的单核单输入元素个数值ubLimitCnt：UB上限内存大小 / 所有输入输出及temp单个元素的内存之和。
-- 计算tiledCnt：
-    - 计算每个核需要处理的数据元素量avgElePerBlock = totalCnt / blockNum；
-    - 根据avgElePerBlock所处的splitDataShape数据段，按照切分系数去切分基本块： tiledCnt = dataSplitFactor / dataSplitFactor
-    - tiledCnt调整： 不超上限ubLimitCnt， 不小于下限32，且最后的tiledCnt要做32元素对齐。
-#### 3.3.1.2 ElementWise TilingData定义
-ElementWise模板通用Tiling切分的数据结构为EleWiseTilingData，描述了核间切分和单核内切分的策略，其定义如下：
+## 4.3 Tiling超参调优
+### 4.3.1 Elementwise模板算子Tiling超参调优
+#### 4.3.1.1  Elementwise模板通用Tiling算法
+- 计算`blockNum`：计算`blockNum` = 总的元素量(`totalCnt`) / 单核数据量基线(`singleCoreBaseLine`), `blockNum`最小值为1， 最大值为平台提供的最大`vectorCore`值。
+- 计算达到UB上限的单核单输入元素个数值`ubLimitCnt`：UB上限内存大小 / 所有输入输出及temp单个元素的内存之和。
+- 计算`tiledCnt`：
+    - 计算每个核需要处理的数据元素量`avgElePerBlock = totalCnt / blockNum`。
+    - 根据`avgElePerBlock`所处的`splitDataShape`数据段，按照切分系数去切分基本块： `tiledCnt = dataSplitFactor / dataSplitFactor`。
+    - `tiledCnt`调整： 不超上限`ubLimitCnt`， 不小于下限32，且最后的`tiledCnt`要做32元素对齐。
+#### 4.3.1.2 Elementwise TilingData定义
+Elementwise模板通用Tiling切分的数据结构为EleWiseTilingData，描述了核间切分和单核内切分的策略，其定义如下：
 ```cpp
 namespace ATVC {
 struct EleWiseTilingData {
@@ -1262,8 +1393,8 @@ struct EleWiseTilingData {
 }
 ```
 
-#### 3.3.1.3 ElementWise Tiling超参调优
-当前提供的ElementWise模板内置通用Tiling可调超参如下所示：
+#### 4.3.1.3 Elementwise Tiling超参调优
+当前提供的Elementwise模板内置通用Tiling可调超参如下所示：
 | Tiling超参名     | 数据类型  |  参数说明   |  调节范围 |  默认值 |
 | ----------- | -------------- | ----------- | ----------- |---|
 | singleCoreBaseLine | uint32_t | 单核数据量基线 |  [256, 128 * 1024] | 512|
@@ -1314,11 +1445,11 @@ bool CalcEleWiseTiling(int32_t totalCnt, ATVC::EleWiseParam &param,
         return -1;
     };
 ```
-### 3.3.2 Reduce模板算子Tiling超参调优
-#### 3.3.2.1 Reduce Tiling通用算法
+### 4.3.2 Reduce模板算子Tiling超参调优
+#### 4.3.2.1 Reduce Tiling通用算法
 Reduce Tiling计算流程较为复杂，简化后的主要流程如下：
 ![](images/reduce_tiling.png)
-#### 3.3.2.2 Reduce TilingData定义
+#### 4.3.2.2 Reduce TilingData定义
 ATVC host 和kernel侧都会使用到的`ReduceTilingData`是Reduce的核间AR轴切分、单核内AR轴切分的策略，其定义如下：
 ```cpp
 namespace ATVC {
@@ -1342,7 +1473,7 @@ struct ReduceTilingData {
 };
 }
 ```
-#### 3.3.2.3 Reduce 超参调优
+#### 4.3.2.3 Reduce 超参调优
 可调参数如下所示：
 | Tiling超参名     | 数据类型  |  参数说明   |  调节范围 |  默认值 |
 | ----------- | -------------- | ----------- | ----------- |---|
@@ -1378,7 +1509,3 @@ bool CalcReduceTiling(std::vector<int64_t> inputShape,
 }
 ```
 其中，可选参数`hyperParam`在未传入用户自定义超参时，使用`ReduceTilingHyperParam`的默认值,若用户需要修改某个超参，可自定义`ReduceTilingHyperParam`后传入。
-
-# Copyright
-
-Copyright (c) 2025 Huawei Technologies Co., Ltd.
\ No newline at end of file
diff --git a/atvc/docs/03_code_organization.md b/atvc/docs/03_code_organization.md
index 753745a8..0d5c95ca 100644
--- a/atvc/docs/03_code_organization.md
+++ b/atvc/docs/03_code_organization.md
@@ -1,18 +1,15 @@
-# AscendC Vector算子模板库代码组织结构
+# ATVC代码组织结构
 这篇文档描述了昇腾算子vector算子模板库的代码仓结构，主要包含的内容如下：
-- include包含每层分层的代码头文件
-- examples包含基于模板库的vector算子编程代码示例
-- docs包含昇腾vector算子模板库的相关介绍文档
-## 1. include
-include目录下的头文件是按照如下的文件层级进行组织。
 ```
 include/
 ├── atvc.h                                      // Vector模板编程入口头文件
 ├── common                                      // 不同模板公用API和C++基本类的拓展模板目录
 ├── elewise                                     // Elementwise模板目录
 │   ├── common                                  // Elementwise的公共数据定义 
+│   ├── utils                                   // Elementwise模板辅助工具目录
 │   ├── elewise_op_template.h                   // Elementwise算子模板类
 │   └── elewise_host.h                          // Elementwise算子host侧API
+│   └── elewise_device.h                        // Elementwise算子device头文件集合
 ├── broadcast                                   // Broadcast模板目录
 │   ├── common                                  // Broadcast模板各层公用文件目录
 │   ├── utils                                   // Broadcast模板辅助工具目录
@@ -20,61 +17,12 @@ include/
 │   ├── broadcast_host.h                        // Broadcast算子host侧API
 │   ├── broadcast_op_template.h                 // Broadcast算子模板类
 │   └── broadcast_compute.h                     // Broadcast计算模板
+│   └── broadcast_device.h                      // Broadcast算子device头文件集合
 └── reduce                                      // Reduce模板目录
     ├── common                                  // Reduce模板各层公用文件目录
     ├── utils                                   // Reduce模板辅助工具目录
     ├── tiling                                  // Reduce模板host层目录
     ├── reduce_host.h                           // Reduce算子host侧API
     ├── reduce_op_template.h                    // Reduce算子模板类
-    └── reduce_sum.h                            // ReduceSum计算模板
-```
-## 2. examples
-examples文件夹下提供了算子代码样例，包含算子实现的源码文件和测试用例配置和执行脚本。
-```
-examples
-├── run_examples.sh                             // 执行脚本   
-├── add                                         // EleWise Add算子样例
-│   ├── README.md
-│   └── add.cpp
-├── add_with_scalar                             // EleWise + Scalar算子样例
-│   ├── README.md
-│   └── add_with_scalar.cpp
-├── add_with_broadcast                          // BroadcastTo + EleWise算子样例
-│   ├── README.md
-│   └── add_with_scalar.cpp
-├── broadcast_to                                // BroadcastTo算子样例
-│   ├── README.md
-│   └── broadcast_to.cpp
-├── reduce_sum                                  // ReduceSum算子样例
-│   ├── README.md
-│   └── reduce_sum.cpp
-├── sinh_custom                                 // SinhCustom算子样例
-│   ├── README.md
-│   └── sinh_custom.cpp
-├── tanh_grad                                   // TanhGrad算子样例
-│   ├── README.md
-│   └── tanh_grad.cpp
-├── ops_aclnn                                   // 单算子API调用样例
-|   ├── README.md
-│   ├── add
-│   └── reduce_sum
-├── ops_pytorch                                 // PyTorch框架调用样例
-|   ├── README.md
-|   ├── add
-|   └── reduce_sum
-└── common                                      // 算子样例公共接口
-```
-
-## 3. docs
-docs文件夹下包含项目的所有文档。
-```
-doc/
-├── 01_quick_start.md        // ATVC快速上手指南
-├── 02_developer_guide.md    // 开发指南
-├── 03_code_organization.md  // 目录结构说明
-└── images                   // 图片
-```
-
-# Copyright
-
-Copyright (c) 2025 Huawei Technologies Co., Ltd.
\ No newline at end of file
+    ├── reduce_sum.h                            // ReduceSum计算模板
+    └── reduce_device.h                         // Reduce算子device头文件集合
diff --git a/atvc/examples/README.md b/atvc/examples/README.md
new file mode 100644
index 00000000..8539c305
--- /dev/null
+++ b/atvc/examples/README.md
@@ -0,0 +1,12 @@
+# 样例介绍
+| 样例名                                                         | 描述                                                         |      模板           | 算子调用方式|
+| ------------------------------------------------------------ | ------------------------------------------------------------- | --------------- | ------------------ |
+| [add](./add/add.cpp) | 使用ATVC的Elementwise模板实现Add算子以及调用样例                                                    |     Elementwise             | Kernel直调 |
+| [sinh_custom](./sinh_custom/sinh_custom.cpp) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例                     |      Elementwise              | Kernel直调 |
+| [add_with_scalar](./add_with_scalar/add_with_scalar.cpp) | 输入带标量的自定义Elementwise类算子以及调用样例                 |       Elementwise             | Kernel直调 |
+| [reduce_sum](./reduce_sum/reduce_sum.cpp) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例                        |       Reduce              | Kernel直调 |
+| [broadcast_to](./broadcast_to/broadcast_to.cpp) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例             |       Broadcast             | Kernel直调 |
+| [tanh_grad](./tanh_grad/tanh_grad.cpp) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例                         |        Elementwise           | Kernel直调 |
+| [ops_aclnn](./ops_aclnn) | 使用ATVC基于自定义工程算子的实现以及调用样例                                                    |                       | 单算子API调用 |
+| [ops_pytorch](./ops_pytorch) | 使用ATVC开发自定义算子，并实现从[PyTorch](https://gitee.com/ascend/pytorch)框架调用的样     |                      | PyTorch框架调用 |
+| [add_with_broadcast](./add_with_broadcast) |使用ATVC的Elementwise和Broadcast组合模板实现Add算子以及调用样例                |       Broadcast              | Kernel直调 |
\ No newline at end of file
diff --git a/atvc/examples/add/README.md b/atvc/examples/add/README.md
index 532af72d..9701555a 100644
--- a/atvc/examples/add/README.md
+++ b/atvc/examples/add/README.md
@@ -2,17 +2,15 @@
 
 ## 概述
 
-本样例介绍了利用ATVC实现Add单算子并完成功能验证
+样例概述：本样例介绍了利用ATVC实现Add单算子并完成功能验证
+- 算子功能：add
+- 使用的ATVC模板：Elementwise
+- 调用方式：Kernel直调
 
 
 ## 样例支持产品型号：
-- Atlas A2训练系列产品
+- Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件
 
-## 目录结构
-
-| 文件名                                                         | 描述                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [add.cpp](./add.cpp) | Add算子代码实现以及调用样例               |
 
 ## 算子描述
 
@@ -35,13 +33,15 @@ Add算子规格：
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">AddCustom</td></tr>
 </table>
 
+## 目录结构
+
+| 文件名                                                         | 描述                                                         |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [add.cpp](./add.cpp) | Add算子代码实现以及调用样例               |
+
 ## 算子运行
 在ascendc-api-adv代码仓目录下执行：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh add
-...
-Generate golden data successfully.
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh add
 ```
\ No newline at end of file
diff --git a/atvc/examples/add_with_broadcast/README.md b/atvc/examples/add_with_broadcast/README.md
index cb3620c0..910e8d2f 100644
--- a/atvc/examples/add_with_broadcast/README.md
+++ b/atvc/examples/add_with_broadcast/README.md
@@ -2,17 +2,15 @@
 
 ## 概述
 
-本样例介绍了利用ATVC实现带广播的Add单算子并完成功能验证
+样例概述：本样例介绍了利用ATVC实现带广播的Add单算子并完成功能验证
+- 算子功能：add
+- 使用的ATVC模板：带后置Elementwise计算的Broadcast模板
+- 调用方式：Kernel直调
 
 
 ## 样例支持产品型号：
-- Atlas A2训练系列产品
+- Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件
 
-## 目录结构
-
-| 文件名                                                         | 描述                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [add_with_broadcast.cpp](./add_with_broadcast.cpp) | Add算子代码实现以及调用样例               |
 
 ## 算子描述
 
@@ -35,13 +33,18 @@ Add算子规格：
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">AddWithBroadcastCustom</td></tr>
 </table>
 
+## 目录结构
+
+| 文件名                                                         | 描述                                                         |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [add_with_broadcast.cpp](./add_with_broadcast.cpp) | Add算子代码实现以及调用样例               |
+| [add_with_broadcast.h](./add_with_broadcast.h) | Add算子代码实现头文件           |
+| [post_compute_add_of_broadcast.h](./post_compute_add_of_broadcast.h) | 后置Elementwise计算              |
+
+
 ## 算子运行
 在ascendc-api-adv代码仓目录下执行：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh add_with_broadcast
-...
-Generate golden data successfully.
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh add_with_broadcast
 ```
\ No newline at end of file
diff --git a/atvc/examples/add_with_scalar/README.md b/atvc/examples/add_with_scalar/README.md
index 9c516400..7b702cdf 100644
--- a/atvc/examples/add_with_scalar/README.md
+++ b/atvc/examples/add_with_scalar/README.md
@@ -2,17 +2,15 @@
 
 ## 概述
 
-本样例介绍了如何利用ATVC搭建标量参与计算的自定义算子并完成算子验证。
+样例概述：本样例介绍了如何利用ATVC搭建标量参与计算的自定义算子并完成算子验证。
+- 算子功能：标量参与计算的自定义算子
+- 使用的ATVC模板：Elementwise
+- 调用方式：Kernel直调
 
 
 ## 样例支持产品型号：
-- Atlas A2训练系列产品
+- Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件
 
-## 目录结构
-
-| 文件名                                                         | 描述                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [add_with_scalar.cpp](./add_with_scalar.cpp) | 自定义算子代码实现以及调用样例               |
 
 ## 算子描述
 
@@ -37,13 +35,16 @@
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">AddCustom</td></tr>
 </table>
 
+## 目录结构
+
+| 文件名                                                         | 描述                                                         |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [add_with_scalar.cpp](./add_with_scalar.cpp) | 自定义算子代码实现以及调用样例               |
+
+
 ## 算子运行
 在ascendc-api-adv代码仓目录下执行：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh add_with_scalar
-...
-Generate golden data successfully.
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh add_with_scalar
 ```
\ No newline at end of file
diff --git a/atvc/examples/broadcast_to/README.md b/atvc/examples/broadcast_to/README.md
index cfb7c0c8..48908472 100644
--- a/atvc/examples/broadcast_to/README.md
+++ b/atvc/examples/broadcast_to/README.md
@@ -2,17 +2,15 @@
 
 ## 概述
 
-本样例介绍了利用ATVC实现BroadcastTo单算子并完成功能验证
+样例概述：本样例介绍了利用ATVC实现BroadcastTo单算子并完成功能验证
+- 算子功能：对输入tensor的指定轴进行广播计算并输出结果
+- 使用的ATVC模板：Broadcast
+- 调用方式：Kernel直调
 
 
 ## 样例支持产品型号：
-- Atlas A2训练系列产品
+- Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件
 
-## 目录结构
-
-| 文件名                                                         | 描述                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [broadcast_to.cpp](./broadcast_to.cpp) | BroadcastTo算子代码实现以及调用样例               |
 
 ## 算子描述
 
@@ -34,13 +32,16 @@ BroadcastTo算子规格：
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">BroadcastCustom</td></tr>
 </table>
 
+## 目录结构
+
+| 文件名                                                         | 描述                                                         |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [broadcast_to.cpp](./broadcast_to.cpp) | BroadcastTo算子代码实现以及调用样例               |
+
+
 ## 算子运行
 在ascendc-api-adv代码仓目录下执行：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh broadcast_to
-...
-Generate golden data successfully.
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh broadcast_to
 ```
\ No newline at end of file
diff --git a/atvc/examples/ops_aclnn/README.md b/atvc/examples/ops_aclnn/README.md
index f317fdaa..73fbbacd 100644
--- a/atvc/examples/ops_aclnn/README.md
+++ b/atvc/examples/ops_aclnn/README.md
@@ -1,13 +1,29 @@
 ## 概述
 使用ATVC框架开发自定义算子，并实现单算子API调用的样例。
 
-## 算子开发样例
-|  目录名称                                                   |  功能描述                                               |
+## 样例介绍
+|  样例名称                                                   |  功能描述                                               |
 | ------------------------------------------------------------ | ---------------------------------------------------- |
 | [add](./add) |  使用ATVC框架开发自定义算子Add，并实现单算子API调用的样例。 |
 | [reduce_sum](./reduce_sum) | 使用ATVC框架开发自定义算子ReduceSum，并实现单算子API调用的样例。 |
 
-## 基于ATVC框架支持自定义算子
+## 目录结构介绍
+```
+ops_aclnn/
+├── add
+│   ├── AclNNInvocationNaive
+│   ├── AddCustom
+│   ├── AddCustom.json
+│   └── install.sh
+├── reduce_sum
+│   ├── AclNNInvocationNaive
+│   ├── ReduceSumCustom
+│   ├── install.sh
+│   └── ReduceSumCustom.json
+└── README.md
+```
+
+## 开发步骤
 ### 步骤1. 生成自定义工程基础目录及文件
   参考[msopgen](https://www.hiascend.com/document/detail/zh/mindstudio/81RC1/ODtools/Operatordevelopmenttools/atlasopdev_16_0021.html)创建算子工程的基础文件。 
   ```bash 
@@ -18,20 +34,20 @@
   生成目录结构如下：
   ```
     CustomOp
-    ├── build.sh         // 编译入口脚本
+    ├── build.sh                    // 编译入口脚本
     ├── cmake 
     │   ├── config.cmake
-    │   ├── util        // 算子工程编译所需脚本及公共编译文件存放目录
-    ├── CMakeLists.txt   // 算子工程的CMakeLists.txt
-    ├── CMakePresets.json // 编译配置项
-    ├── framework        // 算子插件实现文件目录，单算子模型文件的生成不依赖算子适配插件，无需关注
-    ├── op_host                      // Host侧实现文件
-    │   ├── add_custom_tiling.h    // 算子tiling定义文件
-    │   ├── add_custom.cpp         // 算子原型注册、shape推导、信息库、tiling实现等内容文件
+    │   ├── util                    // 算子工程编译所需脚本及公共编译文件存放目录
+    ├── CMakeLists.txt              // 算子工程的CMakeLists.txt
+    ├── CMakePresets.json           // 编译配置项
+    ├── framework                   // 算子插件实现文件目录，单算子模型文件的生成不依赖算子适配插件，无需关注
+    ├── op_host                     // Host侧实现文件
+    │   ├── add_custom_tiling.h     // 算子tiling定义文件
+    │   ├── add_custom.cpp          // 算子原型注册、shape推导、信息库、tiling实现等内容文件
     │   ├── CMakeLists.txt
     ├── op_kernel                   // Kernel侧实现文件
     │   ├── CMakeLists.txt   
-    │   ├── add_custom.cpp        // 算子代码实现文件 
+    │   ├── add_custom.cpp          // 算子代码实现文件 
     ├── scripts                     // 自定义算子工程打包相关脚本所在目录
   ```
 
@@ -233,8 +249,8 @@
 
   在算子工程目录下执行如下命令，进行算子工程编译。  
   ```bash
-  $ cd CustomOp
-  $ bash build.sh
+  cd CustomOp
+  bash build.sh
   ```
 脚本运行成功后，会在当前目录下创建CustomOp目录，编译完成后，会在CustomOp/build_out中，生成自定义算子安装包custom_opp_<target os>_<target architecture>.run，例如“custom_opp_ubuntu_x86_64.run”。
 
diff --git a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh
index ca741d02..b085a061 100644
--- a/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh
+++ b/atvc/examples/ops_aclnn/add/AclNNInvocationNaive/run.sh
@@ -7,7 +7,6 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
-set -e
 
 if [ -n "$ASCEND_INSTALL_PATH" ]; then
     _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
diff --git a/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/CMakeLists.txt b/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/CMakeLists.txt
index c58e5e11..178359e4 100644
--- a/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/CMakeLists.txt
+++ b/atvc/examples/ops_aclnn/add/AddCustom/op_kernel/CMakeLists.txt
@@ -8,5 +8,5 @@ if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
     add_ops_compile_options(ALL OPTIONS -g -O0)
 endif()
 
-add_ops_compile_options(ALL OPTIONS -w -I ${ATVC_PATH})
+add_ops_compile_options(ALL OPTIONS -g -O0 --cce-aicore-block-local-init -w -I ${ATVC_PATH})
 add_kernels_compile()
\ No newline at end of file
diff --git a/atvc/examples/ops_aclnn/add/README.md b/atvc/examples/ops_aclnn/add/README.md
index d2909e71..7574702d 100644
--- a/atvc/examples/ops_aclnn/add/README.md
+++ b/atvc/examples/ops_aclnn/add/README.md
@@ -60,7 +60,7 @@ CANN软件包中提供了工程创建工具msOpGen，AddCustom算子工程可通
   - 导入ATVC环境变量
     ```bash
     # 如果不导入，默认使用./atvc/include路径
-    $ export ATVC_PATH=${atvc}/include
+    export ATVC_PATH=${atvc}/include
     ```
   - 切换到msOpGen脚本install.sh所在目录
     ```bash
diff --git a/atvc/examples/ops_aclnn/add/install.sh b/atvc/examples/ops_aclnn/add/install.sh
index b6342d6c..2e41482d 100644
--- a/atvc/examples/ops_aclnn/add/install.sh
+++ b/atvc/examples/ops_aclnn/add/install.sh
@@ -7,7 +7,6 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
-set -e
 
 SHORT=v:,i:,
 LONG=soc-version:,install-path:,
diff --git a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh
index f320ed84..a950a390 100644
--- a/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh
+++ b/atvc/examples/ops_aclnn/reduce_sum/AclNNInvocationNaive/run.sh
@@ -7,7 +7,6 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
-set -e
 
 if [ -n "$ASCEND_INSTALL_PATH" ]; then
     _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
diff --git a/atvc/examples/ops_aclnn/reduce_sum/README.md b/atvc/examples/ops_aclnn/reduce_sum/README.md
index 04a21d49..01c440d5 100644
--- a/atvc/examples/ops_aclnn/reduce_sum/README.md
+++ b/atvc/examples/ops_aclnn/reduce_sum/README.md
@@ -57,12 +57,12 @@ CANN软件包中提供了工程创建工具msOpGen，ReduceSumCustom算子工程
   - 导入ATVC环境变量
     ```bash
     # 如果不导入，默认使用./atvc/include路径
-    $ export ATVC_PATH=${atvc}/include
+    export ATVC_PATH=${atvc}/include
     ```
   - 切换到msOpGen脚本install.sh所在目录
     ```bash
     # 若开发者以git命令行方式clone了master分支代码，并切换目录
-    cd ./atvc/examples/ops_aclnn/reduce_sum
+    cd ./ops_templates/atvc/examples/ops_aclnn/reduce_sum
     ```
 
   - 调用脚本，生成自定义算子工程，复制host和kernel实现并编译算子
diff --git a/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt b/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt
index 76172559..5ed61571 100644
--- a/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt
+++ b/atvc/examples/ops_aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt
@@ -8,5 +8,5 @@ if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
     add_ops_compile_options(ALL OPTIONS -g -O0)
 endif()
 
-add_ops_compile_options(ALL OPTIONS -w -I ${ATVC_PATH})
+add_ops_compile_options(ALL OPTIONS --cce-aicore-block-local-init -w -I ${ATVC_PATH})
 add_kernels_compile()
\ No newline at end of file
diff --git a/atvc/examples/ops_aclnn/reduce_sum/install.sh b/atvc/examples/ops_aclnn/reduce_sum/install.sh
index c9bf32e5..a3a01bc8 100644
--- a/atvc/examples/ops_aclnn/reduce_sum/install.sh
+++ b/atvc/examples/ops_aclnn/reduce_sum/install.sh
@@ -7,7 +7,6 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
-set -e
 
 SHORT=v:,i:,
 LONG=soc-version:,install-path:,
diff --git a/atvc/examples/ops_pytorch/README.md b/atvc/examples/ops_pytorch/README.md
index 20e1299d..83022bd0 100644
--- a/atvc/examples/ops_pytorch/README.md
+++ b/atvc/examples/ops_pytorch/README.md
@@ -1,13 +1,29 @@
 ## 概述
 使用ATVC开发自定义算子，并实现从PyTorch框架调用的样例。
 
-## 算子开发样例
-|  目录名称                                                   |  功能描述                                              |
+## 样例介绍
+|  样例名称                                                   |  功能描述                                              |
 | ------------------------------------------------------------ | ---------------------------------------------------- |
 | [add](./add) | 基于ATVC框架的Add自定义Vector算子 |
 | [reduce_sum](./reduce_sum) | 基于ATVC框架的reduce_sum自定义Vector算子 | 
 
-## 基于PyTorch算子对接ATVC框架
+## 目录结构介绍
+```
+ops_pytorch/
+├── add
+│   ├── add_custom_impl.h
+│   ├── pytorch_ascendc_extension.cpp
+│   ├── run_op.py
+│   └── run.sh
+├── reduce_sum
+│   ├── pytorch_ascendc_extension.cpp
+│   ├── reduce_sum_impl.h
+│   ├── run_op.py
+│   └── run.sh
+└── README.md
+```
+
+## 开发步骤
 
   不同的算子类型可参考[快速入门](../../docs/01_quick_start.md)中的模版选择模版进行选择，用户在此处通过`<<<>>>`的方式调用核函数，更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。
   
diff --git a/atvc/examples/ops_pytorch/add/README.md b/atvc/examples/ops_pytorch/add/README.md
index a410bbb7..de80b8e3 100644
--- a/atvc/examples/ops_pytorch/add/README.md
+++ b/atvc/examples/ops_pytorch/add/README.md
@@ -133,12 +133,12 @@ z = x + y
   - 导入ATVC环境变量
     ```bash
     # 如果不导入，默认使用./atvc/include路径
-    $ export ATVC_PATH=${atvc}/include
+    export ATVC_PATH=${atvc}/include
     ```
   - 调用脚本，生成PyTorch算子，并运行测试用例 
       ```bash
-      $ cd ./atvc/examples/ops_pytorch/add
-      $ bash run.sh
+      cd ./ops_templates/atvc/examples/ops_pytorch/add
+      bash run.sh
       ...
       OK
       ```    
diff --git a/atvc/examples/ops_pytorch/add/run.sh b/atvc/examples/ops_pytorch/add/run.sh
index f08ce377..96932ed3 100644
--- a/atvc/examples/ops_pytorch/add/run.sh
+++ b/atvc/examples/ops_pytorch/add/run.sh
@@ -1,13 +1,12 @@
 #!/bin/bash
-# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
 # This file is a part of the CANN Open Software.
-# Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 # Please refer to the License for details. You may not use this file except in compliance with the License.
 # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
-set -e
 
 torch_location=$(python3 -c "import torch; print(torch.__path__[0])")
 torch_npu_location=$(python3 -c "import torch_npu; print(torch_npu.__path__[0])")
@@ -26,20 +25,8 @@ rm -rf *.json
 rm -rf libascendc_pytorch.so
 
 
-if [ -n "$ASCEND_INSTALL_PATH" ]; then
-    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
-elif [ -n "$ASCEND_HOME_PATH" ]; then
-    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
-else
-    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
-        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
-    else
-        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
-    fi
-fi
-
-bisheng -x cce pytorch_ascendc_extension.cpp \
-    -D_GLIBCXX_USE_CXX11_ABI=0  \
+bishengcc pytorch_ascendc_extension.cpp \
+    -arch Ascend910B1   \
     -I${torch_location}/include   \
     -I${torch_location}/include/torch/csrc/api/include   \
     -I${python_include}   \
@@ -51,8 +38,8 @@ bisheng -x cce pytorch_ascendc_extension.cpp \
     -L${lib_path}   \
     -L${_ASCEND_INSTALL_PATH}/lib64 \
     -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python   \
-    -shared -cce-enable-plugin --cce-aicore-arch=dav-c220 -fPIC -ltiling_api -lplatform -lm -ldl  \
-    -o libascendc_pytorch.so
+    -o libascendc_pytorch.so   \
+    -shared
 
 python3 run_op.py
 
diff --git a/atvc/examples/ops_pytorch/reduce_sum/README.md b/atvc/examples/ops_pytorch/reduce_sum/README.md
index c392188b..5123ab7b 100644
--- a/atvc/examples/ops_pytorch/reduce_sum/README.md
+++ b/atvc/examples/ops_pytorch/reduce_sum/README.md
@@ -161,12 +161,12 @@ ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结
   - 导入ATVC环境变量
     ```bash
     # 如果不导入，默认使用./atvc/include路径
-    $ export ATVC_PATH=${atvc}/include
+    export ATVC_PATH=${atvc}/include
     ```
   - 调用脚本，生成PyTorch算子，并运行测试用例 
     ```bash
-    $ cd ./atvc/examples/ops_pytorch/reduce_sum
-    $ bash run.sh
+    cd ./ops_templates/atvc/examples/ops_pytorch/reduce_sum
+    bash run.sh
     ...
     OK
     ```    
diff --git a/atvc/examples/ops_pytorch/reduce_sum/run.sh b/atvc/examples/ops_pytorch/reduce_sum/run.sh
index d0d27305..96932ed3 100644
--- a/atvc/examples/ops_pytorch/reduce_sum/run.sh
+++ b/atvc/examples/ops_pytorch/reduce_sum/run.sh
@@ -1,13 +1,12 @@
 #!/bin/bash
 # Copyright (c) 2025 Huawei Technologies Co., Ltd.
 # This file is a part of the CANN Open Software.
-# Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 # Please refer to the License for details. You may not use this file except in compliance with the License.
 # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
-set -e
 
 torch_location=$(python3 -c "import torch; print(torch.__path__[0])")
 torch_npu_location=$(python3 -c "import torch_npu; print(torch_npu.__path__[0])")
@@ -26,20 +25,8 @@ rm -rf *.json
 rm -rf libascendc_pytorch.so
 
 
-if [ -n "$ASCEND_INSTALL_PATH" ]; then
-    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
-elif [ -n "$ASCEND_HOME_PATH" ]; then
-    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
-else
-    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
-        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
-    else
-        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
-    fi
-fi
-
-bisheng -x cce pytorch_ascendc_extension.cpp \
-    -D_GLIBCXX_USE_CXX11_ABI=0  \
+bishengcc pytorch_ascendc_extension.cpp \
+    -arch Ascend910B1   \
     -I${torch_location}/include   \
     -I${torch_location}/include/torch/csrc/api/include   \
     -I${python_include}   \
@@ -51,8 +38,8 @@ bisheng -x cce pytorch_ascendc_extension.cpp \
     -L${lib_path}   \
     -L${_ASCEND_INSTALL_PATH}/lib64 \
     -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python   \
-    -shared -cce-enable-plugin --cce-aicore-arch=dav-c220 -fPIC -ltiling_api -lplatform -lm -ldl  \
-    -o libascendc_pytorch.so
+    -o libascendc_pytorch.so   \
+    -shared
 
 python3 run_op.py
 
diff --git a/atvc/examples/reduce_sum/README.md b/atvc/examples/reduce_sum/README.md
index ca4cc755..aafd5ebf 100644
--- a/atvc/examples/reduce_sum/README.md
+++ b/atvc/examples/reduce_sum/README.md
@@ -2,17 +2,15 @@
 
 ## 概述
 
-本样例介绍了利用ATVC实现ReduceSum单算子并完成功能验证
+样例概述：本样例介绍了利用ATVC实现ReduceSum单算子并完成功能验证
+- 算子功能：对输入tensor的指定轴进行规约累加的计算并输出结果
+- 使用的ATVC模板：Reduce
+- 调用方式：Kernel直调
 
 
 ## 样例支持产品型号：
-- Atlas A2训练系列产品
+- Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件
 
-## 目录结构
-
-| 文件名                                                         | 描述                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [reduce_sum.cpp](./reduce_sum.cpp) | ReduceSum算子代码实现以及调用样例               |
 
 ## 算子描述
 
@@ -34,13 +32,16 @@ ReduceSum算子规格：
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">ReduceCustom</td></tr>
 </table>
 
+## 目录结构
+
+| 文件名                                                         | 描述                                                         |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [reduce_sum.cpp](./reduce_sum.cpp) | ReduceSum算子代码实现以及调用样例               |
+
+
 ## 算子运行
 在ascendc-api-adv代码仓目录下执行：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh reduce_sum
-...
-Generate golden data successfully.
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh reduce_sum
 ```
\ No newline at end of file
diff --git a/atvc/examples/run_examples.sh b/atvc/examples/run_examples.sh
index 4cd36033..e5a6282f 100644
--- a/atvc/examples/run_examples.sh
+++ b/atvc/examples/run_examples.sh
@@ -1,29 +1,29 @@
 #!/bin/bash
-# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
 # This file is a part of the CANN Open Software.
-# Licensed under CANN Open Software License Agreement Version 2.0 (the "License").
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
 # Please refer to the License for details. You may not use this file except in compliance with the License.
 # THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
 
-set -e
-
 CURRENT_DIR=$(
     cd $(dirname ${BASH_SOURCE:-$0})
     pwd
 )
 
-if command -v bisheng; then
-    COMPILE_TOOL=bisheng
+if command -v bishengcc; then
+    COMPILE_TOOL=bishengcc
+elif command -v ascc; then
+    COMPILE_TOOL=ascc
 else
-    echo "Error: Cannot find bisheng compiling tool, please check cann package version or set up envrionment first."
+    echo "Error: Cannot find bishengcc/ascc compiling tool, please check cann package version or set up envrionment first."
     exit 1
 fi
 
 ATVC_HOME_DIR=$CURRENT_DIR/../
-TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples | grep -v '^run_examples.sh$' | grep -v '^ops_*' | grep -v '^common*' | xargs)
+TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples | grep -v '^run_examples.sh$' | grep -v '^ops_*' | xargs)
 if [ $# -lt 1 ]; then
     echo "This script requires an input as the test case name. Execution example: 'bash run_examples.sh [$TEST_CASE_LIST]'"
     exit 1
@@ -43,27 +43,16 @@ function parse_run_mode(){
 
 # 根据不同run-mode执行不同的操作
 function compile_operator(){
-    if [ -n "$ASCEND_INSTALL_PATH" ]; then
-        _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
-    elif [ -n "$ASCEND_HOME_PATH" ]; then
-        _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
-    else
-        if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
-            _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
-        else
-            _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
-        fi
-    fi
     cd $ATVC_HOME_DIR/examples/$TEST_NAME
     if [ -z "$RUN_MODE" ]; then
         echo "Executing with npu mode"
-        ${COMPILE_TOOL} -x cce -cce-enable-plugin --cce-aicore-arch=dav-c220 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -ltiling_api -lplatform -lm -ldl -L${_ASCEND_INSTALL_PATH}/lib64
+        ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common
     elif [ "$RUN_MODE" = "debug_print" ]; then
         echo "Executing with debug_print mode"
-        ${COMPILE_TOOL} -x cce -cce-enable-plugin --cce-aicore-arch=dav-c220 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common  -ltiling_api -lplatform -lm -ldl -L${_ASCEND_INSTALL_PATH}/lib64 -DATVC_DEBUG_MODE=1
+        ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -DATVC_DEBUG_MODE=1
     elif [ "$RUN_MODE" = "profiling" ]; then
         echo "Executing with profiling mode"
-        ${COMPILE_TOOL} -x cce -cce-enable-plugin --cce-aicore-arch=dav-c220 -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common  -ltiling_api -lplatform -lm -ldl -L${_ASCEND_INSTALL_PATH}/lib64 -DATVC_DEBUG_MODE=2
+        ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -DATVC_DEBUG_MODE=2
     else
         echo "--npu-mode is an optional parameter and can be left unset. If set, the value must be debug_print or profiling."
         echo "Execution example: 'bash run_examples.sh $TEST_NAME --run-mode=debug_print'"
diff --git a/atvc/examples/sinh_custom/README.md b/atvc/examples/sinh_custom/README.md
index 4c007f46..a1e979b3 100644
--- a/atvc/examples/sinh_custom/README.md
+++ b/atvc/examples/sinh_custom/README.md
@@ -2,17 +2,15 @@
 
 ## 概述
 
-本样例介绍了如何利用ATVC实现临时Tensor参与计算的SinhCustom单算子并完成算子验证
+样例概述：本样例介绍了如何利用ATVC实现临时Tensor参与计算的SinhCustom单算子并完成算子验证
+- 算子功能：sinh
+- 使用的ATVC模板：Elementwise
+- 调用方式：Kernel直调
 
 
 ## 样例支持产品型号：
-- Atlas A2训练系列产品
+- Atlas A2训练系列产品/Atlas 800I A2推理产品/A200I A2 Box 异构组件
 
-## 目录结构
-
-| 文件名                                                         | 描述                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [sinh_custom.cpp](./sinh_custom.cpp) | SinhCustom算子代码实现以及调用样例               |
 
 ## 算子描述
 
@@ -34,13 +32,16 @@ SinhCustom算子规格：
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">SinhCustom</td></tr>
 </table>
 
+## 目录结构
+
+| 文件名                                                         | 描述                                                         |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [sinh_custom.cpp](./sinh_custom.cpp) | SinhCustom算子代码实现以及调用样例               |
+
+
 ## 算子运行
 在ascendc-api-adv代码仓目录下执行：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh sinh_custom
-...
-Generate golden data successfully.
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh sinh_custom
 ```
\ No newline at end of file
diff --git a/atvc/examples/tanh_grad/README.md b/atvc/examples/tanh_grad/README.md
index 147de3cf..f7d26e98 100644
--- a/atvc/examples/tanh_grad/README.md
+++ b/atvc/examples/tanh_grad/README.md
@@ -2,15 +2,12 @@
 
 ## 概述
 
-本样例介绍了利用ATVC实现Tanh单算子并验证了调试调优相关功能验证。
+样例概述：本样例介绍了利用ATVC实现Tanh单算子并验证了调试调优相关功能验证。
+- 算子功能：tanh
+- 使用的ATVC模板：Elementwise
+- 调用方式：Kernel直调
 
 
-## 目录结构
-
-| 文件名                                                         | 描述                                                         |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [tanh_grad.cpp](./tanh_grad.cpp) | Tanh算子代码实现以及调用样例               |
-
 ## 算子描述
 
 Tanh算子数学计算公式：$z = dy * (1 - y ^ 2)$
@@ -32,13 +29,18 @@ Tanh算子规格：
 <tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">TanhGrad</td></tr>
 </table>
 
+## 目录结构
+
+| 文件名                                                         | 描述                                                         |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [tanh_grad.cpp](./tanh_grad.cpp) | Tanh算子代码实现以及调用样例               |
+
+
 ## 算子基本功能验证
 执行命令如下：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh tanh_grad
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh tanh_grad
 ```
 
 ## 算子调试调优
@@ -47,30 +49,15 @@ Accuracy verification passed.
 - 使用`--run-mode=debug_print`进行DFX信息打印：
 执行命令如下：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh tanh_grad --run-mode=debug_print
-...
-[INFO]:[ATVC][EleWise]Start to run Template Fuction.
-...
-[INFO]:[ATVC][EleWise] Tiling data: blockNum = 8
-...
-[INFO]:[ATVC][EleWise][CopyIn]: Offset is 7168, copy count is 256.
-...
-[INFO]:[ATVC][EleWise]End to run Template Fuction.
-...
-Accuracy verification passed.
+cd ./ops_templates/atvc/examples
+bash run_examples.sh tanh_grad --run-mode=debug_print
 ```
 
 - 使用`--run-mode=profiling`开启Profiling，获取性能数据：
 执行命令如下：
 ```bash
-$ cd ./atvc/examples
-$ bash run_examples.sh tanh_grad --run-mode=profiling
-...
-[INFO] Start Profiling ...
-...
-[INFO] Process profiling data complete, Data is saved in /xxx_path
-...
+cd ./ops_templates/atvc/examples
+bash run_examples.sh tanh_grad --run-mode=profiling
 ```
 
 更多详细的调试调优介绍参考[ATVC开发指南](../../docs/02_developer_guide.md)的`ATVC的调试调优功能`章节
\ No newline at end of file
diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h
index 01ba82e9..7a4a08b2 100644
--- a/atvc/include/broadcast/broadcast_op_template.h
+++ b/atvc/include/broadcast/broadcast_op_template.h
@@ -82,7 +82,7 @@ public:
     template<typename ...Args>
     __aicore__ inline void Run(Args&&... args)
     {
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Start to run template function.\n");
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Start to run template function.\n");
         constexpr size_t PRE_ARGS_COUNT = HAS_PRE_COMPUTE ? PreInputCount + PreOutputCount - BroadcastInputCount : 0;
         constexpr size_t BROADCAST_ARGS_COUNT = BroadcastInputCount + BroadcastOutputCount - HAS_PRE_COMPUTE - HAS_POST_COMPUTE;
         constexpr size_t POST_ARGS_COUNT = HAS_POST_COMPUTE ? PostInputCount + PostOutputCount - BroadcastOutputCount : 0;
@@ -99,8 +99,7 @@ public:
             return;
         }
         this->Process();
-        pipeIn.Destroy();
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] End to run template function.\n");
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Template function execution completed.\n");
     }
 
     /*!
@@ -163,10 +162,11 @@ private:
         if (HAS_POST_COMPUTE) {
             outputCount_ = PostInputCount + PostTempCount + PostOutputCount;
         }        
-        bufPool_.template Init<DataType>(inputCount_,  // The number of inputs required for double buffer
-                                         outputCount_,  // The number of calculation results is generally consistent with inputNum
-                                         tilingData_->A2 * tilingData_->A12 * DATA_SIZE,  // Input Tensor size
-                                         tilingData_->A2 * tilingData_->B2 * DATA_SIZE);  // Output Tensor Size
+        bufPool_.template Init<DataType>(GetTPipePtr(),
+                                    inputCount_, // The number of inputs required for double buffer
+                                    outputCount_, // The number of calculation results is generally consistent with inputNum
+                                    tilingData_->A2 * tilingData_->A12 * DATA_SIZE, // Input Tensor size
+                                    tilingData_->A2 * tilingData_->B2 * DATA_SIZE); // Output Tensor Size
     }
 
     template <class... Args>
@@ -427,7 +427,6 @@ private:
 
     GM_ADDR src_;
     GM_ADDR dst_;
-    AscendC::TPipe pipeIn;
     AscendC::GlobalTensor<DataType> srcGlobal_;
     AscendC::GlobalTensor<DataType> dstGlobal_;
     BroadcastCompute compute_;
diff --git a/atvc/include/broadcast/tiling/broadcast_tiling.h b/atvc/include/broadcast/tiling/broadcast_tiling.h
index e73dca97..0dd827cb 100644
--- a/atvc/include/broadcast/tiling/broadcast_tiling.h
+++ b/atvc/include/broadcast/tiling/broadcast_tiling.h
@@ -137,16 +137,16 @@ private:
         size_t sizeIn = shapeIn.size();
         size_t sizeOut = shapeOut.size();
         if (sizeOut != sizeIn) {
-            printf("input dim in is not equal to output dim!\n");
+            printf("[ERROR]: [ATVC][Broadcast] input dim in is not equal to output dim!\n");
             return false;
         };
 
         for (size_t i = 0; i < sizeIn; i++) {
             if (shapeOut[i] != shapeIn[i] && shapeIn[i] != 1) {
-                printf("Input shape in broadcast dim should be 1\n");
+                printf("[ERROR]: [ATVC][Broadcast] Input shape in broadcast dim should be 1\n");
                 return false;
             } else if (shapeIn[i] <= 0) {
-                printf("Input and output shape should be more than 0\n");
+                printf("[ERROR]: [ATVC][Broadcast] Input and output shape should be more than 0\n");
                 return false;
             }
         }
diff --git a/atvc/include/broadcast/utils/broadcast_buf_pool.h b/atvc/include/broadcast/utils/broadcast_buf_pool.h
index 9f2b5a31..02161524 100644
--- a/atvc/include/broadcast/utils/broadcast_buf_pool.h
+++ b/atvc/include/broadcast/utils/broadcast_buf_pool.h
@@ -36,7 +36,8 @@ public:
     __aicore__ inline BroadcastBufPool() {};
 
     template <typename T>
-    __aicore__ inline void Init(int32_t inputNum,    // The number of inputs required for doublebuff
+    __aicore__ inline void Init(AscendC::TPipe *pipeIn,
+        int32_t inputNum,    // The number of inputs required for doublebuff
         int32_t computeNum,  // The number of calculation results is generally consistent with inputNum
         int32_t inBlockLen,  // Basic input block size for one calculation
         int32_t outBlockLen)
@@ -51,6 +52,7 @@ public:
             inputNum *= ATVC::CONST2;
             computeNum *= ATVC::CONST2;
         }
+        pipe_ = pipeIn;
         constexpr int32_t eleSize = static_cast<int32_t>(sizeof(T));
         inputNum_ = inBlockLen / eleSize;
         outputNum_ = outBlockLen / eleSize;
@@ -62,7 +64,7 @@ public:
         computeUnit_.eleSize = eleSize;
         computeUnit_.offset = inBlockLen * inputNum;
         // Init buffer
-        GetTPipePtr()->InitBuffer(qQue_, poolSize);
+        pipe_->InitBuffer(qQue_, poolSize);
     }
 
     template <bool IsInput, typename T>
@@ -90,7 +92,7 @@ public:
     __aicore__ inline const void SetVecSync(AscendC::LocalTensor<T> &tensor)
     {
         uint32_t idx = GetInputTensorIdx<T>(tensor);
-        event_t eventId = static_cast<event_t>(GetTPipePtr()->AllocEventID<EVENT>());
+        event_t eventId = static_cast<event_t>(pipe_->AllocEventID<EVENT>());
         vecEventId_[idx] = eventId;
         AscendC::SetFlag<EVENT>(eventId);
     }
@@ -100,14 +102,14 @@ public:
     {
         uint32_t idx = GetInputTensorIdx<T>(tensor);
         AscendC::WaitFlag<EVENT>(vecEventId_[idx]);
-        GetTPipePtr()->ReleaseEventID<EVENT>(vecEventId_[idx]);
+        pipe_->ReleaseEventID<EVENT>(vecEventId_[idx]);
     }
 
     template <typename T, AscendC::HardEvent EVENT>
     __aicore__ inline const void SetCopyOutSync(AscendC::LocalTensor<T> &tensor)
     {
         uint32_t idx = GetOutputTensorIdx<T>(tensor);
-        event_t eventId = static_cast<event_t>(GetTPipePtr()->AllocEventID<EVENT>());
+        event_t eventId = static_cast<event_t>(pipe_->AllocEventID<EVENT>());
         outEventId_[idx] = eventId;
         AscendC::SetFlag<EVENT>(eventId);
     }
@@ -117,7 +119,7 @@ public:
     {
         uint32_t idx = GetOutputTensorIdx<T>(tensor);
         AscendC::WaitFlag<EVENT>(outEventId_[idx]);
-        GetTPipePtr()->ReleaseEventID<EVENT>(outEventId_[idx]);
+        pipe_->ReleaseEventID<EVENT>(outEventId_[idx]);
     }
 
     template <typename T>
@@ -140,7 +142,7 @@ public:
 
     __aicore__ inline const void ResetEvent()
     {
-        GetTPipePtr()->Reset();
+        pipe_->Reset();
     }
 
 private:
@@ -174,6 +176,7 @@ private:
     event_t outEventId_[MAX_INPUT_SIZE];
     bool isBusyOut_[MAX_INPUT_SIZE] = {false};
     AscendC::TBuf<> qQue_;
+    AscendC::TPipe *pipe_;
     int32_t inputNum_;
     int32_t outputNum_;
 };
diff --git a/atvc/include/broadcast/utils/broadcast_util.h b/atvc/include/broadcast/utils/broadcast_util.h
index 24cd745a..e220c3c1 100644
--- a/atvc/include/broadcast/utils/broadcast_util.h
+++ b/atvc/include/broadcast/utils/broadcast_util.h
@@ -24,32 +24,32 @@ namespace KernelUtils {
 template <typename T, const auto& SelectBroadcastPolicy>
 __aicore__ inline void PrintParam(const T* param)
 {
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: A0 = %lu\n", param->tilingData.A0);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: A11 = %lu\n", param->tilingData.A11);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: A12 = %lu\n", param->tilingData.A12);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: A2 = %lu\n", param->tilingData.A2);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: B0 = %lu\n", param->tilingData.B0);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: B1 = %lu\n", param->tilingData.B1);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: B2 = %lu\n", param->tilingData.B2);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: coreNum = %d\n", param->tilingData.coreNum);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: basicBlock = %lu\n", param->tilingData.basicBlock);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: factorACntPerCore = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: A0 = %lu\n", param->tilingData.A0);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: A11 = %lu\n", param->tilingData.A11);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: A12 = %lu\n", param->tilingData.A12);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: A2 = %lu\n", param->tilingData.A2);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: B0 = %lu\n", param->tilingData.B0);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: B1 = %lu\n", param->tilingData.B1);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: B2 = %lu\n", param->tilingData.B2);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: coreNum = %d\n", param->tilingData.coreNum);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: basicBlock = %lu\n", param->tilingData.basicBlock);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: factorACntPerCore = %lu\n",
                               param->tilingData.factorACntPerCore);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: factorATotalCnt = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: factorATotalCnt = %lu\n",
                               param->tilingData.factorATotalCnt);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: factorBCntPerCore = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: factorBCntPerCore = %lu\n",
                               param->tilingData.factorBCntPerCore);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: factorBTotalCnt = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: factorBTotalCnt = %lu\n",
                               param->tilingData.factorBTotalCnt);
     for (int32_t i = 0; i < ATVC::CONST2; i++) {
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: shape[%d] = %lu\n",
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: shape[%d] = %lu\n",
                                   i, param->tilingData.shape[i]);
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: dstShape[%d] = %lu\n",
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: dstShape[%d] = %lu\n",
                                   i, param->tilingData.dstShape[i]);
     }
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: policy.patternID = %d\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: policy.patternID = %d\n",
                               SelectBroadcastPolicy.patternID);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast] Tiling data: workspaceSize = %u\n", param->workspaceSize);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast] Tiling data: workspaceSize = %u\n", param->workspaceSize);
     return;
 }
 
diff --git a/atvc/include/common/atvc_op_check.h b/atvc/include/common/atvc_op_check.h
index 095e6b15..b02778cf 100644
--- a/atvc/include/common/atvc_op_check.h
+++ b/atvc/include/common/atvc_op_check.h
@@ -63,8 +63,7 @@ bool DebugCheck() {
     if constexpr (templateType == ATVC::TemplateType::REDUCE ||
                   templateType == ATVC::TemplateType::BROADCAST) {
         if (!CheckSameDtype_<Inputs, Outputs>()) {
-            printf("[ERROR]: [ATVC][OpTraits] Different input/output data types is not support "
-                "in Reduce or Broadcast template.\n");
+            printf("[ERROR]: [ATVC][OpTraits] Different input/output data types is not support in Reduce or Broadcast template.\n");
             return false;
         }
     }
diff --git a/atvc/include/common/compile_info.h b/atvc/include/common/compile_info.h
index 431531ac..2ffa1504 100644
--- a/atvc/include/common/compile_info.h
+++ b/atvc/include/common/compile_info.h
@@ -33,7 +33,7 @@ inline OpCompileInfo GetOpCompileInfo()
 {
     const auto& platformInfoMgr = platform_ascendc::PlatformAscendCManager::GetInstance();
     if (platformInfoMgr == nullptr) {
-        printf("[ERROR] failed to retrieve platform infomation.\n");
+        printf("[ERROR]: [ATVC][Common] Failed to retrieve platform infomation.\n");
         return {0, 0, 0, 0};
     }
     auto soc = platformInfoMgr->GetSocVersion();
diff --git a/atvc/include/common/kernel_utils.h b/atvc/include/common/kernel_utils.h
index be910f66..6aa2d977 100644
--- a/atvc/include/common/kernel_utils.h
+++ b/atvc/include/common/kernel_utils.h
@@ -15,6 +15,11 @@
 #include "common/const_def.h"
 #include "kernel_operator.h"
 namespace ATVC {
+#ifndef __ASCC_HOST__
+#ifndef __NPU_HOST__
+__BLOCK_LOCAL__ static AscendC::TPipe g_pipe;
+#endif // __NPU_HOST__
+#endif // __ASCC_HOST__
 template <AscendC::HardEvent EVENT>
 __aicore__ inline void SetEvent(AscendC::HardEvent evt)
 {
diff --git a/atvc/include/elewise/elewise_op_template.h b/atvc/include/elewise/elewise_op_template.h
index 819caac1..964bbd6f 100644
--- a/atvc/include/elewise/elewise_op_template.h
+++ b/atvc/include/elewise/elewise_op_template.h
@@ -60,12 +60,12 @@ public:
     template <typename... Args>
     __aicore__ inline void Run(Args&&... args)
     {
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Start to run template function.\n");
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Start to run template function.\n");
+        g_pipe.Reset();
         constexpr std::size_t GM_ARGS_COUNT = INPUT_COUNT + OUTPUT_COUNT;
         GM_ADDR argsArr[INPUT_COUNT + OUTPUT_COUNT];
         InitHelper<0>(argsArr, ATVC::Forward<Args>(args)...);
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] End to run template function.\n");
-        pipeIn.Destroy();
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Template function execution completed.\n");
     }
 
 private:
@@ -108,9 +108,9 @@ private:
                 return;
             }
             Init();
-            ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] End to initialize template.\n");
+            ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Initialize template execution completed.\n");
             Process(ATVC::Forward<Ts>(ts)...);
-            ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] End main process.\n");
+            ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Main process execution completed.\n");
         }
     }
 
@@ -130,13 +130,13 @@ private:
         // Each in/out/temp uses a pipe for management,
         // and each pipe manages multiple tensors with consecutive sub addresses
         if constexpr (INPUT_COUNT > 0) {
-            GetTPipePtr()->InitBuffer(inQueue, param_->nBufferNum, param_->tilingData.tiledCnt * IN_TENSOR_SUM_BYTES);
+            g_pipe.InitBuffer(inQueue, param_->nBufferNum, param_->tilingData.tiledCnt * IN_TENSOR_SUM_BYTES);
         }
         if constexpr (OUTPUT_COUNT > 0) {
-            GetTPipePtr()->InitBuffer(outQueue, param_->nBufferNum, param_->tilingData.tiledCnt * OUT_TENSOR_SUM_BYTES);
+            g_pipe.InitBuffer(outQueue, param_->nBufferNum, param_->tilingData.tiledCnt * OUT_TENSOR_SUM_BYTES);
         }
         if constexpr(TEMP_COUNT > 0) {
-            GetTPipePtr()->InitBuffer(tempQueue, param_->tilingData.tiledCnt * TEMP_TENSOR_SUM_BYTES);
+            g_pipe.InitBuffer(tempQueue, param_->tilingData.tiledCnt * TEMP_TENSOR_SUM_BYTES);
         }
     }
     // Call CopyIn/CopyOut based on the tiling loop, as well as externally passed Compute calculations
@@ -187,7 +187,7 @@ private:
         constexpr uint32_t typeAlignCnt = UB_ALIGN_32 / sizeof(DataType);
         uint32_t alignMainCnt = caclCnt_ / typeAlignCnt * typeAlignCnt;
         uint32_t alignTailCnt = caclCnt_ - alignMainCnt;
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n",
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n",
                                   curCoreStartCnt_ + offsetCnt_, caclCnt_);
         if (alignMainCnt > 0) {
             AscendC::DataCopy(inLocalI, tensorInfo.gmTensor[curCoreStartCnt_ + offsetCnt_], alignMainCnt);
@@ -217,7 +217,7 @@ private:
     __aicore__ inline void CopyIn(InTuple& inTensors, ATVC::IndexSequence<Is...>)
     {
         if constexpr (INPUT_COUNT == 0) {
-            ATVC::Kernel::DebugPrintf("[ERROR]:[ATVC][EleWise] Input Count can not be 0!\n");
+            ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Input Count can not be 0!\n");
             return;
         }
         AscendC::LocalTensor<uint8_t> inLocal = inQueue.template AllocTensor<uint8_t>();
@@ -238,7 +238,7 @@ private:
         constexpr uint32_t TYPE_ALIGN_CNT = 32 / sizeof(DataType);
         uint32_t alignMainCnt = caclCnt_ / TYPE_ALIGN_CNT * TYPE_ALIGN_CNT;
         uint32_t alignTailCnt = caclCnt_ - alignMainCnt;
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise][CopyOut] Offset is %u, copy count is %u.\n",
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise][CopyOut] Offset is %u, copy count is %u.\n",
                                   curCoreStartCnt_ + offsetCnt_, caclCnt_);
         if (alignMainCnt > 0) {
             AscendC::DataCopy(tensorInfo.gmTensor[curCoreStartCnt_ + offsetCnt_], outLocalI, alignMainCnt);
@@ -379,7 +379,6 @@ private:
 
     // The calculation object passed in by user
     EleWiseCompute compute_;
-    AscendC::TPipe pipeIn;
 };
 }
 }
diff --git a/atvc/include/elewise/utils/elewise_util.h b/atvc/include/elewise/utils/elewise_util.h
index f5154281..2b4cd2c0 100644
--- a/atvc/include/elewise/utils/elewise_util.h
+++ b/atvc/include/elewise/utils/elewise_util.h
@@ -24,14 +24,14 @@ namespace KernelUtils {
 template <typename T>
 __aicore__ inline void PrintParam(const T *param)
 {
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: blockNum = %u\n", param->tilingData.blockNum);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: tiledCnt = %u\n", param->tilingData.tiledCnt);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Tiling data: blockNum = %u\n", param->tilingData.blockNum);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Tiling data: tiledCnt = %u\n", param->tilingData.tiledCnt);
     ATVC::Kernel::DebugPrintf(
-        "[INFO]:[ATVC][EleWise] Tiling data: tailBlockCnt = %u\n", param->tilingData.tailBlockCnt);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: numPerBlock = %u\n", param->tilingData.numPerBlock);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Tiling data: tailElemCnt = %u\n", param->tilingData.tailElemCnt);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Param: nBufferNum = %u\n", param->nBufferNum);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise] Param: totalCnt = %u\n", param->totalCnt);
+        "[INFO]: [ATVC][EleWise] Tiling data: tailBlockCnt = %u\n", param->tilingData.tailBlockCnt);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Tiling data: numPerBlock = %u\n", param->tilingData.numPerBlock);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Tiling data: tailElemCnt = %u\n", param->tilingData.tailElemCnt);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Param: nBufferNum = %u\n", param->nBufferNum);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise] Param: totalCnt = %u\n", param->totalCnt);
     return;
 }
 
diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h
index 3eca4811..fe65d4dd 100644
--- a/atvc/include/reduce/reduce_op_template.h
+++ b/atvc/include/reduce/reduce_op_template.h
@@ -73,7 +73,7 @@ public:
     template<typename ...Args>
     __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param)
     {
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Start to run template function.\n");
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Start to run template function.\n");
         param_ = param;  
         KernelUtils::PrintParam<ReduceParam, SelectReducePolicy>(param_);
         if (!KernelUtils::CheckParam<ReduceParam>(param_)) {
@@ -81,8 +81,7 @@ public:
         }
         Init((GM_ADDR)(param_->workspaceAddr), x, y);  
         Process();
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] End to run template function.\n");
-        pipeIn.Destroy();
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Template function execution completed.\n");
     }
 
 public:
@@ -94,18 +93,19 @@ public:
     template <class... Args>
     __aicore__ inline void Init(GM_ADDR workspace, Args... args)
     {
+        pipe_ = GetTPipePtr();
         basicBlockLen_ = this->param_->tilingData.basicBlock;
-        bufPool_.template Init<DataType, PromoteDataType>(T_BUF_SIZE, PROMOTE_BUF_SIZE, this->param_->tilingData.basicBlock);
+        bufPool_.template Init<DataType, PromoteDataType>(pipe_, T_BUF_SIZE, PROMOTE_BUF_SIZE, this->param_->tilingData.basicBlock);
 
         InitArgsInput<0>(args...);
         InitArgsWorkspace(workspace);
-        GetTPipePtr()->InitBuffer(tempResQue_, RES_BUF_SIZE);
+        pipe_->InitBuffer(tempResQue_, RES_BUF_SIZE);
         computeRes_ = tempResQue_.Get<PromoteDataType>();
-        GetTPipePtr()->InitBuffer(tempBufQue_, CACHE_BUF_SIZE);
+        pipe_->InitBuffer(tempBufQue_, CACHE_BUF_SIZE);
 
         tempBuf_ = tempBufQue_.template Get<PromoteDataType>();
 
-        GetTPipePtr()->InitBuffer(tempUbQue_, BLOCK_SIZE_BYTE);
+        pipe_->InitBuffer(tempUbQue_, BLOCK_SIZE_BYTE);
     }
 
     /*!
@@ -227,7 +227,7 @@ public:
         copyInParams.srcStride = (view.axis[0].srcStride - view.burstLen) * sizeof(EleT);  // unit Byte
         copyInParams.dstStride =
             (view.axis[0].dstStride - view.burstLen) * sizeof(EleT) / BLOCK_SIZE_BYTE;  // unit block(32byte)
-            ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce][CopyIn] Padding is %d, padding value is %d,  block length is %u,"
+            ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce][CopyIn] Padding flag is %d, padding value is %d,  block length is %u,"
             " repeat count is %u.\n", isPadding, paddingValue, copyInParams.blockLen, copyInParams.blockCount);
         bufPool_.SyncTensor(ubTensor);
 
@@ -313,7 +313,7 @@ public:
         AscendC::DataCopyExtParams copyOutParams = {1, 1, 0, 0, 0};
         copyOutParams.blockCount = view.axis[0].repeat;
         copyOutParams.blockLen = view.burstLen * sizeof(T);
-        ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce][CopyOut] Block length is %u,  repeat count is %u.\n",
+        ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce][CopyOut] Block length is %u, repeat count is %u.\n",
                                   view.burstLen, copyOutParams.blockCount);
         AscendC::LocalTensor<DataType> outputLocal = tempBuf_[tmpBufOffest].template ReinterpretCast<DataType>();
         if constexpr (AscendC::IsSameType<PromoteDataType, DataType>::value) {
@@ -371,7 +371,7 @@ protected:
 
 private:
     ATVC::ReduceParam* param_;   // The runtime parameters calculated by CalcReduceTiling API
-    AscendC::TPipe pipeIn;
+    AscendC::TPipe* pipe_;
     AscendC::TBuf<> oriVecQue_;
     AscendC::TBuf<> tempResQue_;
     AscendC::TBuf<> tempBufQue_;
diff --git a/atvc/include/reduce/tiling/reduce_tiling.h b/atvc/include/reduce/tiling/reduce_tiling.h
index bcded4d2..7d42c023 100644
--- a/atvc/include/reduce/tiling/reduce_tiling.h
+++ b/atvc/include/reduce/tiling/reduce_tiling.h
@@ -58,11 +58,11 @@ public:
         EliminateOne(opInput_.reduceShape, opInput_.reduceDim, newShape, newShapeSize);
         MergeAxis(opInput_.reduceDim, newShape, newShapeSize);
         if (!DoTiling(newShape, newShapeSize)) {
-            printf("Do tiling failed!\n");
+            printf("[ERROR]: [ATVC][Reduce][Tiling] Do tiling failed!\n");
             return -1;
         }
         CalcWorkSpace();
-        printf("ReduceOpTiling Run success!\n");
+        printf("[INFO]: [ATVC][Reduce][Tiling] ReduceOpTiling Run success!\n");
         return 0;
     }
 
@@ -225,7 +225,7 @@ bool ComputeExtraUnitA(const std::vector<uint64_t>& shape)
     uint64_t dTypeSize = ge::GetSizeByDataType(opInput_.inputDtype);
     uint64_t promoteDtypeSize = ge::GetSizeByDataType(opInput_.promoteDtpye);
     if (dTypeSize == 0 || promoteDtypeSize == 0) {
-        printf("[Reduce Tiling] input dtype size cannot be zero!\n");
+        printf("[ERROR]: [ATVC][Reduce][Tiling] Input dtype size cannot be zero!\n");
         return false;
     }
     uint64_t bBlockNum = basicBlock_ / dTypeSize;
@@ -391,7 +391,7 @@ bool CalcCacheLineStep(const std::vector<uint64_t>& shape)
     // cacheLineStep record cacheLine-aligned axis's shape, while left is cacheLineOuter
     uint64_t dTypeSize = ge::GetSizeByDataType(opInput_.inputDtype);
     if (dTypeSize == 0) {
-        printf("[Reduce Tiling] input dtype size cannot be zero!\n");
+        printf("[ERROR]: [ATVC][Reduce][Tiling] Input dtype size cannot be zero!\n");
         return false;
     }
     uint64_t cacheSize = compileInfo_.cacheLineSize / dTypeSize;
diff --git a/atvc/include/reduce/utils/reduce_buf_pool.h b/atvc/include/reduce/utils/reduce_buf_pool.h
index eeba4768..e6779444 100644
--- a/atvc/include/reduce/utils/reduce_buf_pool.h
+++ b/atvc/include/reduce/utils/reduce_buf_pool.h
@@ -37,8 +37,9 @@ public:
     __aicore__ inline ReduceBufPool() {};
 
     template <class DataType, class PromoteDataType>
-    __aicore__ inline void Init(int32_t inputNum, int32_t computeNum, int32_t basicBlockLen)
+    __aicore__ inline void Init(AscendC::TPipe* pipeIn, int32_t inputNum, int32_t computeNum, int32_t basicBlockLen)
     {
+        pipe_ = pipeIn;
         constexpr int32_t inputEleSize = sizeof(DataType);
         constexpr int32_t computeEleSize = sizeof(PromoteDataType);
         basicNum_ = basicBlockLen / sizeof(DataType);
@@ -50,7 +51,7 @@ public:
         computeUnit_.eleSize = computeEleSize;
         computeUnit_.offset = basicNum_ * sizeof(DataType) * inputNum;
         // Init buffer
-        GetTPipePtr()->InitBuffer(qQue_, poolSize);
+        pipe_->InitBuffer(qQue_, poolSize);
         AscendC::LocalTensor<DataType> inputUb = qQue_.GetWithOffset<DataType>(basicNum_ * inputNum, 0);
         AscendC::Duplicate<DataType>(inputUb, 0, basicNum_ * inputNum);
     }
@@ -123,6 +124,7 @@ private:
     PoolManagerUnit computeUnit_;
     event_t eventIdV2Mte2_[MAX_INPUT_SIZE];
     AscendC::TBuf<> qQue_;
+    AscendC::TPipe* pipe_;
     int32_t basicNum_;
 };  // class ReduceBufPool
 } // namespace KernelUtils
diff --git a/atvc/include/reduce/utils/reduce_util.h b/atvc/include/reduce/utils/reduce_util.h
index 213fb6bc..f8832aae 100644
--- a/atvc/include/reduce/utils/reduce_util.h
+++ b/atvc/include/reduce/utils/reduce_util.h
@@ -51,24 +51,24 @@ __aicore__ inline constexpr int32_t GetComputeCount()
 template <typename T, const auto& SelectReducePolicy>
 __aicore__ inline void PrintParam(const T* param)
 {
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: factorACntPerCore = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: factorACntPerCore = %lu\n",
         param->tilingData.factorACntPerCore);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: factorATotalCnt = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: factorATotalCnt = %lu\n",
         param->tilingData.factorATotalCnt);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: ubFactorA = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: ubFactorA = %lu\n",
         param->tilingData.ubFactorA);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: factorRCntPerCore = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: factorRCntPerCore = %lu\n",
         param->tilingData.factorRCntPerCore);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: factorRTotalCnt = %lu\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: factorRTotalCnt = %lu\n",
         param->tilingData.factorRTotalCnt);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: ubFactorR = %lu\n", param->tilingData.ubFactorR);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: groupR = %lu\n", param->tilingData.groupR);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: outSize = %lu\n", param->tilingData.outSize);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: basicBlock = %lu\n", param->tilingData.basicBlock);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: coreNum = %d\n", param->tilingData.coreNum);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Tiling data: nBufferNum = %d\n", param->nBufferNum);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Work space size is %u\n", param->workspaceSize);
-    ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] policy = (%d, %d, %d)\n",
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: ubFactorR = %lu\n", param->tilingData.ubFactorR);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: groupR = %lu\n", param->tilingData.groupR);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: outSize = %lu\n", param->tilingData.outSize);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: basicBlock = %lu\n", param->tilingData.basicBlock);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: coreNum = %d\n", param->tilingData.coreNum);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Tiling data: nBufferNum = %d\n", param->nBufferNum);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] Work space size is %u\n", param->workspaceSize);
+    ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Reduce] policy = (%d, %d, %d)\n",
         SelectReducePolicy.patternID, SelectReducePolicy.loopARCount, SelectReducePolicy.loopInnerARCount);
     return;
 }
-- 
Gitee