diff --git a/examples/filter/dropout/CMakeLists.txt b/examples/filter/dropout/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..00cc3f66543121cadf0f897998346c6c773ac2d3 --- /dev/null +++ b/examples/filter/dropout/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/dropout_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(dropout_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/dropout_custom_tiling.cpp +) + +target_compile_options(dropout_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(dropout_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(dropout_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(dropout_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS dropout_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/README.md b/examples/filter/dropout/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a4aa0cf593bab3bd3d7d2b15048b32995039967 --- /dev/null +++ b/examples/filter/dropout/README.md @@ -0,0 +1,93 @@ + + +## 概述 + +本样例介绍了调用Dropout高阶API实现dropout算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于Dropout高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 数据过滤 > Dropout" 章节。 + +本样例以直调的方式调用算子核函数。 + 
+直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + DropOutCustom算子根据输入MaskTensor对输入SrcTensor进行过滤得到输出DstTensor,本算子仅支持输入shape为ND格式。DropOut API支持的过滤功能包括两种模式,字节模式和比特模式,本算子样例使用DropOut的字节模式。 + +- 算子规格: + + + + + + + + + + + + +
+| 算子类型(OpType) | DropoutCustom  |          |           |        |
+| ---------------- | -------------- | -------- | --------- | ------ |
+| 算子输入         | name           | shape    | data type | format |
+|                  | x              | 8*2048   | float     | ND     |
+|                  | mask           | 8*2048   | uint8_t   | ND     |
+| 算子输出         | y              | 8*2048   | float     | ND     |
+| 核函数名         | dropout_custom |          |           |        |
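+
+作为补充,下面给出一段NumPy参考示意,说明字节模式DropOut的预期计算语义:mask非0的元素保留并除以probValue(本样例为0.8),mask为0的元素置0,与scripts/gen_data.py中的真值生成逻辑一致。该示意仅用于理解计算语义,并非算子kernel实现本身。
+
+```python
+import numpy as np
+
+prob_value = 0.8  # 与kernel中的probValue一致
+x = np.random.uniform(1, 100, [8, 2048]).astype(np.float32)
+mask = np.random.randint(0, 2, size=[8, 2048]).astype(np.uint8)
+# mask非0:保留并缩放为x / probValue;mask为0:置0
+y = np.where(mask != 0, x / prob_value, np.zeros_like(x))
+```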
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8, 2048],输出y[8, 2048]的DropoutCustom算子。 + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Dropout高阶API接口完成dropout计算,得到最终结果,再搬出到外部存储上。 + + DropoutCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行dropout计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 根据输入数据x、mask的内轴、外轴长度和所用核数量确定所需tiling参数,并调用GetDropOutMaxMinTmpSize接口获取DropOut计算所需的临时空间大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/filter/dropout/cmake/cpu_lib.cmake b/examples/filter/dropout/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/filter/dropout/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/cmake/npu_lib.cmake b/examples/filter/dropout/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/filter/dropout/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/filter/dropout/main.cpp b/examples/filter/dropout/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b255de2ebe75478897e540ce269b280ca1f06f3 --- /dev/null +++ b/examples/filter/dropout/main.cpp @@ 
-0,0 +1,169 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/dropout_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_dropout_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t FIRST_AXIS = 8; +constexpr uint32_t SRC_LAST_AXIS = 2048; +} + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, uint32_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (size_t i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = FIRST_AXIS * SRC_LAST_AXIS * sizeof(uint32_t); + size_t maskSize = FIRST_AXIS * SRC_LAST_AXIS; + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *mask = (uint8_t *)AscendC::GmAlloc(maskSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + ReadFile("../input/input_mask.bin", maskSize, mask, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(dropout_custom, USED_CORE_NUM, x, y, mask, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = 
CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)mask); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *maskHost, *tiling; + uint8_t *xDevice, *yDevice, *maskDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&maskHost), maskSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&maskDevice, maskSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + ReadFile("../input/input_mask.bin", maskSize, maskHost, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(maskDevice, maskSize, maskHost, maskSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(dropout_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, maskDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(maskDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(maskHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.cpp b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c668709dc5b8160363d599746c034b2dc208cb6d --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "dropout_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum) { + std::vector srcDims = {firstAxis, srcLastAxis, maskLastAxis}; + + uint32_t typeSize = 4; + ge::Shape shape(srcDims); + uint32_t minValue = 0; + uint32_t maxValue = 0; + + AscendC::GetDropOutMaxMinTmpSize(shape, typeSize, false, maxValue, minValue); + + DropoutCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->firstAxis = firstAxis; + tiling->srcLastAxis = srcLastAxis; + tiling->maskLastAxis = maskLastAxis; + + tiling->tmpBufferSize = minValue; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.h b/examples/filter/dropout/op_host/dropout_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..ddf7470c8ff5dad3b8c240a99ce1092ff1329ca6 --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H + +#include + +struct DropoutCustomTilingData { + uint32_t firstAxis; + uint32_t srcLastAxis; + uint32_t maskLastAxis; + uint32_t tmpBufferSize; + uint32_t tileNum; +}; + +#endif // EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom.cpp b/examples/filter/dropout/op_kernel/dropout_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8bae3d52cacf9e97400efd7a3bb0b9758ffa45f7 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "dropout_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelDropout op; + op.Init(srcGm, dstGm, maskGm, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom_impl.h b/examples/filter/dropout/op_kernel/dropout_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..18541c2430be3848390b87764c0a4768034de9e5 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom_impl.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_IMPL_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_IMPL_H + +#include "../op_host/dropout_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelDropout { +public: + __aicore__ inline KernelDropout() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR m, DropoutCustomTilingData tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + srcSize = tilingData.firstAxis * tilingData.srcLastAxis; + tileNum = tilingData.tileNum; + blockLength = srcSize / AscendC::GetBlockNum(); + tileLength = this->blockLength / tileNum / BUFFER_NUM; + + info.firstAxis = tilingData.firstAxis / tilingData.tileNum; + info.srcLastAxis = tileLength; + info.maskLastAxis = tileLength; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + maskGm.SetGlobalBuffer((__gm__ uint8_t *)m + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe = pipeIn; + pipe->InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe->InitBuffer(inQueueMask, BUFFER_NUM, this->tileLength * sizeof(uint8_t)); + pipe->InitBuffer(outQueue, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe->InitBuffer(tmpBuf, tilingData.tmpBufferSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor maskLocal = inQueueMask.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(maskLocal, maskGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueMask.EnQue(maskLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = 
inQueueX.DeQue(); + AscendC::LocalTensor maskLocal = inQueueMask.DeQue(); + + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + AscendC::DropOut(yLocal, xLocal, maskLocal, sharedTmpBuffer, probValue, info); + + outQueue.EnQue(yLocal); + + inQueueX.FreeTensor(xLocal); + inQueueMask.FreeTensor(maskLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor maskGm; + + AscendC::TPipe *pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueMask; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + + uint32_t srcSize = 0; + uint32_t tileNum = 0; + uint32_t blockLength = 0; + uint32_t tileLength = 0; + float probValue = 0.8; + AscendC::DropOutShapeInfo info; +}; +} + +#endif \ No newline at end of file diff --git a/examples/filter/dropout/run.sh b/examples/filter/dropout/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..76ad17bbb6186bd6806c92e608f2a0a9203e40d5 --- /dev/null +++ b/examples/filter/dropout/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./dropout_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/filter/dropout/scripts/gen_data.py b/examples/filter/dropout/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..22b92a1c5e84822486d7e0a5502b432b3621a331 --- /dev/null +++ b/examples/filter/dropout/scripts/gen_data.py @@ -0,0 +1,32 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + + input_x = np.random.uniform(1, 100, input_shape).astype(dtype) + input_mask = np.random.randint(0, 2, size=input_shape, dtype=bool) + golden = np.where(input_mask, input_x / 0.8, np.zeros_like(input_x)) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + input_mask.tofile("./input/input_mask.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/math/cos/CMakeLists.txt b/examples/math/cos/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..188a58818efc58f6a78f6c0bdc893047f1fe4e79 --- /dev/null +++ b/examples/math/cos/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/cos_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(cos_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/cos_custom_tiling.cpp +) + +target_compile_options(cos_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(cos_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(cos_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(cos_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS cos_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/math/cos/README.md b/examples/math/cos/README.md new file mode 100644 index 0000000000000000000000000000000000000000..303f5bba4bf7c330c4b0d36272f9942212493737 --- /dev/null +++ b/examples/math/cos/README.md @@ -0,0 +1,92 @@ + + +## 概述 + +本样例介绍了调用Cos高阶API实现cos算子,并按照核函数直调的方式分别给出了对应的端到端实现。关于Cos高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 数学库 > Cos" 章节 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + CosCustom算子对输入tensor按元素做三角函数余弦运算。 + +- 算子规格: + + + + + + + + + + + +
+| 算子类型(OpType) | CosCustom  |          |           |        |
+| ---------------- | ---------- | -------- | --------- | ------ |
+| 算子输入         | name       | shape    | data type | format |
+|                  | x          | 8*2048   | float     | ND     |
+| 算子输出         | y          | 8*2048   | float     | ND     |
+| 核函数名         | cos_custom |          |           |        |
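+
+作为补充,下面给出与scripts/gen_data.py真值生成逻辑一致的NumPy参考示意,仅用于说明预期计算结果,并非算子kernel实现本身:
+
+```python
+import numpy as np
+
+# 对输入逐元素取余弦,输入取值范围与gen_data.py一致
+x = np.random.uniform(-65504, 65504, [8, 2048]).astype(np.float32)
+golden = np.cos(x).astype(np.float32)
+```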
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8, 2048],输出y[8, 2048]的CosCustom算子。 + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Cos高阶API接口完成cos计算,得到最终结果,再搬出到外部存储上。 + + CosCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行cos计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 根据输入长度和所用核数量设置tiling参数totalLength和coreNum。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/math/cos/cmake/cpu_lib.cmake b/examples/math/cos/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/math/cos/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/math/cos/cmake/npu_lib.cmake b/examples/math/cos/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/math/cos/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/math/cos/main.cpp b/examples/math/cos/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c1c704e0c0e5b9451d1278c9a40570e58121dbb --- /dev/null +++ b/examples/math/cos/main.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "op_host/cos_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_cos_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 2; +constexpr uint32_t TOTAL_LENGTH = 8 * 2048; +} + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, uint32_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (size_t i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(uint32_t); + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(cos_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *tiling; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), 
tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(cos_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/math/cos/op_host/cos_custom_tiling.cpp b/examples/math/cos/op_host/cos_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..352215a9931bc58122741c3568652754f1b84fed --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_tiling.h" +#include "tiling/tiling_api.h" +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum) { + CosCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->totalLength = totalLength; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/math/cos/op_host/cos_custom_tiling.h b/examples/math/cos/op_host/cos_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..1ba0775f5cf345748d164d26cf35c9b9adfd2f6d --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATH_COS_CUSTOM_TILING_H +#define EXAMPLES_MATH_COS_CUSTOM_TILING_H + +#include + +struct CosCustomTilingData { + uint32_t totalLength; + uint32_t tileNum; +}; + +#endif // EXAMPLES_MATH_COS_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom.cpp b/examples/math/cos/op_kernel/cos_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..86e77d7eab7972ab8d278f82e655d6dd2c5641da --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelCos op; + op.Init(x, y, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom_impl.h b/examples/math/cos/op_kernel/cos_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..09ee14d7f26092a25ef7148a6244004389f03a38 --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom_impl.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATH_COS_CUSTOM_IMPL_H +#define EXAMPLES_MATH_COS_CUSTOM_IMPL_H +#include "../op_host/cos_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelCos { +public: + __aicore__ inline KernelCos() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, CosCustomTilingData tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + this->tileNum = tilingData.tileNum; + this->blockLength = tilingData.totalLength / AscendC::GetBlockNum(); + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe = pipeIn; + pipe->InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe->InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(T)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::Cos(yLocal, xLocal, this->tileLength); + outQueueY.EnQue(yLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueueY.FreeTensor(yLocal); + } + +private: + AscendC::TPipe* pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t tileNum = 0; + uint32_t blockLength = 0; + uint32_t tileLength = 0; +}; +} // namespace MyCustomKernel + +#endif \ No newline at end of file diff --git a/examples/math/cos/run.sh b/examples/math/cos/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2f8c76a41493ebf1ad8c5c31959b19e1d4cd4c2a --- /dev/null +++ b/examples/math/cos/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./cos_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/math/cos/scripts/gen_data.py b/examples/math/cos/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..c295292ee9d933a2b02b3ac1103042506d8fd336 --- /dev/null +++ b/examples/math/cos/scripts/gen_data.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + # generate value between [-65504, 65504] + input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + golden = np.cos(input_x).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/readme.md b/examples/readme.md index 5af4dde79d76a75d8513887ad839e37851bcabb8..b3ebb211645bd3848a3dfb9f15082640bb987b65 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -146,6 +146,26 @@ welford_finalize Welford算法的后处理,一种在线计算均值和方差的方法。 + + math + cos + 对输入tensor做Cos计算。 + + + filter + dropout + 对输入tensor进行数据过滤。 + + + reduce + sum + 计算输入tensor最后一个维度的元素之和。 + + + transpose + confusion_transpose + 对输入数据进行数据排布及Reshape操作。 + pad broadcast diff --git a/examples/reduce/sum/CMakeLists.txt b/examples/reduce/sum/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..db2524fce36b508c5eed1a2e004b6a9a96b468f4 --- /dev/null +++ b/examples/reduce/sum/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/sum_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(sum_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/sum_custom_tiling.cpp +) + +target_compile_options(sum_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(sum_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(sum_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(sum_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS sum_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/README.md b/examples/reduce/sum/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9a492b2229c3862592ebc71d1b15aeb36a93c102 --- /dev/null +++ b/examples/reduce/sum/README.md @@ -0,0 +1,92 @@ + + +## 概述 + +本样例介绍了调用Sum高阶API实现sum算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于Sum高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 归约操作 > Sum" 章节。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + SumCustom算子,获取输入数据最后一个维度的元素总和。如果输入是向量,则对向量中各元素进行累加;如果输入是矩阵,则沿最后一个维度对每行中元素求和。 + +- 算子规格: + + + + + + + + + + + +
+| 算子类型(OpType) | SumCustom  |          |           |        |
+| ---------------- | ---------- | -------- | --------- | ------ |
+| 算子输入         | name       | shape    | data type | format |
+|                  | x          | 7*2023   | float     | ND     |
+| 算子输出         | y          | 8*1      | float     | ND     |
+| 核函数名         | sum_custom |          |           |        |
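+
+作为补充,下面给出一段NumPy参考示意,说明SumCustom的预期计算:对x沿最后一个维度逐行求和得到7个有效结果,输出y补齐到8个元素。示意中的输入取值范围和padding填充值(此处假设补0)均为假设,实际请以本样例的真值生成脚本为准:
+
+```python
+import numpy as np
+
+x = np.random.uniform(-10, 10, [7, 2023]).astype(np.float32)  # 取值范围仅为示意
+row_sum = np.sum(x, axis=-1)             # shape: [7],7个有效结果
+y = np.zeros([8, 1], dtype=np.float32)   # 最后1个元素为padding,此处假设补0
+y[:7, 0] = row_sum
+```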
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[7, 2023],输出y[8]的SumCustom算子,其中y中的有效值数量为7,对输入x的每行元素求和后,输出y的有效数据为前7位,最后一位为padding填充的数据。 + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Sum高阶API接口完成sum计算,得到最终结果,再搬出到外部存储上。 + + SumCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行sum计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 根据输入数据的内轴长度、内轴实际长度、外轴长度确定所需tiling参数,例如输出内轴补齐后长度等。调用GetSumMaxMinTmpSize接口获取Sum接口完成计算所需的临时空间大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/reduce/sum/cmake/cpu_lib.cmake b/examples/reduce/sum/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/reduce/sum/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/cmake/npu_lib.cmake b/examples/reduce/sum/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/reduce/sum/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/reduce/sum/main.cpp b/examples/reduce/sum/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c3ec5710ebf2f08f98f79c432de8b1fde70ac2b6 --- /dev/null +++ b/examples/reduce/sum/main.cpp @@ 
-0,0 +1,162 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/sum_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_sum_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t M = 7; // outter +constexpr uint32_t N = 2023; // inner_actual +} + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N); + +static bool CompareResult(const void *outputData, uint32_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (size_t i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + uint8_t *tiling = nullptr; + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) +#endif + GenerateTilingData(tiling, M, N); + + auto tilingData = reinterpret_cast(tiling); + + size_t inputSize = tilingData->outter * tilingData->inner * sizeof(uint32_t); + size_t outputSize = tilingData->out_inner * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(sum_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + 
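// Write the CPU-mode output to file and compare it with the golden data
// generated by scripts/gen_data.py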
WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(sum_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ebfc06e6f61fb18690aeb1a76d85324a239a4c99 --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "sum_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +namespace { +constexpr uint32_t PADDING_BYTE = 32U; +} + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N) { + uint32_t minValue = 0; + uint32_t maxValue = 0; + + AscendC::GetSumMaxMinTmpSize(N, sizeof(uint32_t), false, maxValue, minValue); + + SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + auto paddingFunc = [](const uint32_t n, const uint32_t typeSize) -> uint32_t { + if (typeSize == 0) { + return 0; + } + return (n * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; + }; + + tiling->outter = M; + tiling->inner = paddingFunc(N, sizeof(uint32_t)); + tiling->n = N; + tiling->tmpBufSize = minValue; + + tiling->out_inner = paddingFunc(M, sizeof(uint32_t)); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.h b/examples/reduce/sum/op_host/sum_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..e39475357ed4ae7393d4fbf2ba72c0638f6ba4c1 --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_REDUCE_SUM_COSTOM_TILING_H +#define EXAMPLES_REDUCE_SUM_COSTOM_TILING_H + +#include + +struct SumCustomTilingData { + uint32_t inner; + uint32_t outter; + uint32_t n; + uint32_t tmpBufSize; + uint32_t out_inner; +}; + +#endif // EXAMPLES_REDUCE_SUM_COSTOM_TILING_H \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom.cpp b/examples/reduce/sum/op_kernel/sum_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d18e0e8b8edfcf1164c94c3e220e30947baa393 --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "kernel_operator.h" +#include "sum_custom_impl.h" + +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelSum op; + op.Init(srcGm, dstGm, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom_impl.h b/examples/reduce/sum/op_kernel/sum_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..71f8bef190a523191aeb76f7857400a2f882efdb --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom_impl.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_REDUCE_SUM_CUSTOM_IMPL_H +#define EXAMPLES_REDUCE_SUM_CUSTOM_IMPL_H + +#include "../op_host/sum_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +template +class KernelSum { +public: + __aicore__ inline KernelSum() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, SumCustomTilingData tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + inner = tilingData.inner; + outter = tilingData.outter; + n = tilingData.n; + tmpBufSize = tilingData.tmpBufSize; + out_inner = tilingData.out_inner; + + params.inner = inner; + params.outter = outter; + params.n = n; + + xGm.SetGlobalBuffer((__gm__ T *)x); + yGm.SetGlobalBuffer((__gm__ T *)y); + + pipe = pipeIn; + pipe->InitBuffer(inQueue, 1, inner * outter * sizeof(T)); + pipe->InitBuffer(outQueue, 1, out_inner * sizeof(T)); + pipe->InitBuffer(tmpBuf, tmpBufSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor xLocal = inQueue.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, inner * outter); + inQueue.EnQue(xLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor xLocal = inQueue.DeQue(); + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + T scalar(0); + AscendC::Duplicate(yLocal, scalar, out_inner); + AscendC::Sum(yLocal, xLocal, sharedTmpBuffer, params); + + outQueue.EnQue(yLocal); + inQueue.FreeTensor(xLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm, yLocal, out_inner); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::TPipe* pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t inner = 0; + uint32_t outter = 0; + uint32_t n = 0; + uint32_t tmpBufSize = 0; + uint32_t out_inner = 0; + AscendC::SumParams params; +}; +} + +#endif \ No newline at end of file diff --git a/examples/reduce/sum/run.sh b/examples/reduce/sum/run.sh new file mode 
100644 index 0000000000000000000000000000000000000000..796c0fd787e97537762719509f4fcff26fd809c4 --- /dev/null +++ b/examples/reduce/sum/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./sum_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/reduce/sum/scripts/gen_data.py b/examples/reduce/sum/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..9538198ba677e42cf8ae53dd835dc6634f97403b --- /dev/null +++ b/examples/reduce/sum/scripts/gen_data.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [7, 2024] + input_x = np.random.uniform(-100, 100, input_shape).astype(dtype) + + golden = np.sum(input_x[:, :-1], axis=1).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/transpose/confusion_transpose/CMakeLists.txt b/examples/transpose/confusion_transpose/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef7fd4a9d807536ce598ce80ee962018ad37504e --- /dev/null +++ b/examples/transpose/confusion_transpose/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/confusion_transpose_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(confusion_transpose_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/confusion_transpose_custom_tiling.cpp +) + +target_compile_options(confusion_transpose_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(confusion_transpose_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(confusion_transpose_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(confusion_transpose_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS confusion_transpose_direct_kernel_op + 
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/README.md b/examples/transpose/confusion_transpose/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f633d6d6be7c69e3d51a326522b17c3661e208f7 --- /dev/null +++ b/examples/transpose/confusion_transpose/README.md @@ -0,0 +1,96 @@ + + +## 概述 + +本样例介绍了调用ConfusionTranspose高阶API实现confusionTranspose算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于ConfusionTranspose高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 变形 > ConfusionTranspose" 章节。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + ConfusionTransposeCustom算子,对输入数据进行数据排布及Reshape操作。 + +- 算子规格: + + + + + + + + + + + +
Operator type (OpType): ConfusionTransposeCustom

| | name | shape | data type | format |
| --- | --- | --- | --- | --- |
| Operator input | x | 1\*2\*2\*4\*16\*16 | half | NZ |
| Operator output | y | 1\*64\*2\*32 | half | ND |

Kernel function name: confusion_transpose_custom
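
For reference, the data rearrangement this operator performs can be reproduced in NumPy. This is a minimal sketch, consistent with scripts/gen_data.py and the implementation notes that follow, shown only for illustration (the variable names mirror the constants in main.cpp and are not part of the sample):

```
import numpy as np

B, N, S, HN_DIV = 1, 2, 64, 32          # shapes used by this sample
origin = np.random.uniform(-1000, 1000, (B, N, S, HN_DIV)).astype(np.float16)

# NZ layout fed to the kernel: split S and H/N into 16x16 fractal blocks,
# giving [B, N, H/N/16, S/16, 16, 16] (matches scripts/gen_data.py).
x_nz = origin.reshape(B, N, S // 16, 16, HN_DIV // 16, 16).transpose(0, 1, 4, 2, 3, 5)

# Expected ND output: axes 1 and 2 of the origin tensor are swapped -> [B, S, N, H/N].
golden = origin.transpose(0, 2, 1, 3)
```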
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[1, 2, 2, 4, 16, 16],输出y[1, 64, 2, 32]的ConfusionTransposeCustom算子,针对NZ2ND场景,实现1、2轴互换。 + +输入Tensor { shape:[B, N, H/N/16, S/16, 16, 16], origin_shape:[B, N, S, H/N], format:"NZ", origin_format:"ND"} + +输出Tensor { shape:[B, S, N, H/N], origin_shape:[B, S, N, H/N], format:"ND", origin_format:"ND"} + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用ConfusionTranspose高阶API接口完成confusionTranspose计算,得到最终结果,再搬出到外部存储上。 + + ConfusionTransposeCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行confusionTranspose计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 使用Ascend C提供的GetConfusionTransposeTilingInfo接口,获取所需的Tiling参数,并调用GetConfusionTransposeMaxMinTmpSize接口获取ConfusionTranspose接口计算所需的临时空间大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/npu_lib.cmake b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether 
the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/main.cpp b/examples/transpose/confusion_transpose/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8bd7759de4eca059b07aa5944a13ff67b6ffbfee --- /dev/null +++ b/examples/transpose/confusion_transpose/main.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_confusion_transpose_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 22; +constexpr uint32_t B = 1; +constexpr uint32_t N = 2; +constexpr uint32_t S = 64; +constexpr uint32_t HNDIV = 32; +} + +extern uint8_t *GenerateTiling(uint32_t b, uint32_t n, uint32_t s, uint32_t hnDiv); + +int32_t main(int32_t argc, char *argv[]) { + size_t inputSize = B * S * N * HNDIV * sizeof(uint16_t); + size_t outputSize = B * S * N * HNDIV * sizeof(uint16_t); + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, HNDIV); + + memcpy_s(tiling, tilingSize, buf, tilingSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(confusion_transpose_custom, USED_CORE_NUM, x, y, tiling); + + WriteFile("../output/output.bin", y, outputSize); + + free(buf); + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice, *tilingDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, HNDIV); + + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, buf, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + 
// Execute the kernel + ACLRT_LAUNCH_KERNEL(confusion_transpose_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, tilingDevice); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Clean up memory + free(buf); + + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cfb1244e4123eb2f522c6142dd453cb270169ff8 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp @@ -0,0 +1,24 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "confusion_transpose_custom_tiling.h" + +uint8_t *GetTilingBuf(optiling::ConfusionTransposeCustomTilingData *tilingData) { + uint32_t tilingSize = sizeof(optiling::ConfusionTransposeCustomTilingData); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(uint32_t b, uint32_t n, uint32_t s, uint32_t hnDiv) { + optiling::ConfusionTransposeCustomTilingData tiling; + ComputeTiling(b, n, s, hnDiv, tiling); + return GetTilingBuf(&tiling); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..b5098bf30ba2e001945de42436093ff2e9c13b2f --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/platform/platform_ascendc.h" +#include "tiling/tiling_api.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(ConfusionTransposeCustomTilingData) + TILING_DATA_FIELD_DEF(uint32_t, b); + TILING_DATA_FIELD_DEF(uint32_t, n); + TILING_DATA_FIELD_DEF(uint32_t, s); + TILING_DATA_FIELD_DEF(uint32_t, hnDiv); + TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(ConfusionTransposeCustom, ConfusionTransposeCustomTilingData) +} // namespace optiling + +void ComputeTiling(uint32_t b, uint32_t n, uint32_t s, uint32_t hnDiv, + optiling::ConfusionTransposeCustomTilingData &tiling) { + tiling.set_b(b); + tiling.set_n(n); + tiling.set_s(s); + tiling.set_hnDiv(hnDiv); + + std::vector shapeVec = {b, n, s, hnDiv}; + ge::Shape srcShape(shapeVec); + + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, sizeof(uint16_t), 1, maxValue, minValue); + const uint32_t stackBufferSize = minValue; + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, sizeof(uint16_t), 1, tiling.confusionTransposeTilingData); +} + +#endif // EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fd6ff38a3a54a5058b24a3d4a8a7671e638bd5e6 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "confusion_transpose_custom_impl.h" +#include "kernel_operator.h" + +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelConfusionTranspose op; + MyCustomKernel::VecTiling tilingData; + CopyTiling(&tilingData, tiling); + op.Init(x, y, tilingData, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ae2ea6ac167c69077365b5676c06c4edc41cb55b --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_IMPL_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_IMPL_H + +#include "kernel_operator.h" + +namespace MyCustomKernel { +struct VecTiling { + uint32_t b; + uint32_t n; + uint32_t s; + uint32_t hnDiv; + ConfusionTransposeTiling confusionTransposeTilingData; +}; + +template +class KernelConfusionTranspose { +public: + __aicore__ inline KernelConfusionTranspose() {} + __aicore__ inline void Init(__gm__ uint8_t *srcGm, __gm__ uint8_t *dstGm, const VecTiling &tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + this->b = tilingData.b; + this->n = tilingData.n; + this->s = tilingData.s; + this->hnDiv = tilingData.hnDiv; + + srcGlobal.SetGlobalBuffer((__gm__ T *)srcGm, b * n * s * hnDiv); + dstGlobal.SetGlobalBuffer((__gm__ T *)dstGm, b * n * s * hnDiv); + + pipe = pipeIn; + pipe->InitBuffer(inQueueSrcVecIn, 1, b * n * s * hnDiv * sizeof(T)); + pipe->InitBuffer(inQueueSrcVecOut, 1, b * n * s * hnDiv * sizeof(T)); + this->tiling = tilingData.confusionTransposeTilingData; + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.AllocTensor(); + AscendC::DataCopy(srcLocal, srcGlobal, b * n * s * hnDiv); + inQueueSrcVecIn.EnQue(srcLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.DeQue(); + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.AllocTensor(); + AscendC::ConfusionTranspose(dstLocal, srcLocal, AscendC::TransposeType::TRANSPOSE_NZ2ND_0213, this->tiling); + inQueueSrcVecOut.EnQue(dstLocal); + inQueueSrcVecIn.FreeTensor(srcLocal); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.DeQue(); + AscendC::DataCopy(dstGlobal, dstLocal, b * n * s * hnDiv); + inQueueSrcVecOut.FreeTensor(dstLocal); + } + +private: + AscendC::TPipe *pipe; + AscendC::TQue inQueueSrcVecIn; + AscendC::TQue inQueueSrcVecOut; + AscendC::GlobalTensor srcGlobal; + AscendC::GlobalTensor dstGlobal; + uint32_t b = 0; + uint32_t n = 0; + uint32_t s = 0; + uint32_t hnDiv = 0; + ConfusionTransposeTiling tiling; +}; +} // namespace MyCustomKernel +#endif \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/run.sh b/examples/transpose/confusion_transpose/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..cddce28f098bdb4770b57c95990f35f40e6d1fe0 --- /dev/null +++ b/examples/transpose/confusion_transpose/run.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./confusion_transpose_direct_kernel_op +fi + +cd .. +python3 scripts/verify_data.py output/output.bin output/golden.bin diff --git a/examples/transpose/confusion_transpose/scripts/gen_data.py b/examples/transpose/confusion_transpose/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..7afe9444f5ec4ee510fdcf4edccd41bd777b7771 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/gen_data.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float16 + + b, n, s, hn_div = 1, 2, 64, 32 + + origin_shape = (b, n, s, hn_div) + + origin_data = np.random.uniform(-1000.0, 1000.0, origin_shape).astype(dtype) + + reshape_shape = (b, n, s // 16, 16, hn_div // 16, 16) + reshaped_data = origin_data.reshape(reshape_shape) + + transposed_data = np.transpose(reshaped_data, (0, 1, 4, 2, 3, 5)) + + golden = np.transpose(origin_data, (0, 2, 1, 3)) + + os.system("mkdir -p ./input") + transposed_data.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/scripts/verify_data.py b/examples/transpose/confusion_transpose/scripts/verify_data.py new file mode 100644 index 0000000000000000000000000000000000000000..19122a78e348feed3cbd32480ba43320b12f6cb5 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/verify_data.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import sys +import numpy as np + +# for float16 +RELATIVE_TOL = 1e-3 +ABSOLUTE_TOL = 1e-5 +ERROR_TOL = 1e-3 + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float16).reshape(-1) + golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + + different_element_results = np.isclose(output, + golden, + rtol=RELATIVE_TOL, + atol=ABSOLUTE_TOL, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, ERROR_TOL)) + return error_ratio <= ERROR_TOL + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) \ No newline at end of file diff --git a/impl/activation/geglu/geglu_tiling.cpp b/impl/activation/geglu/geglu_tiling.cpp index 91204a8e62112372c853f7e72366e7af3abe6691..3afc5da060e39aef9ffa50bc1ec7bcd970154e9c 100644 --- a/impl/activation/geglu/geglu_tiling.cpp +++ b/impl/activation/geglu/geglu_tiling.cpp @@ -57,10 +57,10 @@ void GetGeGLUMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetGeGLUMaxTmpSize(inputSize, typeSize); } -void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { HighLevelApiCheck::TypeSizeVerifyingParameters(typeSize, SUPPORT_TYPESIZE); extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? GEGLU_FLOAT_CALC_FAC : GEGLU_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
GEGLU_FLOAT_CALC_FAC : GEGLU_HALF_CALC_FAC; } } // namespace AscendC diff --git a/impl/activation/softmax/softmax_tiling.cpp b/impl/activation/softmax/softmax_tiling.cpp index 401bf3bf1aa1a1c60ab4bb74c8d7bd1f174cbbee..08f5ab196228f2322ab61799ce5859a528211932 100644 --- a/impl/activation/softmax/softmax_tiling.cpp +++ b/impl/activation/softmax/softmax_tiling.cpp @@ -617,7 +617,7 @@ void SoftMaxFlashV2TilingFunc(const ge::Shape& srcShape, const uint32_t dataType if (isFlashOutputBrc && (softmaxFlashTiling.get_rangeM() > 1 || softmaxFlashTiling.get_tailM() != 0)) { ASCENDC_HOST_ASSERT((softmaxFlashTiling.get_reduceM() % (SOFTMAX_DEFAULT_BLK_SIZE / dataTypeSize1) == 0), return, - "When dataTypeSize1(%d) is float(or half), softmaxFlashTiling.reduceM(%d) must be a multiple of 8(or 16), " + "When dataTypeSize1(%u) is float(or half), softmaxFlashTiling.reduceM(%u) must be a multiple of 8(or 16), " "Adjust the input parameter -> localWorkSpaceSize.\n", dataTypeSize1, softmaxFlashTiling.get_reduceM()); } } diff --git a/impl/activation/swish/swish_common_impl.h b/impl/activation/swish/swish_common_impl.h index 047005d67202e379e2c29e7ac4074d8600c40474..b682cb6d6bc4c262c5390bdda3bd5442e65b4d1a 100644 --- a/impl/activation/swish/swish_common_impl.h +++ b/impl/activation/swish/swish_common_impl.h @@ -52,14 +52,6 @@ __aicore__ inline __inout_pipe__(V) void SwishCompute( if ASCEND_IS_AIC { return; } -#if ASCENDC_CPU_DEBUG - bool ret = (dataSize <= srcLocal.GetSize()) && (dataSize <= dstLocal.GetSize()) && (dataSize > 0); - ASCENDC_ASSERT( - ret, { KERNEL_LOG(KERNEL_ERROR, "DataSize must bigger than 0 and smaller than or equal to src&dst tensor."); }); - - ret = (std::is_same::value) || (std::is_same::value); - ASCENDC_ASSERT(ret, { KERNEL_LOG(KERNEL_ERROR, "DataType must be half or float."); }); -#endif T scalar = static_cast(static_cast(-1) * static_cast(scalarValue)); #if __CCE_AICORE__ >= 200 SetMaskCount(); diff --git a/impl/math/acosh/acosh_tiling.cpp b/impl/math/acosh/acosh_tiling.cpp index 200618a8e49f796a38ed03e5c9a72cbc11746b1e..08ed5ef07f1a2354ad8982df7f74073dab434a5b 100644 --- a/impl/math/acosh/acosh_tiling.cpp +++ b/impl/math/acosh/acosh_tiling.cpp @@ -48,9 +48,9 @@ void GetAcoshMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetAcoshMaxTmpSize(inputSize, typeSize); } -void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ACOSH_FLOAT_CALC_PROC : ACOSH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ACOSH_FLOAT_CALC_PROC : ACOSH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/asinh/asinh_tiling.cpp b/impl/math/asinh/asinh_tiling.cpp index e0a917feda6949135c1024f9db656c12b10326ce..3d87ed0fc14edb7b212350f9a9d9d13a0997f67f 100644 --- a/impl/math/asinh/asinh_tiling.cpp +++ b/impl/math/asinh/asinh_tiling.cpp @@ -48,9 +48,9 @@ void GetAsinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetAsinhMaxTmpSize(inputSize, typeSize); } -void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? 
ASINH_FLOAT_CALC_PROC : ASINH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ASINH_FLOAT_CALC_PROC : ASINH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/atan/atan_common_impl.h b/impl/math/atan/atan_common_impl.h index c727294d162272b3cd8119a61c9fec5ecd7d0729..80e2d4304a028665427997edaaf84249d0e29312 100644 --- a/impl/math/atan/atan_common_impl.h +++ b/impl/math/atan/atan_common_impl.h @@ -270,7 +270,7 @@ __aicore__ inline void AtanImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/atan/atan_tiling_impl.cpp b/impl/math/atan/atan_tiling_impl.cpp index b58abf939f5b4b164554bb1b6f22669ea98d5247..6ffcc20633e2a140712f8a10f648473a67703724 100644 --- a/impl/math/atan/atan_tiling_impl.cpp +++ b/impl/math/atan/atan_tiling_impl.cpp @@ -47,9 +47,9 @@ void GetAtanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetAtanMaxTmpSize(inputSize, typeSize); } -void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ATAN_FLOAT_CALC_FAC : ATAN_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ATAN_FLOAT_CALC_FAC : ATAN_HALF_CALC_FAC; } } // AscendC \ No newline at end of file diff --git a/impl/math/atanh/atanh_tiling.cpp b/impl/math/atanh/atanh_tiling.cpp index 493ebe9d53541fe8117f99fb7fe88552c20a0826..2b437a733f9874172219031e850956b05a0e6dd4 100644 --- a/impl/math/atanh/atanh_tiling.cpp +++ b/impl/math/atanh/atanh_tiling.cpp @@ -44,9 +44,9 @@ void GetAtanhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetAtanhMaxTmpSize(inputSize, typeSize); } -void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ATANH_FLOAT_CALC_PROC : ATANH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ATANH_FLOAT_CALC_PROC : ATANH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/axpy/axpy_tiling_impl.cpp b/impl/math/axpy/axpy_tiling_impl.cpp index 9d22afcd00e6575dcf71c050672028295dd0ef77..6df81faddc24783cbf6f5e2f2f314605c61dd1b7 100644 --- a/impl/math/axpy/axpy_tiling_impl.cpp +++ b/impl/math/axpy/axpy_tiling_impl.cpp @@ -53,9 +53,9 @@ void GetAxpyMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co maxValue = GetAxpyMaxTmpSize(inputSize, typeSize); } -void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf) +void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? AXPY_FLOAT_CALC_PROC : AXPY_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
AXPY_FLOAT_CALC_PROC : AXPY_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/ceil/ceil_tiling_impl.cpp b/impl/math/ceil/ceil_tiling_impl.cpp index 8d76e57c6a86e37a6d7fb4b94dcd55fd801e6c64..d5ec08eb75e9210a68f239221075c089243f3d1f 100644 --- a/impl/math/ceil/ceil_tiling_impl.cpp +++ b/impl/math/ceil/ceil_tiling_impl.cpp @@ -45,9 +45,9 @@ void GetCeilMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetCeilMaxTmpSize(inputSize, typeSize); } -void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? CEIL_FLOAT_CALC_FAC : CEIL_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? CEIL_FLOAT_CALC_FAC : CEIL_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/clamp/clamp_tiling_impl.cpp b/impl/math/clamp/clamp_tiling_impl.cpp index 0df62079a1bfee80a9fdc83b7e5a8e652d88291b..1ce8255eec81c0b853b38e3d09423ba7e2036662 100644 --- a/impl/math/clamp/clamp_tiling_impl.cpp +++ b/impl/math/clamp/clamp_tiling_impl.cpp @@ -37,10 +37,10 @@ void GetClampMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c } } -void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { (void)typeSize; extraBuf = 0; - maxLiveNodeCnt = CLAMP_CALC_FAC; + maxLiveNodeCount = CLAMP_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/cos/cos_tiling_impl.cpp b/impl/math/cos/cos_tiling_impl.cpp index c4cf8bcecb2e4fc793d831004d6b5abfb0503d2b..1f7d3ae5debe38277fb8e7150d9e4e7ce28e4cb6 100644 --- a/impl/math/cos/cos_tiling_impl.cpp +++ b/impl/math/cos/cos_tiling_impl.cpp @@ -62,9 +62,9 @@ void GetCosMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con maxValue = GetCosMaxTmpSize(inputSize, typeSize, isReuseSource); } -void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? COS_FLOAT_NOREUSE_CALC_FAC : COS_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
COS_FLOAT_NOREUSE_CALC_FAC : COS_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/cosh/cosh_common_impl.h b/impl/math/cosh/cosh_common_impl.h index 70dad85a38e06b0063168e36813ce1a9ec9d2ba1..b45df24f32bfd9b2e4112504507cf40450b9fc59 100644 --- a/impl/math/cosh/cosh_common_impl.h +++ b/impl/math/cosh/cosh_common_impl.h @@ -157,7 +157,7 @@ __aicore__ inline void CoshImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/erfc/erfc_common_impl.h b/impl/math/erfc/erfc_common_impl.h index ce63b39f9da6f366c2a018dd465ca5f6eb84da5c..a5706c2ffd2a042fe6274f930fce5dd1dd6b1f6b 100644 --- a/impl/math/erfc/erfc_common_impl.h +++ b/impl/math/erfc/erfc_common_impl.h @@ -346,7 +346,7 @@ __aicore__ inline void ErfcImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC #endif // IMPL_MATH_ERFC_ERFC_COMMON_IMPL_H diff --git a/impl/math/erfc/erfc_tiling_impl.cpp b/impl/math/erfc/erfc_tiling_impl.cpp index 00d5ecc284167b6ea7e61a089a4834629500b147..44bfeea0198bf62f5d3428ae350ca92bfbae7007 100644 --- a/impl/math/erfc/erfc_tiling_impl.cpp +++ b/impl/math/erfc/erfc_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetErfcMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetErfcMaxTmpSize(inputSize, typeSize); } -void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ERFC_FLOAT_CALC_FAC : ERFC_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ERFC_FLOAT_CALC_FAC : ERFC_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/floor/floor_tiling_impl.cpp b/impl/math/floor/floor_tiling_impl.cpp index b959cb64dd4c76ff7e072530abc2a5d579032f45..ac50fc2c4208e96a016189fd4cfa19f4cdb0ff2f 100644 --- a/impl/math/floor/floor_tiling_impl.cpp +++ b/impl/math/floor/floor_tiling_impl.cpp @@ -44,9 +44,9 @@ void GetFloorMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetFloorMaxTmpSize(inputSize, typeSize); } -void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? 0 : FLOOR_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
0 : FLOOR_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/fmod/fmod_common_impl.h b/impl/math/fmod/fmod_common_impl.h index 264afdebbff4c93ec9bb5a49ceac2bee4cbb8fcf..1cec4fa9c184afa183b6d660302289344a7015cc 100644 --- a/impl/math/fmod/fmod_common_impl.h +++ b/impl/math/fmod/fmod_common_impl.h @@ -94,10 +94,10 @@ __aicore__ inline void FmodImpl(const LocalTensor &dstTensor, const LocalTens return; } - constexpr uint32_t maxLiveNodeCnt = 8; // The corresponding maxLiveNodeCnt for half is 8, extra is 3 * 2 + trunc 2. + constexpr uint32_t maxLiveNodeCount = 8; // The corresponding maxLiveNodeCount for half is 8, extra is 3 * 2 + trunc 2. uint32_t bufferSize = sharedTmpBuffer.GetSize(); uint32_t stackSize = - bufferSize / sizeof(T) / maxLiveNodeCnt / ONE_BLK_SIZE * ONE_BLK_SIZE; // divided by how many counts + bufferSize / sizeof(T) / maxLiveNodeCount / ONE_BLK_SIZE * ONE_BLK_SIZE; // divided by how many counts CheckTmpBufferSize(stackSize, 0, bufferSize); ASCENDC_ASSERT((src0Tensor.GetSize() > 0), { KERNEL_LOG(KERNEL_ERROR, "src0Tensor size must > 0!"); }); stackSize = stackSize > src0Tensor.GetSize() ? src0Tensor.GetSize() : stackSize; // No more than localTensor diff --git a/impl/math/fmod/fmod_tiling_impl.cpp b/impl/math/fmod/fmod_tiling_impl.cpp index 1f68f81dc15c94d79799f2a3cc7f23f1fb57e1d8..103ba893cc66cf1336bf1960a2d489964f405289 100644 --- a/impl/math/fmod/fmod_tiling_impl.cpp +++ b/impl/math/fmod/fmod_tiling_impl.cpp @@ -50,10 +50,10 @@ void GetFmodMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co minValue = GetFmodMinTmpSize(typeSize, truncCalPro); } -void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - GetTruncTmpBufferFactorSize(typeSize, maxLiveNodeCnt, extraBuf); - maxLiveNodeCnt += (typeSize == sizeof(float)) ? FMOD_FLOAT_TENSOR_NUM : FMOD_HALF_TENSOR_NUM; + GetTruncTmpBufferFactorSize(typeSize, maxLiveNodeCount, extraBuf); + maxLiveNodeCount += (typeSize == sizeof(float)) ? FMOD_FLOAT_TENSOR_NUM : FMOD_HALF_TENSOR_NUM; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/frac/frac_tiling_impl.cpp b/impl/math/frac/frac_tiling_impl.cpp index 5fa01948c7c2d4cbdcd832ddd2b95467150b7ee8..caafa6e5e08205ac226867f9f2bec6c5659419bd 100644 --- a/impl/math/frac/frac_tiling_impl.cpp +++ b/impl/math/frac/frac_tiling_impl.cpp @@ -49,9 +49,9 @@ void GetFracMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetFracMaxTmpSize(inputSize, typeSize); } -void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? FRAC_FLOAT_CALC_FAC : FRAC_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
FRAC_FLOAT_CALC_FAC : FRAC_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/lgamma/lgamma_common_basic_impl.h b/impl/math/lgamma/lgamma_common_basic_impl.h index 57b3a7c44fb40751d24951cbd9cefad0d1197c2c..87f5b7553d2f045785f11d0ffc1f0d6a3025af30 100644 --- a/impl/math/lgamma/lgamma_common_basic_impl.h +++ b/impl/math/lgamma/lgamma_common_basic_impl.h @@ -54,7 +54,7 @@ __aicore__ inline void LGammaCalcMulAdd(const LocalTensor &tmp, const Loc } // cal result of 0.7 <= x < 1.5 on tmp1, Ln return inf when x is 0 -__aicore__ inline void LGamma007(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma007(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = MulAdd(x) LGammaCalcMulAdd(params.tmp1, src, params.unaryParams, params.binaryParams, params007, params007Len); @@ -71,7 +71,7 @@ __aicore__ inline void LGamma007(const LocalTensor &src, const LGammaFPar } // cal result of 0.7 <= x < 1.5 on tmp2 -__aicore__ inline void LGamma0715(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma0715(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = 1.0 - x Muls(params.tmp1, src, fn1, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -84,7 +84,7 @@ __aicore__ inline void LGamma0715(const LocalTensor &src, const LGammaFPa } // cal result of 1.5 <= x < 3 on tmp2 -__aicore__ inline void LGamma153(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma153(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = x - 2.0 Adds(params.tmp1, src, fn2, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -95,7 +95,7 @@ __aicore__ inline void LGamma153(const LocalTensor &src, const LGammaFPar } // cal result of 3 <= x < 5.8 on tmp3 -__aicore__ inline void LGamma358(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma358(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = x - 3.0 Adds(params.tmp1, src, fn3, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -123,7 +123,7 @@ __aicore__ inline void LGamma358(const LocalTensor &src, const LGammaFPar } // cal result of x >= 5.8 on tmp1 -__aicore__ inline void LGamma58(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma58(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = ln(x) * 0.5 Ln(params.tmp1, src, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -167,7 +167,7 @@ __aicore__ inline void LGamma58(const LocalTensor &src, const LGammaFPara // gen mask, src < scalar set 1, other set 0, used tmp1 __aicore__ inline void LGammaGenLTMask( - const LocalTensor &mask, const LocalTensor &src, const LGammaFParams ¶ms, const float scalar) + const LocalTensor &mask, const LocalTensor &src, const LGammaParams ¶ms, const float scalar) { Duplicate(params.tmp1, scalar, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -179,7 +179,7 @@ __aicore__ inline void LGammaGenLTMask( // gen mask, src >= scalar set 1, other set 0, used tmp1 __aicore__ inline void LGammaGenGEMask( - const LocalTensor &mask, const LocalTensor &src, const LGammaFParams ¶ms, const float scalar) + const LocalTensor &mask, const LocalTensor &src, const LGammaParams ¶ms, const float scalar) { Duplicate(params.tmp1, scalar, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -191,7 +191,7 @@ __aicore__ inline void LGammaGenGEMask( // gen mask on params.mask, min > src >= max set 1, other set 0, used tmp1 __aicore__ inline void LGammaGenRangeMask( - const LocalTensor &src, const 
LGammaFParams ¶ms, const float min, const float max) + const LocalTensor &src, const LGammaParams ¶ms, const float min, const float max) { LGammaGenLTMask(params.mask, src, params, max); LGammaGenGEMask(params.tmpMask1, src, params, min); @@ -209,7 +209,7 @@ __aicore__ inline void LGammaGenRangeMask( // Select the value of src at mask 1, and accumulate the result onto dst, used tmp1 __aicore__ inline void LGammaSelect(const LocalTensor &dst, const LocalTensor &src, - const LocalTensor &mask, const LGammaFParams ¶ms) + const LocalTensor &mask, const LGammaParams ¶ms) { SetCmpMask(params.tmpScalar); PipeBarrier(); @@ -220,7 +220,7 @@ __aicore__ inline void LGammaSelect(const LocalTensor &dst, const LocalTe } // tmp6 is |x|, res on tmp5 -__aicore__ inline void LGammaPositive(const LGammaFParams ¶ms) +__aicore__ inline void LGammaPositive(const LGammaParams ¶ms) { Duplicate(params.tmp5, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -258,7 +258,7 @@ __aicore__ inline void LGammaPositive(const LGammaFParams ¶ms) } // cal tmp val on tmp2, mask for odd, tmpMask1 for even, tmp6 is |x| -__aicore__ inline void LGammaCalNegTmp1(const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalNegTmp1(const LGammaParams ¶ms) { // tmp2 = floor(tmp6 + tmp6 + 0.5) Add(params.tmp2, params.tmp6, params.tmp6, MASK_PLACEHOLDER, 1, params.binaryParams); @@ -298,7 +298,7 @@ __aicore__ inline void LGammaCalNegTmp1(const LGammaFParams ¶ms) } // input is tmp2, mask for odd, tmpMask1 for even, result on tmp2 -__aicore__ inline void LGammaCalNegTmp2(const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalNegTmp2(const LGammaParams ¶ms) { // tmp3 = tmp2 * tmp2 Mul(params.tmp3, params.tmp2, params.tmp2, MASK_PLACEHOLDER, 1, params.binaryParams); @@ -325,7 +325,7 @@ __aicore__ inline void LGammaCalNegTmp2(const LGammaFParams ¶ms) } // Get final result, tmp3 save for inf -__aicore__ inline void LGammaCalNegTmp3(const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalNegTmp3(const LGammaParams ¶ms) { // tmp2 = 1.1447298526763916015625 - ln(|tmp2| * tmp6) - tmp5 Abs(params.tmp1, params.tmp2, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -347,7 +347,7 @@ __aicore__ inline void LGammaCalNegTmp3(const LGammaFParams ¶ms) } // Near zero negative, input as |x|, result on tmp1 -__aicore__ inline void LGammaCalMinNeg(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalMinNeg(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = -ln(src) Ln(params.tmp1, src, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -357,7 +357,7 @@ __aicore__ inline void LGammaCalMinNeg(const LocalTensor &src, const LGam } // cal for x < 0, result on tmp4, tmp6 is |x|, tmp5 is pos res -__aicore__ inline void LGammaNegative(const LGammaFParams ¶ms) +__aicore__ inline void LGammaNegative(const LGammaParams ¶ms) { Duplicate(params.tmp4, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -431,7 +431,7 @@ __aicore__ inline void LGammaNegative(const LGammaFParams ¶ms) template __aicore__ inline void LGammaInitFParams( - const LocalTensor &tmp, const uint32_t splitSize, const LocalTensor &src, LGammaFParams ¶ms) + const LocalTensor &tmp, const uint32_t splitSize, const LocalTensor &src, LGammaParams ¶ms) { params.tmp1 = tmp; params.tmp2 = tmp[splitSize]; @@ -464,6 +464,37 @@ __aicore__ inline void LGammaInitFParams( params.splitSize = splitSize; } + +template +__aicore__ inline void LGammaInitHParams( + const LocalTensor &tmp, const uint32_t splitSize, 
const LocalTensor &src, LGammaParams ¶ms) +{ + params.tmp1 = tmp; + params.tmp2 = tmp[splitSize]; + params.tmp3 = params.tmp2[splitSize]; + params.tmp4 = params.tmp3[splitSize]; + params.tmp5 = params.tmp4[splitSize]; + params.tmpScalar = params.tmp5[splitSize]; + params.mask = params.tmpScalar[splitSize].ReinterpretCast(); + params.tmpMask1 = params.mask[splitSize]; + params.tmpMask2 = params.tmpMask1[splitSize]; + params.tmpMask3 = params.tmpMask2[splitSize]; + params.tmp6 = params.tmpScalar[splitSize * i2]; + + params.tmp1.SetSize(splitSize); + params.tmp2.SetSize(splitSize); + params.tmp3.SetSize(splitSize); + params.tmp4.SetSize(splitSize); + params.tmp5.SetSize(splitSize); + params.mask.SetSize(splitSize); + params.tmpMask1.SetSize(splitSize); + params.tmpMask2.SetSize(splitSize); + params.tmpMask3.SetSize(splitSize); + params.tmpScalar.SetSize(splitSize); + params.tmp6.SetSize(splitSize * i6); + + params.splitSize = splitSize; +} } // namespace AscendC #endif #endif // IMPL_MATH_LGAMMA_LGAMMA_COMMOM_BASIC_IMPL_H \ No newline at end of file diff --git a/impl/math/lgamma/lgamma_common_impl.h b/impl/math/lgamma/lgamma_common_impl.h index 6be655e66c82e9ff7ab6ac677801a71d37bc9ffd..e53a6865e4b7df1d28a7e16f6a77936d2a33976b 100644 --- a/impl/math/lgamma/lgamma_common_impl.h +++ b/impl/math/lgamma/lgamma_common_impl.h @@ -265,102 +265,73 @@ __aicore__ inline void LGammaSelectINF(const LocalTensor &dstTensor, cons PipeBarrier(); } -__aicore__ inline void LgammaComputeImpl(const LocalTensor &dstTensor, const LocalTensor &srcTensor, - const LocalTensor &tmpTensor, const uint32_t &splitSize, bool isReuseSource) +__aicore__ inline void LgammaComputeImpl(const LocalTensor &dstTensor, const LocalTensor &srcTensor, + LGammaParams ¶ms, const uint32_t splitSize) { - (void)isReuseSource; - const UnaryRepeatParams unaryParams; - const BinaryRepeatParams binParams; - // half-->float - LocalTensor restmpBuffer = tmpTensor; - LocalTensor srctmpBuffer = restmpBuffer[splitSize]; - Duplicate(restmpBuffer, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); + Duplicate(params.tmp1, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); - Cast(srctmpBuffer, srcTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + Cast(params.tmp2, srcTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); PipeBarrier(); - // compute result x >= 0 - LocalTensor TensorPosRes = srctmpBuffer[splitSize]; - // compute result x < 0 - LocalTensor TensorNegRes = TensorPosRes[splitSize]; - LocalTensor tmp1Tensor = TensorNegRes[splitSize]; // all 0 tensor - LocalTensor tmpScalar = tmp1Tensor[splitSize]; - Duplicate(tmpScalar, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); - PipeBarrier(); - LocalTensor MaskPos = tmpScalar[splitSize].ReinterpretCast(); - LocalTensor MaskNeg = MaskPos[splitSize]; - LocalTensor tmpMask1 = MaskNeg[splitSize]; - LocalTensor tmpMask2 = tmpMask1[splitSize]; - LocalTensor stackTensor = tmpScalar[splitSize*i2]; - - restmpBuffer.SetSize(splitSize); - srctmpBuffer.SetSize(splitSize); - TensorPosRes.SetSize(splitSize); - TensorNegRes.SetSize(splitSize); - tmp1Tensor.SetSize(splitSize); - tmpScalar.SetSize(splitSize); - MaskNeg.SetSize(splitSize); - MaskPos.SetSize(splitSize); - tmpMask1.SetSize(splitSize); - tmpMask2.SetSize(splitSize); - stackTensor.SetSize(splitSize * i6); + Duplicate(params.tmpScalar, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); + PipeBarrier(); 
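    // Overall flow of the refactored half-precision path: params.tmp2 holds the float cast
    // of the half input, and params.tmp1 (zero-initialized above) accumulates the result.
    // The x >= 0 and x < 0 branch results are computed separately and merged into tmp1 by
    // mask selection; lanes that match neither mask (NaN inputs) are passed through,
    // |x| >= 65504 (the largest finite half) is resolved through LGammaSelectINF, and the
    // float result is cast back to half at the end.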
// compute result x >= 0 - LgammaComputePosHalf(TensorPosRes, srctmpBuffer, stackTensor, splitSize); + LgammaComputePosHalf(params.tmp3, params.tmp2, params.tmp6, splitSize); PipeBarrier(); // compute mask x >= 0 - LGammaGenGEMaskHalf(MaskPos, srctmpBuffer, tmp1Tensor, 0.0f, splitSize); + LGammaGenGEMaskHalf(params.mask, params.tmp2, params.tmp5, 0.0f, splitSize); PipeBarrier(); - LGammaSelectHalf(restmpBuffer, TensorPosRes, MaskPos, tmp1Tensor, tmpScalar); + LGammaSelectHalf(params.tmp1, params.tmp3, params.mask, params.tmp5, params.tmpScalar); PipeBarrier(); // compute result x < 0 - LgammaComputeNegHalf(TensorNegRes, srctmpBuffer, stackTensor, splitSize); + LgammaComputeNegHalf(params.tmp4, params.tmp2, params.tmp6, splitSize); PipeBarrier(); // compute mask x < 0 - LGammaGenLTMaskHalf(MaskNeg, srctmpBuffer, tmp1Tensor, 0.0f, splitSize); + LGammaGenLTMaskHalf(params.tmpMask1, params.tmp2, params.tmp5, 0.0f, splitSize); PipeBarrier(); - LGammaSelectHalf(restmpBuffer, TensorNegRes, MaskNeg, tmp1Tensor, tmpScalar); + LGammaSelectHalf(params.tmp1, params.tmp4, params.tmpMask1, params.tmp5, params.tmpScalar); PipeBarrier(); // for nan - SetVectorMask(0, ConstCeil(splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); - Not(tmpMask1.ReinterpretCast(), MaskPos.ReinterpretCast(), - MASK_PLACEHOLDER, 1, unaryParams); - Not(tmpMask2.ReinterpretCast(), MaskNeg.ReinterpretCast(), - MASK_PLACEHOLDER, 1, unaryParams); + SetVectorMask(0, ConstCeil(params.splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); + Not(params.tmpMask2.ReinterpretCast(), params.mask.ReinterpretCast(), + MASK_PLACEHOLDER, 1, params.unaryParams); + Not(params.tmpMask3.ReinterpretCast(), params.tmpMask1.ReinterpretCast(), + MASK_PLACEHOLDER, 1, params.unaryParams); PipeBarrier(); - And(tmpMask1.ReinterpretCast(), tmpMask1.ReinterpretCast(), - tmpMask2.ReinterpretCast(), MASK_PLACEHOLDER, 1, binParams); + And(params.tmpMask2.ReinterpretCast(), params.tmpMask1.ReinterpretCast(), + params.tmpMask3.ReinterpretCast(), MASK_PLACEHOLDER, 1, params.binaryParams); PipeBarrier(); - SetVectorMask(0, splitSize); - LGammaSelectHalf(restmpBuffer, srctmpBuffer, tmpMask1, tmp1Tensor, tmpScalar); + SetVectorMask(0, params.splitSize); + LGammaSelectHalf(params.tmp1, params.tmp2, params.tmpMask2, params.tmp4, params.tmpScalar); PipeBarrier(); // for inf/-inf - Abs(srctmpBuffer, srctmpBuffer, MASK_PLACEHOLDER, 1, unaryParams); + Abs(params.tmp2, params.tmp2, MASK_PLACEHOLDER, 1, params.unaryParams); PipeBarrier(); // generate |x| >= 65504 mask - LGammaGenGEMaskHalf(tmpMask1, srctmpBuffer, tmp1Tensor, 65504.0f, splitSize); + LGammaGenGEMaskHalf(params.tmpMask2, params.tmp2, params.tmp4, 65504.0f, splitSize); PipeBarrier(); - SetVectorMask(0, ConstCeil(splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); - Not(tmpMask2.ReinterpretCast(), tmpMask1.ReinterpretCast(), - MASK_PLACEHOLDER, 1, unaryParams); + SetVectorMask(0, ConstCeil(params.splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); + Not(params.tmpMask3.ReinterpretCast(), params.tmpMask2.ReinterpretCast(), + MASK_PLACEHOLDER, 1, params.unaryParams); PipeBarrier(); - SetVectorMask(0, splitSize); - LGammaSelectINF(restmpBuffer, restmpBuffer, tmpMask2, tmp1Tensor, tmpScalar); + SetVectorMask(0, params.splitSize); + LGammaSelectINF(params.tmp1, params.tmp1, params.tmpMask3, params.tmp4, params.tmpScalar); PipeBarrier(); // float-->half - Cast(dstTensor, restmpBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, + Cast(dstTensor, params.tmp1, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, {1, 1, 
HALF_DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); } __aicore__ inline void LgammaComputeImpl( - const LocalTensor &dst, const LocalTensor &src, LGammaFParams ¶ms) + const LocalTensor &dst, const LocalTensor &src, LGammaParams ¶ms) { // Gen masks with x >= 0 and < 0, which will not be overwritten in the future LGammaGenGEMask(params.tmpMask2, src, params, 0.0f); @@ -423,25 +394,30 @@ __aicore__ inline void LgammaCompute(const LocalTensor &dstTensor, const L CheckTmpBufferSize(tmpBufferSize, 0, bufferSize); LocalTensor tmpBuffer = sharedTmpBuffer.ReinterpretCast(); - uint32_t stackSize = 0; + uint32_t splitSize = 0; - stackSize = tmpBufferSize / LGAMMA_HALF_CALC_PROCEDURE / ONE_BLK_SIZE * ONE_BLK_SIZE; // 32 byte - CheckTmpBufferSize(stackSize, 0, bufferSize); + splitSize = tmpBufferSize / LGAMMA_HALF_CALC_PROCEDURE / ONE_BLK_SIZE * ONE_BLK_SIZE; // 32 byte + CheckTmpBufferSize(splitSize, 0, bufferSize); + + // init params + LGammaParams params; + LGammaInitHParams(tmpBuffer, splitSize, srcTensor, params); - const uint32_t round = calCount / stackSize; - const uint32_t tail = calCount % stackSize; + const uint32_t round = calCount / splitSize; + const uint32_t tail = calCount % splitSize; SetMaskCount(); - SetVectorMask(0, stackSize); + SetVectorMask(0, splitSize); uint32_t offset = 0; for (uint32_t i = 0; i < round; i++) { - LgammaComputeImpl(dstTensor[offset], srcTensor[offset], tmpBuffer, stackSize, isReuseSource); - offset = offset + stackSize; + LgammaComputeImpl(dstTensor[offset], srcTensor[offset], params, splitSize); + offset = offset + splitSize; } if (tail > 0) { SetVectorMask(0, tail); + params.splitSize = tail; LgammaComputeImpl( - dstTensor[round * stackSize], srcTensor[round * stackSize], tmpBuffer, stackSize, isReuseSource); + dstTensor[round * splitSize], srcTensor[round * splitSize], params, splitSize); } SetMaskNorm(); AscendCUtils::ResetMask(); @@ -469,7 +445,7 @@ __aicore__ inline void LgammaCompute(const LocalTensor &dst, const LocalT CheckTmpBufferSize(splitSize, 0, tmpBufferSize); // init params - LGammaFParams params; + LGammaParams params; LGammaInitFParams(tmpBuffer, splitSize, src, params); const uint32_t loopCount = calCount / splitSize; @@ -480,7 +456,6 @@ __aicore__ inline void LgammaCompute(const LocalTensor &dst, const LocalT LgammaComputeImpl(dst[i * splitSize], src[i * splitSize], params); } if (calcTail > 0) { - calcTail = (calcTail + ONE_BYTE_BIT_SIZE - 1U) / ONE_BYTE_BIT_SIZE * ONE_BYTE_BIT_SIZE; SetVectorMask(0, calcTail); params.splitSize = calcTail; LgammaComputeImpl(dst[loopCount * splitSize], src[loopCount * splitSize], params); diff --git a/impl/math/lgamma/lgamma_common_utils.h b/impl/math/lgamma/lgamma_common_utils.h index 123389918a72bd4e55c2f326457404d54b252926..bfc535250e92fd14edff117244a1755361ab9d8d 100644 --- a/impl/math/lgamma/lgamma_common_utils.h +++ b/impl/math/lgamma/lgamma_common_utils.h @@ -97,8 +97,8 @@ constexpr float negParamsEven[negParamsEvenLen] = { -0.000195746586541645228862762451171875, 0.0083327032625675201416015625, -0.16666662693023681640625}; } // namespace -struct LGammaFParams { - __aicore__ LGammaFParams() +struct LGammaParams { + __aicore__ LGammaParams() {} LocalTensor tmp1; LocalTensor tmp2; diff --git a/impl/math/log/log_tiling.cpp b/impl/math/log/log_tiling.cpp index 43c9542939d2add606fe54f4f06fab4b49bf3995..389db54e3948b45566be5212a3ddb990647105a0 100644 --- a/impl/math/log/log_tiling.cpp +++ b/impl/math/log/log_tiling.cpp @@ -51,11 +51,11 @@ void GetLogMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t 
typeSize, con minValue = 0; } -void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { (void)typeSize; extraBuf = 0; - maxLiveNodeCnt = 0; + maxLiveNodeCount = 0; } void GetLog10MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, @@ -68,11 +68,11 @@ void GetLog10MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c minValue = 0; } -void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { (void)typeSize; extraBuf = 0; - maxLiveNodeCnt = 0; + maxLiveNodeCount = 0; } void GetLog2MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, @@ -86,10 +86,10 @@ void GetLog2MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetLog2MaxTmpSize(inputSize, typeSize); } -void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { auto calcFactor = (typeSize == sizeof(float)) ? LOG2_FLOAT_CALC_FAC : LOG2_HALF_CALC_FAC; extraBuf = 0; - maxLiveNodeCnt = calcFactor; + maxLiveNodeCount = calcFactor; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/round/round_tiling_impl.cpp b/impl/math/round/round_tiling_impl.cpp index 760e226eda434845b54780b0040ba9faf77ee17e..98d6a5daa60f5f5257bd4a0b34162268d92dee77 100644 --- a/impl/math/round/round_tiling_impl.cpp +++ b/impl/math/round/round_tiling_impl.cpp @@ -47,22 +47,22 @@ void GetRoundMaxMinTmpSize(const platform_ascendc::PlatformAscendC &ascendcPlatf } void GetRoundTmpBufferFactorSize(const platform_ascendc::PlatformAscendC &ascendcPlatform, const uint32_t typeSize, - uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) + uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; platform_ascendc::SocVersion socVersion = ascendcPlatform.GetSocVersion(); constexpr uint32_t liveNodeTwo = 2; if (socVersion == platform_ascendc::SocVersion::ASCEND910B) { if (typeSize == sizeof(float)) { - maxLiveNodeCnt = 0; + maxLiveNodeCount = 0; } else { - maxLiveNodeCnt = 1; + maxLiveNodeCount = 1; } } else if (socVersion == platform_ascendc::SocVersion::ASCEND310P) { if (typeSize == sizeof(float)) { - maxLiveNodeCnt = 1; + maxLiveNodeCount = 1; } else { - maxLiveNodeCnt = liveNodeTwo; + maxLiveNodeCount = liveNodeTwo; } } } diff --git a/impl/math/sin/sin_common_impl.h b/impl/math/sin/sin_common_impl.h index 432584b317108f86bd0e37acc23db138a35cae5e..8d125e1db3056eeb9ba1c1db12a856ad420e109b 100644 --- a/impl/math/sin/sin_common_impl.h +++ b/impl/math/sin/sin_common_impl.h @@ -263,7 +263,7 @@ __aicore__ inline void SinImpl(const LocalTensor& dstTensor, const LocalTenso LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/sin/sin_tiling_impl.cpp b/impl/math/sin/sin_tiling_impl.cpp index 86870e202f4869e4f8224ea54ab1fca8d27789bb..b12b16c1c3613e2cd7e2e40373ac91e4f9c1e31e 100644 --- a/impl/math/sin/sin_tiling_impl.cpp +++ 
b/impl/math/sin/sin_tiling_impl.cpp @@ -25,10 +25,10 @@ constexpr uint32_t SIN_FLOAT_REUSE_CALC_FAC = 2; constexpr uint32_t SIN_ONE_REPEAT_BYTE_SIZE = 256; } // namespace -void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf) +void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? SIN_FLOAT_NOREUSE_CALC_FAC : SIN_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? SIN_FLOAT_NOREUSE_CALC_FAC : SIN_HALF_CALC_FAC; } inline uint32_t GetSinMaxTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource) diff --git a/impl/math/sinh/sinh_common_impl.h b/impl/math/sinh/sinh_common_impl.h index 9a0ab9a844a676548cf8a5b5bfb20caa0fe1e224..15fa0cbd5cb07d97c6fb047cb0130218b2f856d0 100644 --- a/impl/math/sinh/sinh_common_impl.h +++ b/impl/math/sinh/sinh_common_impl.h @@ -160,7 +160,7 @@ __aicore__ inline void SinhImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - SinhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + SinhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/sinh/sinh_tiling_impl.cpp b/impl/math/sinh/sinh_tiling_impl.cpp index 6474d1f9580e084ab60642cc29badf299b518f03..156319dad4231b9ec22ce24eb26346551b6341e2 100644 --- a/impl/math/sinh/sinh_tiling_impl.cpp +++ b/impl/math/sinh/sinh_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetSinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetSinhMaxTmpSize(inputSize, typeSize); } -void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? SINH_FLOAT_CALC_PROC : SINH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
SINH_FLOAT_CALC_PROC : SINH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/tan/tan_common_impl.h b/impl/math/tan/tan_common_impl.h index cf413b7085be72a97e0a746d3923a8858480c5d9..1f2283224a44a438847ae0c4b4e9455d4abe5cda 100644 --- a/impl/math/tan/tan_common_impl.h +++ b/impl/math/tan/tan_common_impl.h @@ -332,7 +332,7 @@ __aicore__ inline void TanImpl(const LocalTensor& dstTensor, const LocalTenso LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/tan/tan_tiling_impl.cpp b/impl/math/tan/tan_tiling_impl.cpp index ffe5daf3b7e4658129255ec5440679df2273b0db..3bf548a6f6a959825672a9605a808819fe8abd70 100644 --- a/impl/math/tan/tan_tiling_impl.cpp +++ b/impl/math/tan/tan_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetTanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con maxValue = GetTanMaxTmpSize(inputSize, typeSize); } -void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? TAN_FLOAT_CALC_FAC : TAN_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? TAN_FLOAT_CALC_FAC : TAN_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/tanh/tanh_common_impl.h b/impl/math/tanh/tanh_common_impl.h index 798bebb11651b129ce8cc8fddbf1fadddf6eaac0..331ec4b9442ef4793480f368338ed60a8403efec 100644 --- a/impl/math/tanh/tanh_common_impl.h +++ b/impl/math/tanh/tanh_common_impl.h @@ -165,7 +165,7 @@ __aicore__ inline void TanhImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/tanh/tanh_tiling_impl.cpp b/impl/math/tanh/tanh_tiling_impl.cpp index e96438f6e1741e492fad4715da46647c43120454..3854e5765fcabf9ab99a9dfe91abfad760d034fd 100644 --- a/impl/math/tanh/tanh_tiling_impl.cpp +++ b/impl/math/tanh/tanh_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetTanhMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co maxValue = GetTanhMaxTmpSize(inputSize, typeSize); } -void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf) +void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? TANH_FLOAT_CALC_PROC : TANH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
TANH_FLOAT_CALC_PROC : TANH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/trunc/trunc_tiling_impl.cpp b/impl/math/trunc/trunc_tiling_impl.cpp index 75c7577ed7bc42b57ebca2c5b174cda313324f51..bad7750af8a39a84c1c5aca2d303cf631a72c7ff 100644 --- a/impl/math/trunc/trunc_tiling_impl.cpp +++ b/impl/math/trunc/trunc_tiling_impl.cpp @@ -47,9 +47,9 @@ void GetTruncMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetTruncMaxTmpSize(inputSize, typeSize); } -void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? TRUNC_FLOAT_CALC_FAC : TRUNC_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? TRUNC_FLOAT_CALC_FAC : TRUNC_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/normalization/layernormgrad/layernormgrad_common_impl.h b/impl/normalization/layernormgrad/layernormgrad_common_impl.h index 1460d91c6ef8622f9fbb1bec935f8d2012a4e64e..b3cffd201dffa196bc072d47398f12019e9f8ff4 100644 --- a/impl/normalization/layernormgrad/layernormgrad_common_impl.h +++ b/impl/normalization/layernormgrad/layernormgrad_common_impl.h @@ -509,7 +509,7 @@ __aicore__ inline void LayerNormGradImpl(const LocalTensor &outputPdX, const ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); LayerNormGradImpl(outputPdX, resForGamma, inputDy, inputX, inputVariance, inputMean, inputGamma, - stackBuffer, epsilon, tiling); + stackBuffer, epsilon, tiling, shapeInfo); } } // namespace AscendC #endif // IMPL_NORMALIZATION_LAYERNORMGRAD_LAYERNORMGRAD_COMMON_IMPL_H diff --git a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp index 244456092f3ea848a0fcbd09f77ffaa73af861e3..65abcb07ee617a4987efff7324b3e51489031dee 100644 --- a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp +++ b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp @@ -38,7 +38,7 @@ void GetReduceXorSumMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t type inputSize); std::vector shapeDims = srcShape.GetDims(); ASCENDC_HOST_ASSERT(shapeDims.size() > 0UL, return, - "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape dimension number is %lli, expected is greater than 0!", + "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape dimension number is %lu, expected is greater than 0!", shapeDims.size()); ASCENDC_HOST_ASSERT(typeSize == 2U, return, "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter typeSize is %u, expected is 2!", typeSize); diff --git a/lib/activation/geglu_tiling.h b/lib/activation/geglu_tiling.h index 742d6dda9cc688943c7702f34aef5fd286321b36..7284ad795a3c5c53531d4f1cbc3b7ed6fc5d39eb 100644 --- a/lib/activation/geglu_tiling.h +++ b/lib/activation/geglu_tiling.h @@ -33,13 +33,13 @@ void GetGeGLUMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the GeGLU interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. 
+ * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_ACTIVATION_GEGLU_TILING_H diff --git a/lib/activation/reglu.h b/lib/activation/reglu.h index cd1abc2b6bca2e2732d8f8b71970b644631b8341..f166db2fff8b51aa7e19c36c3cab52ac43e7a123 100644 --- a/lib/activation/reglu.h +++ b/lib/activation/reglu.h @@ -36,7 +36,7 @@ __aicore__ inline void ReGlu(const LocalTensor& dstTensor, const LocalTensor< if (g_coreType == AIC) { return; } - ReGluImpl(dstTensor, srcTensor0, srcTensor1, sharedTmpBuffer, calCount); + ReGluImpl(dstTensor, srcTensor0, srcTensor1, sharedTmpBuffer, calCount); } /* @@ -52,7 +52,7 @@ __aicore__ inline void ReGlu(const LocalTensor& dstTensor, const LocalTensor< if (g_coreType == AIC) { return; } - ReGluImpl(dstTensor, srcTensor0, srcTensor1, calCount); + ReGluImpl(dstTensor, srcTensor0, srcTensor1, calCount); } #pragma end_pipe diff --git a/lib/activation/silu.h b/lib/activation/silu.h index 482c1a9ed5d9d454786b343e56a2c2a6b8949ec4..ada158151869d1fb5da331a26244802a37011123 100644 --- a/lib/activation/silu.h +++ b/lib/activation/silu.h @@ -38,7 +38,7 @@ __aicore__ inline __inout_pipe__(V) void Silu(const LocalTensor &dstLocal, co if ASCEND_IS_AIC { return; } - SiluCompute(dstLocal, srcLocal, dataSize); + SiluCompute(dstLocal, srcLocal, dataSize); } } // namespace AscendC diff --git a/lib/activation/swiglu_tiling.h b/lib/activation/swiglu_tiling.h index 0170fbd96df9b61da5fd517c41dc0abae68334b2..ef2bf1e652ba08607f025bdfc65d38d3a08614fe 100644 --- a/lib/activation/swiglu_tiling.h +++ b/lib/activation/swiglu_tiling.h @@ -35,11 +35,11 @@ void GetSwiGLUMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, /*! * \brief The calculation of the SwiGLU interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
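 * A minimal usage sketch (illustrative only: the 16 KiB buffer size is an assumption, and a
 * non-zero maxLiveNodeCount is assumed before dividing):
 *   uint32_t maxLiveNodeCount = 0;
 *   uint32_t extraBuf = 0;
 *   GetGeGLUTmpBufferFactorSize(sizeof(float), maxLiveNodeCount, extraBuf);
 *   // elements that fit into one pass over a 16 KiB shared temporary buffer
 *   uint32_t iterationSize = (16 * 1024 - extraBuf) / maxLiveNodeCount / sizeof(float);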
* * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetSwiGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/activation/swish.h b/lib/activation/swish.h index edb2ea938be8f632e36fd25848c2b688f802e518..5ed05ba0abbb2995aa6115db62e72d73990934e0 100644 --- a/lib/activation/swish.h +++ b/lib/activation/swish.h @@ -38,7 +38,7 @@ __aicore__ inline __inout_pipe__(V) void Swish( if ASCEND_IS_AIC { return; } - SwishCompute(dstLocal, srcLocal, dataSize, scalarValue); + SwishCompute(dstLocal, srcLocal, dataSize, scalarValue); } } // namespace AscendC #endif // LIB_SWISH_SWISH_H \ No newline at end of file diff --git a/lib/math/acos_tiling.h b/lib/math/acos_tiling.h index bb3bb41bf2b1b0f5598a37f9e6200f9d00d96ee0..5997cff9e81d63747b58c6fbb6d7afb8f2435479 100644 --- a/lib/math/acos_tiling.h +++ b/lib/math/acos_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetAcosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/acosh_tiling.h b/lib/math/acosh_tiling.h index a57ec81d85be1fa9087f5ac6cbe0ce8eb2c74c3a..1b1ee4f1b0319cef8ff189911c9067c0f5f411cf 100644 --- a/lib/math/acosh_tiling.h +++ b/lib/math/acosh_tiling.h @@ -34,13 +34,13 @@ void GetAcoshMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Acosh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AcsendC #endif // LIB_MATH_ACOSH_TILING \ No newline at end of file diff --git a/lib/math/asin_tiling.h b/lib/math/asin_tiling.h index 8e845953bb98062668e1847acaac13270737b57d..17fe9ecbcee1e687ec75833f2a62f875637111df 100644 --- a/lib/math/asin_tiling.h +++ b/lib/math/asin_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetAsinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/asinh_tiling.h b/lib/math/asinh_tiling.h index a37360bc0040118b206dbd9233c7453f7e121541..b4edef07b082a6917179706e962333e4e1161703 100644 --- a/lib/math/asinh_tiling.h +++ b/lib/math/asinh_tiling.h @@ -33,13 +33,13 @@ void GetAsinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Asinh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_ASINH_TILING_H \ No newline at end of file diff --git a/lib/math/atan.h b/lib/math/atan.h index c25b39347a2d2240026e58aa06f8ab568aa29f67..f3e59887caa5cbfbfe7c9c32f48b14d2b431c2f2 100644 --- a/lib/math/atan.h +++ b/lib/math/atan.h @@ -70,7 +70,7 @@ template __aicore__ inline void Atan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! 
@@ -104,7 +104,7 @@ template __aicore__ inline void Atan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - AtanImpl(dstTensor, srcTensor, calCount); + AtanImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/atan_tiling.h b/lib/math/atan_tiling.h index 5e92c5ceffa6bddbc0517cc63add37f8a761da8e..43b715052afc3aca381fb748b31732bfd36e2647 100644 --- a/lib/math/atan_tiling.h +++ b/lib/math/atan_tiling.h @@ -34,13 +34,13 @@ void GetAtanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Atan interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_ATAN_TILING_H \ No newline at end of file diff --git a/lib/math/atanh.h b/lib/math/atanh.h index 47ccd80dfeab07fd89e2ca01dbd934e7398fe474..44ddc13c3d9f6d0c70d8b5c51d137cf70747a767 100644 --- a/lib/math/atanh.h +++ b/lib/math/atanh.h @@ -45,7 +45,7 @@ __aicore__ inline void Atanh(const LocalTensor &dstTensor, return; } - AtanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + AtanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /* ! diff --git a/lib/math/atanh_tiling.h b/lib/math/atanh_tiling.h index 9a3926992c956edd74d0242a18676884ee07160b..96e3b2d78232597bfdf06daca115efc810ad168e 100644 --- a/lib/math/atanh_tiling.h +++ b/lib/math/atanh_tiling.h @@ -30,14 +30,14 @@ void GetAtanhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Atanh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] ascendcPlatform, platform Information * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_ATANH_TILING_H \ No newline at end of file diff --git a/lib/math/axpy_tiling.h b/lib/math/axpy_tiling.h index f6460291dfc7ae265a7e1d189d5d578dca76b2e3..60daae9cfcea5cf9cd86a4bf688f4ce847acd610 100644 --- a/lib/math/axpy_tiling.h +++ b/lib/math/axpy_tiling.h @@ -34,12 +34,12 @@ void GetAxpyMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Axpy interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * \param [in] typeSize: size of the input data type, in bytes - * \param [out] maxLiveNodeCnt: the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount: the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf: the size of the extra temporary space */ -void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf); +void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf); } // namespace AscendC #endif // LIB_MATH_AXPY_TILING_H \ No newline at end of file diff --git a/lib/math/ceil.h b/lib/math/ceil.h index e0cfcab0733fe2fa7d1f9e1f56012216862effe4..2f61a244998b02d08de2dc20471e9b24647920cc 100644 --- a/lib/math/ceil.h +++ b/lib/math/ceil.h @@ -46,7 +46,7 @@ __aicore__ inline void Ceil(const LocalTensor &dstTensor, const LocalTensor(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! 
@@ -65,7 +65,7 @@ __aicore__ inline void Ceil(const LocalTensor &dstTensor, const LocalTensor(dstTensor, srcTensor, calCount); } #pragma end_pipe diff --git a/lib/math/ceil_tiling.h b/lib/math/ceil_tiling.h index 79c0f8514492864bcde4b7d51b3e8739458a7f9e..aa822dce5c459460d1b3379e861085260c2b8522 100644 --- a/lib/math/ceil_tiling.h +++ b/lib/math/ceil_tiling.h @@ -38,9 +38,9 @@ void GetCeilMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_CEIL_TILING_H diff --git a/lib/math/clamp.h b/lib/math/clamp.h index 2e1c85bc85cd1473c38e5cce7dce47f419c69710..b1cfa6f5182c8fa8a1924055bb4b99dbc86b2016 100644 --- a/lib/math/clamp.h +++ b/lib/math/clamp.h @@ -45,7 +45,7 @@ __aicore__ inline void ClampMax(const LocalTensor& dstTensor, const LocalTens if ASCEND_IS_AIC { return; } - ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } template @@ -60,7 +60,7 @@ __aicore__ inline void ClampMax(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ret = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ret), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } /* ! 
@@ -87,7 +87,7 @@ __aicore__ inline void ClampMin(const LocalTensor& dstTensor, const LocalTens if ASCEND_IS_AIC { return; } - ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } template @@ -102,7 +102,7 @@ __aicore__ inline void ClampMin(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ret = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ret), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/clamp_tiling.h b/lib/math/clamp_tiling.h index 675b7acf6489c1402810ebf8186b78329810c44a..3e6f4d1ce90bbf1b6a5a81424c258bf47364322e 100644 --- a/lib/math/clamp_tiling.h +++ b/lib/math/clamp_tiling.h @@ -38,9 +38,9 @@ void GetClampMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_CLAMP_TILING_H \ No newline at end of file diff --git a/lib/math/cos_tiling.h b/lib/math/cos_tiling.h index 80ff4954c8a6416272a0db64f452abca1f50154c..4bb5381928d6b2a419a31d90e1c659a1914ddb83 100644 --- a/lib/math/cos_tiling.h +++ b/lib/math/cos_tiling.h @@ -34,13 +34,13 @@ void GetCosMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Cos interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AcsendC #endif // LIB_MATH_COS_TILING_H \ No newline at end of file diff --git a/lib/math/cosh.h b/lib/math/cosh.h index 035ce1d8a2a34cd2c4d253a93c1a9afaf0f83390..7d246be36245f6d6815d096123aca60383b3f223 100644 --- a/lib/math/cosh.h +++ b/lib/math/cosh.h @@ -65,7 +65,7 @@ template __aicore__ inline void Cosh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -98,7 +98,7 @@ __aicore__ inline void Cosh(const LocalTensor& dstTensor, const LocalTensor __aicore__ inline void Cosh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - CoshImpl(dstTensor, srcTensor, calCount); + CoshImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/cosh_tiling.h b/lib/math/cosh_tiling.h index 58b173619accbe21741944db7d33b74b6d0b83e2..a89c628824a41b5fd10b2a723d4fda9e1d272b94 100644 --- a/lib/math/cosh_tiling.h +++ b/lib/math/cosh_tiling.h @@ -34,8 +34,8 @@ void GetCoshMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Cosh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space diff --git a/lib/math/erf_tiling.h b/lib/math/erf_tiling.h index cd69dcdf708e309252c1750f9653fd7d2e3f0acd..f5887e46d5fbc612f678f6cb0e9dcf829dd20b16 100644 --- a/lib/math/erf_tiling.h +++ b/lib/math/erf_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetErfTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/erfc.h b/lib/math/erfc.h index c82246ea02321cbc9f676ec22188eda13d319d0f..fac81c9df81c4dcf697a1bb2a0cc39cc1f53aab4 100644 --- a/lib/math/erfc.h +++ b/lib/math/erfc.h @@ -61,7 +61,7 @@ template __aicore__ inline void Erfc(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -92,7 +92,7 @@ __aicore__ inline void Erfc(const LocalTensor& dstTensor, const LocalTensor __aicore__ inline void Erfc(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - ErfcImpl(dstTensor, srcTensor, calCount); + ErfcImpl(dstTensor, srcTensor, calCount); } /*! diff --git a/lib/math/erfc_tiling.h b/lib/math/erfc_tiling.h index e5cbd9480b8f21c7e7e8c5368db1d45b0b36499a..85560c2d75fe438ff1d2bd19ab9aa060b0371ec9 100644 --- a/lib/math/erfc_tiling.h +++ b/lib/math/erfc_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetErfcMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, @@ -40,6 +40,6 @@ void GetErfcMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * \param [out] maxValue, maximum temporary space required * \param [out] minValue, minimum temporary space required */ -void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_ERFC_TILING_H \ No newline at end of file diff --git a/lib/math/exp_tiling.h b/lib/math/exp_tiling.h index 41f841f80fb4d48519d30db50b7f52c0fb133148..0fae539b6ebb3812f0ae3755003e7ed2aa38a133 100644 --- a/lib/math/exp_tiling.h +++ b/lib/math/exp_tiling.h @@ -22,8 +22,8 @@ namespace AscendC { /*! * \brief The calculation of the Exp interface requires the developer to reserve or apply for temporary space. 
The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * \param [in] typeSize: size of the input data type, in bytes * \param [out] maxLiveNodeCount: the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer: the size of the extra temporary space diff --git a/lib/math/floor_tiling.h b/lib/math/floor_tiling.h index 9e0163ee48632e58307d091bbd8d1dcad1166918..60141f45cd775958a98a2ef2542c88ef521ed21d 100644 --- a/lib/math/floor_tiling.h +++ b/lib/math/floor_tiling.h @@ -38,9 +38,9 @@ void GetFloorMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_FLOOR_TILING_H \ No newline at end of file diff --git a/lib/math/fmod_tiling.h b/lib/math/fmod_tiling.h index 7ebb76a1c56cf06d983fac48323938074d8f80dd..98db7f19e3aeb7ff9e607a8bfe14726f6c3b8c16 100644 --- a/lib/math/fmod_tiling.h +++ b/lib/math/fmod_tiling.h @@ -38,9 +38,9 @@ void GetFmodMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_FMOD_TILING_H \ No newline at end of file diff --git a/lib/math/frac_tiling.h b/lib/math/frac_tiling.h index b8260b3e29bb0019089e9249b04509726c00364f..c2a80d948657737cd754ad25b91f49f7b05a46cc 100644 --- a/lib/math/frac_tiling.h +++ b/lib/math/frac_tiling.h @@ -37,9 +37,9 @@ void GetFracMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt,
uint32_t &extraBuf); +void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_FRAC_TILING_H \ No newline at end of file diff --git a/lib/math/lgamma_tiling.h b/lib/math/lgamma_tiling.h index 50370b11221bf30e0b4e3a32b3ef1a51b1753308..8327f84c8e521b94f12f279f4afad00f925b566f 100644 --- a/lib/math/lgamma_tiling.h +++ b/lib/math/lgamma_tiling.h @@ -22,11 +22,11 @@ namespace AscendC { /*! * \brief The calculation of the Lgamma interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetLgammaTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/log_tiling.h b/lib/math/log_tiling.h index 99df3308a9cc8c734cafa288cb44cc66c37d5686..0a1175ac896539203b9a6b4918b32f55c9f4edd2 100644 --- a/lib/math/log_tiling.h +++ b/lib/math/log_tiling.h @@ -34,14 +34,14 @@ void GetLogMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Log interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); /*! * \brief This interface is used to obtain the maximum and minimum temporary space reserved or applied. @@ -59,14 +59,14 @@ void GetLog10MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Log10 interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. 
+ * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); /*! * \brief This interface is used to obtain the maximum and minimum temporary space reserved or applied. @@ -84,13 +84,13 @@ void GetLog2MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Log2 interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_LOG_TILING \ No newline at end of file diff --git a/lib/math/power_tiling.h b/lib/math/power_tiling.h index 9057c7e7f24e0024b0f0543a561e43b162b2d5eb..8ded75656a68f816a66e01a01b929347feda99f9 100644 --- a/lib/math/power_tiling.h +++ b/lib/math/power_tiling.h @@ -38,8 +38,8 @@ void GetPowerMaxMinTmpSize(const ge::Shape& srcShape1, const ge::Shape& srcShape * @ingroup GetPowerTmpBufferFactorSize for V220 * @brief The calculation of the Round interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* @param [in] baseIsTensor : whether base input is a Tensor * @param [in] expIsTensor : whether exponent input is a Tensor * @param [in] typeIsInt : whether the src type is int diff --git a/lib/math/round.h b/lib/math/round.h index d16c1acd673415c532e63e6ce61408aea8b796e0..9634af5870a95f5d0363dd1f18cb5f89a0e5f7a3 100644 --- a/lib/math/round.h +++ b/lib/math/round.h @@ -39,7 +39,7 @@ __aicore__ inline void Round(const LocalTensor &dstTensor, const LocalTensor< if ASCEND_IS_AIC { return; } - RoundImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + RoundImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -56,7 +56,7 @@ __aicore__ inline void Round(const LocalTensor &dstTensor, const LocalTensor< if ASCEND_IS_AIC { return; } - RoundImpl(dstTensor, srcTensor, calCount); + RoundImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/round_tiling.h b/lib/math/round_tiling.h index 94b8a45d724b3e08084792cffaf7da1649275a73..e22c7a511af4c27b20b5f24fdff8d0b93579852d 100644 --- a/lib/math/round_tiling.h +++ b/lib/math/round_tiling.h @@ -35,15 +35,15 @@ void GetRoundMaxMinTmpSize(const platform_ascendc::PlatformAscendC &ascendcPlatf /*! * \brief The calculation of the Round interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] ascendcPlatform, platform Information * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetRoundTmpBufferFactorSize(const platform_ascendc::PlatformAscendC &ascendcPlatform, const uint32_t typeSize, - uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); + uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_ROUND_TILING_INTF \ No newline at end of file diff --git a/lib/math/sign_tiling.h b/lib/math/sign_tiling.h index 08d130503de326adc92e9aa389e1eb94eec66fb3..19da68254d466f2b860ea7ed7d806acf52501709 100644 --- a/lib/math/sign_tiling.h +++ b/lib/math/sign_tiling.h @@ -22,11 +22,11 @@ namespace AscendC { /*! * \brief The calculation of the Sign interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetSignTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/sin.h b/lib/math/sin.h index 1b09b150a6fca8c7a8ba33d391fdb16e7db1686e..97378cf21d3f0ab9f43100a0df19d68a0a64c36b 100644 --- a/lib/math/sin.h +++ b/lib/math/sin.h @@ -60,7 +60,7 @@ template __aicore__ inline void Sin(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -100,7 +100,7 @@ template __aicore__ inline void Sin(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - SinImpl(dstTensor, srcTensor, calCount); + SinImpl(dstTensor, srcTensor, calCount); } /*! diff --git a/lib/math/sin_tiling.h b/lib/math/sin_tiling.h index 211b2373c9d9896bdbc534284d2d323fceca93ef..329ddd0b2b19cea071f5b1faf36022b07b109c6d 100644 --- a/lib/math/sin_tiling.h +++ b/lib/math/sin_tiling.h @@ -34,13 +34,13 @@ void GetSinMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Sin interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_SIN_TILING_H \ No newline at end of file diff --git a/lib/math/sinh_tiling.h b/lib/math/sinh_tiling.h index 2f23d7e4ae3b5959eb02dec761f84bb93894d96c..0b02095517bbd7cfdb6236d2835ab5fe500c3f0e 100644 --- a/lib/math/sinh_tiling.h +++ b/lib/math/sinh_tiling.h @@ -34,13 +34,13 @@ void GetSinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Sinh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_SINH_TILING_H \ No newline at end of file diff --git a/lib/math/tan.h b/lib/math/tan.h index 7fa5075398f7e47acbae17935175b357dfa650a2..8250adfc13d7aaf4fd3bb84c3a23fae7d6a3cbf1 100644 --- a/lib/math/tan.h +++ b/lib/math/tan.h @@ -93,7 +93,7 @@ template __aicore__ inline void Tan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -111,7 +111,7 @@ __aicore__ inline void Tan(const LocalTensor& dstTensor, const LocalTensor template __aicore__ inline void Tan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - TanImpl(dstTensor, srcTensor, calCount); + TanImpl(dstTensor, srcTensor, calCount); } /*! diff --git a/lib/math/tan_tiling.h b/lib/math/tan_tiling.h index 68f0c4c3b319c950ea5739e68ea78b51c961047a..f6506a5263f1f6cef9281bc0dc1719d476f2432e 100644 --- a/lib/math/tan_tiling.h +++ b/lib/math/tan_tiling.h @@ -35,13 +35,13 @@ void GetTanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Tan interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_TAN_TILING_H \ No newline at end of file diff --git a/lib/math/tanh.h b/lib/math/tanh.h index 06fb09d96143920a27f7c86030f9aa4c6cb4490c..ecea5f974ff63b483619ef457eb828e4532a4d36 100644 --- a/lib/math/tanh.h +++ b/lib/math/tanh.h @@ -65,7 +65,7 @@ template __aicore__ inline void Tanh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! 
@@ -99,7 +99,7 @@ template __aicore__ inline void Tanh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - TanhImpl(dstTensor, srcTensor, calCount); + TanhImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/tanh_tiling.h b/lib/math/tanh_tiling.h index 4d13550d0cfdfa4cbfcf7ac124ce2e785753b3a2..3930d84ac96b07badff3069b9d6acbea20017fa2 100644 --- a/lib/math/tanh_tiling.h +++ b/lib/math/tanh_tiling.h @@ -35,13 +35,13 @@ void GetTanhMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Tanh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf); +void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf); } // namespace AscendC #endif // LIB_MATH_TANH_TILING_H \ No newline at end of file diff --git a/lib/math/trunc_tiling.h b/lib/math/trunc_tiling.h index 1505220a238947b33c721a6a96079a5d632dfab9..6fa9185449c1bf4cd5f31af761000cf1e7c0c850 100644 --- a/lib/math/trunc_tiling.h +++ b/lib/math/trunc_tiling.h @@ -38,9 +38,9 @@ void GetTruncMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_TRUNC_TILING_H \ No newline at end of file diff --git a/lib/math/xor.h b/lib/math/xor.h index 8f3f4a2b2ecee48bb3aa8e0baeaf2c73367d610f..096f1829ce89dce039e4b6bda6b7d8d4cc95184a 100644 --- a/lib/math/xor.h +++ b/lib/math/xor.h @@ -48,7 +48,7 @@ __aicore__ inline void Xor(const LocalTensor& dstTensor, const LocalTensor static_assert((std::is_same::value || std::is_same::value), "Failed to check the data types, current api support data types are int16_t/uint16_t."); - XorImpl(dstTensor, src0Tensor, src1Tensor, sharedTmpBuffer, calCount); + XorImpl(dstTensor, src0Tensor, src1Tensor, sharedTmpBuffer, calCount); } /* * @brief Xor Computes the element-wise logical XOR of the given input tensors.
Zeros are treated as False and nonzeros diff --git a/lib/math/xor_tiling.h b/lib/math/xor_tiling.h index 374e830f37affdc808ab1a5b22fdfc4a869724a3..8c2af82517a2977c692353af0b26d2d7772aede7 100644 --- a/lib/math/xor_tiling.h +++ b/lib/math/xor_tiling.h @@ -22,11 +22,11 @@ namespace AscendC { /*! * \brief The calculation of the Xor interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetXorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/reduce/mean.h b/lib/reduce/mean.h index dbc87a136297fc3fc81daa06c611394f92670bd8..838d152205d16a3e4ea07a0f02b9cb19335f2f73 100644 --- a/lib/reduce/mean.h +++ b/lib/reduce/mean.h @@ -87,7 +87,7 @@ __aicore__ inline void Mean( LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - Mean(dstTensor, srcTensor, sharedTmpBuffer, meanParams); + Mean(dstTensor, srcTensor, sharedTmpBuffer, meanParams); } #pragma end_pipe } // namespace AscendC diff --git a/lib/reduce/mean_tiling.h b/lib/reduce/mean_tiling.h index 8133d1e92c2f12af306251144a6c810b17eacb61..b435338111e19dc13d823c8736a0be33822071c5 100644 --- a/lib/reduce/mean_tiling.h +++ b/lib/reduce/mean_tiling.h @@ -33,11 +33,11 @@ void GetMeanMaxMinTmpSize(const uint32_t n, const uint32_t srcTypeSize, const ui /*! * \brief The calculation of the Mean interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetMeanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer);
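Editor's note: the tiling headers touched above all document the same relationship, maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuffer, and the per-iteration bound iterationSize = (remainFreeSpace - extraBuffer) / maxLiveNodeCount / typeSize. The following host-side sketch shows how those two formulas might be evaluated after calling one of the renamed factor-size queries; it is an illustration only, not part of this patch, and the helper names, include path, and the availability of cosh_tiling.h in the include search path are assumptions.

```cpp
#include <cstdint>
#include "lib/math/cosh_tiling.h"  // declares AscendC::GetCoshTmpBufferFactorSize (path assumed)

// maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuffer
uint32_t ComputeCoshMaxTmpBuffer(uint32_t inputSize, uint32_t typeSize)
{
    uint32_t maxLiveNodeCount = 0;
    uint32_t extraBuffer = 0;
    AscendC::GetCoshTmpBufferFactorSize(typeSize, maxLiveNodeCount, extraBuffer);
    return maxLiveNodeCount * inputSize * typeSize + extraBuffer;
}

// iterationSize = (remainFreeSpace - extraBuffer) / maxLiveNodeCount / typeSize,
// i.e. the largest element count that still fits in the remaining free buffer space.
uint32_t ComputeIterationSize(uint32_t remainFreeSpace, uint32_t typeSize,
                              uint32_t maxLiveNodeCount, uint32_t extraBuffer)
{
    return (remainFreeSpace - extraBuffer) / maxLiveNodeCount / typeSize;
}
```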
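For the Round (and Power) variants, the factor-size query shown above additionally takes platform information. A minimal host-side sketch follows, again as an illustration rather than part of the patch: obtaining ascendcPlatform from the tiling context, the include path, and the AscendC namespace placement are assumptions.

```cpp
#include <cstdint>
#include "lib/math/round_tiling.h"  // declares GetRoundTmpBufferFactorSize (path assumed)

uint32_t ComputeRoundMaxTmpBuffer(const platform_ascendc::PlatformAscendC &ascendcPlatform,
                                  uint32_t inputSize, uint32_t typeSize)
{
    uint32_t maxLiveNodeCount = 0;
    uint32_t extraBuf = 0;
    AscendC::GetRoundTmpBufferFactorSize(ascendcPlatform, typeSize, maxLiveNodeCount, extraBuf);
    // Same sizing relationship as the other math APIs in this patch.
    return maxLiveNodeCount * inputSize * typeSize + extraBuf;
}
```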
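On the kernel side, each math API in this patch keeps two overloads: one that takes an explicit sharedTmpBuffer and one that obtains working space internally. The sketch below uses Cosh as a representative; it is illustrative only. The diff rendering above elides the template arguments, so the LocalTensor<half> operands, the LocalTensor<uint8_t> temporary buffer type, and the omitted TPipe/queue setup are all assumptions.

```cpp
__aicore__ inline void CoshCompute(const AscendC::LocalTensor<half> &dstLocal,
                                   const AscendC::LocalTensor<half> &srcLocal,
                                   const AscendC::LocalTensor<uint8_t> &tmpLocal,
                                   uint32_t calCount)
{
    // Overload with an explicit temporary buffer: the caller sizes tmpLocal using the
    // GetCoshMaxMinTmpSize / GetCoshTmpBufferFactorSize queries from cosh_tiling.h.
    AscendC::Cosh(dstLocal, srcLocal, tmpLocal, calCount);

    // Overload without a temporary buffer: the library acquires stack space internally,
    // in the same spirit as the PopStackBuffer fallback visible in lib/reduce/mean.h.
    AscendC::Cosh(dstLocal, srcLocal, calCount);
}
```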
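Finally, the xor.h hunk above keeps a static_assert restricting the element type to int16_t/uint16_t. The short sketch below shows a call that satisfies that constraint; it is not part of the patch, and the int16_t tensors, the uint8_t temporary buffer, and the omitted buffer setup are assumptions.

```cpp
__aicore__ inline void XorCompute(const AscendC::LocalTensor<int16_t> &dstTensor,
                                  const AscendC::LocalTensor<int16_t> &src0Tensor,
                                  const AscendC::LocalTensor<int16_t> &src1Tensor,
                                  const AscendC::LocalTensor<uint8_t> &sharedTmpBuffer,
                                  uint32_t calCount)
{
    // A float or half element type would trip the static_assert at compile time.
    AscendC::Xor(dstTensor, src0Tensor, src1Tensor, sharedTmpBuffer, calCount);
}
```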