From 16263da7f7c0f69d4b0f04166d48a9771c7a7b5c Mon Sep 17 00:00:00 2001 From: shinoda Date: Sun, 11 May 2025 11:33:33 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20Cos=E3=80=81Dropout?= =?UTF-8?q?=E3=80=81Sum=E3=80=81ConfusionTranspose=20=E7=AB=AF=E5=88=B0?= =?UTF-8?q?=E7=AB=AF=E6=A0=B7=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/filter/dropout/CMakeLists.txt | 82 +++++++++ examples/filter/dropout/README.md | 93 ++++++++++ examples/filter/dropout/cmake/cpu_lib.cmake | 26 +++ examples/filter/dropout/cmake/npu_lib.cmake | 12 ++ examples/filter/dropout/main.cpp | 167 ++++++++++++++++++ .../dropout/op_host/dropout_custom_tiling.cpp | 34 ++++ .../dropout/op_host/dropout_custom_tiling.h | 24 +++ .../dropout/op_kernel/dropout_custom.cpp | 22 +++ .../dropout/op_kernel/dropout_custom_impl.h | 100 +++++++++++ examples/filter/dropout/run.sh | 58 ++++++ examples/filter/dropout/scripts/gen_data.py | 33 ++++ examples/math/cos/CMakeLists.txt | 82 +++++++++ examples/math/cos/README.md | 90 ++++++++++ examples/math/cos/cmake/cpu_lib.cmake | 26 +++ examples/math/cos/cmake/npu_lib.cmake | 12 ++ examples/math/cos/main.cpp | 153 ++++++++++++++++ .../math/cos/op_host/cos_custom_tiling.cpp | 18 ++ examples/math/cos/op_host/cos_custom_tiling.h | 21 +++ examples/math/cos/op_kernel/cos_custom.cpp | 21 +++ examples/math/cos/op_kernel/cos_custom_impl.h | 75 ++++++++ examples/math/cos/run.sh | 58 ++++++ examples/math/cos/scripts/gen_data.py | 31 ++++ examples/readme.md | 20 +++ examples/reduce/sum/CMakeLists.txt | 82 +++++++++ examples/reduce/sum/README.md | 90 ++++++++++ examples/reduce/sum/cmake/cpu_lib.cmake | 26 +++ examples/reduce/sum/cmake/npu_lib.cmake | 12 ++ examples/reduce/sum/main.cpp | 160 +++++++++++++++++ .../reduce/sum/op_host/sum_custom_tiling.cpp | 32 ++++ .../reduce/sum/op_host/sum_custom_tiling.h | 24 +++ examples/reduce/sum/op_kernel/sum_custom.cpp | 21 +++ .../reduce/sum/op_kernel/sum_custom_impl.h | 85 +++++++++ examples/reduce/sum/run.sh | 58 ++++++ examples/reduce/sum/scripts/gen_data.py | 31 ++++ .../confusion_transpose/CMakeLists.txt | 82 +++++++++ .../transpose/confusion_transpose/README.md | 93 ++++++++++ .../confusion_transpose/cmake/cpu_lib.cmake | 26 +++ .../confusion_transpose/cmake/npu_lib.cmake | 12 ++ .../transpose/confusion_transpose/main.cpp | 146 +++++++++++++++ .../confusion_transpose_custom_tiling.cpp | 24 +++ .../confusion_transpose_custom_tiling.h | 45 +++++ .../op_kernel/confusion_transpose_custom.cpp | 32 ++++ .../confusion_transpose_custom_impl.h | 78 ++++++++ examples/transpose/confusion_transpose/run.sh | 61 +++++++ .../confusion_transpose/scripts/gen_data.py | 39 ++++ .../scripts/verify_data.py | 57 ++++++ 46 files changed, 2574 insertions(+) create mode 100644 examples/filter/dropout/CMakeLists.txt create mode 100644 examples/filter/dropout/README.md create mode 100644 examples/filter/dropout/cmake/cpu_lib.cmake create mode 100644 examples/filter/dropout/cmake/npu_lib.cmake create mode 100644 examples/filter/dropout/main.cpp create mode 100644 examples/filter/dropout/op_host/dropout_custom_tiling.cpp create mode 100644 examples/filter/dropout/op_host/dropout_custom_tiling.h create mode 100644 examples/filter/dropout/op_kernel/dropout_custom.cpp create mode 100644 examples/filter/dropout/op_kernel/dropout_custom_impl.h create mode 100644 examples/filter/dropout/run.sh create mode 100644 examples/filter/dropout/scripts/gen_data.py create mode 100644 examples/math/cos/CMakeLists.txt create mode 
100644 examples/math/cos/README.md create mode 100644 examples/math/cos/cmake/cpu_lib.cmake create mode 100644 examples/math/cos/cmake/npu_lib.cmake create mode 100644 examples/math/cos/main.cpp create mode 100644 examples/math/cos/op_host/cos_custom_tiling.cpp create mode 100644 examples/math/cos/op_host/cos_custom_tiling.h create mode 100644 examples/math/cos/op_kernel/cos_custom.cpp create mode 100644 examples/math/cos/op_kernel/cos_custom_impl.h create mode 100644 examples/math/cos/run.sh create mode 100644 examples/math/cos/scripts/gen_data.py create mode 100644 examples/reduce/sum/CMakeLists.txt create mode 100644 examples/reduce/sum/README.md create mode 100644 examples/reduce/sum/cmake/cpu_lib.cmake create mode 100644 examples/reduce/sum/cmake/npu_lib.cmake create mode 100644 examples/reduce/sum/main.cpp create mode 100644 examples/reduce/sum/op_host/sum_custom_tiling.cpp create mode 100644 examples/reduce/sum/op_host/sum_custom_tiling.h create mode 100644 examples/reduce/sum/op_kernel/sum_custom.cpp create mode 100644 examples/reduce/sum/op_kernel/sum_custom_impl.h create mode 100644 examples/reduce/sum/run.sh create mode 100644 examples/reduce/sum/scripts/gen_data.py create mode 100644 examples/transpose/confusion_transpose/CMakeLists.txt create mode 100644 examples/transpose/confusion_transpose/README.md create mode 100644 examples/transpose/confusion_transpose/cmake/cpu_lib.cmake create mode 100644 examples/transpose/confusion_transpose/cmake/npu_lib.cmake create mode 100644 examples/transpose/confusion_transpose/main.cpp create mode 100644 examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp create mode 100644 examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h create mode 100644 examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp create mode 100644 examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h create mode 100644 examples/transpose/confusion_transpose/run.sh create mode 100644 examples/transpose/confusion_transpose/scripts/gen_data.py create mode 100644 examples/transpose/confusion_transpose/scripts/verify_data.py diff --git a/examples/filter/dropout/CMakeLists.txt b/examples/filter/dropout/CMakeLists.txt new file mode 100644 index 00000000..00cc3f66 --- /dev/null +++ b/examples/filter/dropout/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/dropout_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(dropout_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/dropout_custom_tiling.cpp +) + +target_compile_options(dropout_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(dropout_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(dropout_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(dropout_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS dropout_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/README.md b/examples/filter/dropout/README.md new file mode 100644 index 00000000..acf91d50 --- /dev/null +++ b/examples/filter/dropout/README.md @@ -0,0 +1,93 @@ + + +## 概述 + +本样例介绍了调用 Dropout 高阶 API 实现 dropout 算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于 Dropout 高阶 API 的具体内容请参考 [Ascend C 算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi) 中的 " 高阶 API > 数据过滤 > Dropout " 章节。 + +- 直调:使用核函数直调 dropout 自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A3 训练系列产品/Atlas A3 推理系列产品 +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +dropout 算子,根据MaskTensor对SrcTensor(源操作数,输入Tensor)进行过滤得到DstTensor(目的操作数、输出Tensor)。仅支持输入shape为ND格式。该过滤功能包括两种模式,字节模式和比特模式,本样例支持字节模式。 + +- 算子规格: + + + + + + + + + + + +
| 算子类型(OpType) | DropoutCustom | | | |
| ---------------- | -------------- | ------ | --------- | ------ |
| 算子输入 | name | shape | data type | format |
| | x | 8*2048 | float | ND |
| | mask | 8*2048 | uint8_t | ND |
| 算子输出 | y | 8*2048 | float | ND |
| 核函数名 | dropout_custom | | | |
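下面给出一个按字节模式语义逐元素计算的参考示意(非本样例源码;函数名 `DropoutGolden` 与参数 `keepProb` 为示意用的假设命名,保留概率按本样例取 0.8),用于说明上表中 mask 对 x 的过滤方式:

```cpp
// 示意代码:字节模式 dropout 的逐元素参考计算(假设保留概率 keepProb = 0.8)
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> DropoutGolden(const std::vector<float> &x,
                                 const std::vector<uint8_t> &mask,
                                 float keepProb = 0.8f)
{
    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        // mask 非零时保留该元素并按 1/keepProb 缩放,否则置 0
        y[i] = (mask[i] != 0) ? (x[i] / keepProb) : 0.0f;
    }
    return y;
}
```

scripts/gen_data.py 生成真值时采用的是同样的计算方式(np.where(mask, x / 0.8, 0))。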
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8][2048],输出y[8][2048]的dropout算子。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Dropout高阶API接口完成dropout计算,得到最终结果,再搬出到外部存储上。 + + dropout算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行dropout计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + dropout算子的tiling实现流程如下:根据输入数据 x 及 mask 的总维度和所用核数量确定所需tiling参数,并调用GetDropOutMaxMinTmpSize接口获取tmpBuf的大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A3 训练系列产品/Atlas A3 推理系列产品 + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/filter/dropout/cmake/cpu_lib.cmake b/examples/filter/dropout/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/filter/dropout/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/cmake/npu_lib.cmake b/examples/filter/dropout/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/filter/dropout/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/filter/dropout/main.cpp b/examples/filter/dropout/main.cpp new file mode 100644 index 00000000..202821ea --- /dev/null +++ b/examples/filter/dropout/main.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/dropout_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_dropout_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t FIRST_AXIS = 8; +constexpr uint32_t SRC_LAST_AXIS = 2048; + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = FIRST_AXIS * SRC_LAST_AXIS * sizeof(uint32_t); + size_t maskSize = FIRST_AXIS * SRC_LAST_AXIS; + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *mask = (uint8_t *)AscendC::GmAlloc(maskSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + ReadFile("../input/input_mask.bin", maskSize, mask, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(dropout_custom, USED_CORE_NUM, x, y, mask, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)mask); + AscendC::GmFree((void 
*)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *maskHost, *tiling; + uint8_t *xDevice, *yDevice, *maskDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&maskHost), maskSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&maskDevice, maskSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + ReadFile("../input/input_mask.bin", maskSize, maskHost, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(maskDevice, maskSize, maskHost, maskSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(dropout_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, maskDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(maskDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(maskHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.cpp b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp new file mode 100644 index 00000000..1f306703 --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "dropout_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum) { + std::vector srcDims = {firstAxis, srcLastAxis, maskLastAxis}; + + uint32_t typeSize = 4; + ge::Shape shape(srcDims); + uint32_t minValue{}; + uint32_t maxValue{}; + + AscendC::GetDropOutMaxMinTmpSize(shape, typeSize, false, maxValue, minValue); + + DropoutCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->firstAxis = firstAxis; + tiling->srcLastAxis = srcLastAxis; + tiling->maskLastAxis = maskLastAxis; + + tiling->tmpBufferSize = minValue; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.h b/examples/filter/dropout/op_host/dropout_custom_tiling.h new file mode 100644 index 00000000..ddf7470c --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H + +#include + +struct DropoutCustomTilingData { + uint32_t firstAxis; + uint32_t srcLastAxis; + uint32_t maskLastAxis; + uint32_t tmpBufferSize; + uint32_t tileNum; +}; + +#endif // EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom.cpp b/examples/filter/dropout/op_kernel/dropout_custom.cpp new file mode 100644 index 00000000..7e9e3387 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "dropout_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + KernelDropout op; + op.Init(srcGm, dstGm, maskGm, tiling); + op.Process(); +} \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom_impl.h b/examples/filter/dropout/op_kernel/dropout_custom_impl.h new file mode 100644 index 00000000..d51237b9 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom_impl.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_H + +#include "../op_host/dropout_custom_tiling.h" +#include "kernel_operator.h" + +constexpr int32_t BUFFER_NUM = 2; + +template class KernelDropout { +public: + __aicore__ inline KernelDropout() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR m, DropoutCustomTilingData tilingData) { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + srcSize = tilingData.firstAxis * tilingData.srcLastAxis; + tileNum = tilingData.tileNum; + blockLength = srcSize / AscendC::GetBlockNum(); + tileLength = this->blockLength / tileNum / BUFFER_NUM; + + info.firstAxis = tilingData.firstAxis / tilingData.tileNum; + info.srcLastAxis = tileLength; + info.maskLastAxis = tileLength; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + maskGm.SetGlobalBuffer((__gm__ uint8_t *)m + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe.InitBuffer(inQueueMask, BUFFER_NUM, this->tileLength * sizeof(uint8_t)); + pipe.InitBuffer(outQueue, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe.InitBuffer(tmpBuf, tilingData.tmpBufferSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor maskLocal = inQueueMask.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(maskLocal, maskGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueMask.EnQue(maskLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor maskLocal = inQueueMask.DeQue(); + + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + AscendC::DropOut(yLocal, xLocal, maskLocal, sharedTmpBuffer, probValue, info); + + outQueue.EnQue(yLocal); + + inQueueX.FreeTensor(xLocal); + inQueueMask.FreeTensor(maskLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor maskGm; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueMask; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + + uint32_t srcSize = 0; + uint32_t tileNum = 0; + 
uint32_t blockLength = 0; + uint32_t tileLength = 0; + float probValue = 0.8; + AscendC::DropOutShapeInfo info; +}; + +#endif \ No newline at end of file diff --git a/examples/filter/dropout/run.sh b/examples/filter/dropout/run.sh new file mode 100644 index 00000000..76ad17bb --- /dev/null +++ b/examples/filter/dropout/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./dropout_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/filter/dropout/scripts/gen_data.py b/examples/filter/dropout/scripts/gen_data.py new file mode 100644 index 00000000..3413169d --- /dev/null +++ b/examples/filter/dropout/scripts/gen_data.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + + input_x = np.random.uniform(1, 100, input_shape).astype(dtype) + input_mask = np.random.randint(0, 2, size=input_shape, dtype=bool) + golden = np.where(input_mask, input_x / 0.8, np.zeros_like(input_x)) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + input_mask.tofile("./input/input_mask.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/math/cos/CMakeLists.txt b/examples/math/cos/CMakeLists.txt new file mode 100644 index 00000000..188a5881 --- /dev/null +++ b/examples/math/cos/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/cos_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(cos_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/cos_custom_tiling.cpp +) + +target_compile_options(cos_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(cos_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(cos_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(cos_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS cos_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME 
DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/math/cos/README.md b/examples/math/cos/README.md new file mode 100644 index 00000000..25894c25 --- /dev/null +++ b/examples/math/cos/README.md @@ -0,0 +1,90 @@ + + +## 概述 + +本样例介绍了调用 Cos 高阶 API 实现 cos 算子,并按照核函数直调的方式分别给出了对应的端到端实现。 + +- 直调:使用核函数直调cos自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +cos单算子,对输入tensor按元素做三角函数余弦运算。 + +- 算子规格: + + + + + + + + + + +
| 算子类型(OpType) | CosCustom | | | |
| ---------------- | ---------- | ------ | --------- | ------ |
| 算子输入 | name | shape | data type | format |
| | x | 8*2048 | float | ND |
| 算子输出 | y | 8*2048 | float | ND |
| 核函数名 | cos_custom | | | |
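下面给出一个逐元素余弦的参考示意(非本样例源码;函数名 `CosGolden` 为示意用的假设命名),用于说明上表从输入到输出的计算语义:

```cpp
// 示意代码:逐元素余弦的参考计算,与 scripts/gen_data.py 中用 np.cos 生成真值的语义一致
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> CosGolden(const std::vector<float> &x)
{
    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = std::cos(x[i]);
    }
    return y;
}
```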
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8][2048],输出y[8][2048]的cos算子。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Cos高阶API接口完成cos计算,得到最终结果,再搬出到外部存储上。 + + cos算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行cos计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + cos算子的tiling实现流程如下:根据输入长度totalLength和所用核数量coreNum确定所需tiling参数。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/math/cos/cmake/cpu_lib.cmake b/examples/math/cos/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/math/cos/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/math/cos/cmake/npu_lib.cmake b/examples/math/cos/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/math/cos/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/math/cos/main.cpp b/examples/math/cos/main.cpp new file mode 100644 index 00000000..4981c668 --- /dev/null +++ b/examples/math/cos/main.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "op_host/cos_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_cos_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 2; +constexpr uint32_t TOTAL_LENGTH = 8 * 2048; + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(uint32_t); + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(cos_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *tiling; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, 
outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(cos_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/math/cos/op_host/cos_custom_tiling.cpp b/examples/math/cos/op_host/cos_custom_tiling.cpp new file mode 100644 index 00000000..352215a9 --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_tiling.h" +#include "tiling/tiling_api.h" +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum) { + CosCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->totalLength = totalLength; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/math/cos/op_host/cos_custom_tiling.h b/examples/math/cos/op_host/cos_custom_tiling.h new file mode 100644 index 00000000..1ba0775f --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATH_COS_CUSTOM_TILING_H +#define EXAMPLES_MATH_COS_CUSTOM_TILING_H + +#include + +struct CosCustomTilingData { + uint32_t totalLength; + uint32_t tileNum; +}; + +#endif // EXAMPLES_MATH_COS_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom.cpp b/examples/math/cos/op_kernel/cos_custom.cpp new file mode 100644 index 00000000..10a86883 --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + MyCustomKernel::kernelCos op; + op.Init(x, y, tiling); + op.Process(); +} \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom_impl.h b/examples/math/cos/op_kernel/cos_custom_impl.h new file mode 100644 index 00000000..b6caf3d4 --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom_impl.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATH_SORT_CUSTOM_H +#define EXAMPLES_MATH_SORT_CUSTOM_H +#include "../op_host/cos_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 2; + +template class kernelCos { +public: + __aicore__ inline kernelCos() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, CosCustomTilingData tilingData) { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + this->tileNum = tilingData.tileNum; + this->blockLength = tilingData.totalLength / AscendC::GetBlockNum(); + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe.InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(T)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::Cos(yLocal, xLocal, this->tileLength); + outQueueY.EnQue(yLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueueY.FreeTensor(yLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t tileNum = 0; + uint32_t blockLength = 0; + uint32_t tileLength = 0; +}; +} // namespace MyCustomKernel + +#endif \ No newline at end of file diff --git a/examples/math/cos/run.sh b/examples/math/cos/run.sh new file mode 100644 index 00000000..2f8c76a4 --- /dev/null +++ b/examples/math/cos/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./cos_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/math/cos/scripts/gen_data.py b/examples/math/cos/scripts/gen_data.py new file mode 100644 index 00000000..8c5886bf --- /dev/null +++ b/examples/math/cos/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + # generate value between [-65504, 65504] + input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + golden = np.cos(input_x).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/readme.md b/examples/readme.md index d1ef2e05..b982b70c 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -117,6 +117,26 @@ welford_finalize Welford算法的后处理,一种在线计算均值和方差的方法。 + + math + cos + 对输入tensor做Cos计算。 + + + filter + dropout + 提供根据MaskTensor对SrcTensor(源操作数,输入Tensor)进行过滤的功能,得到DstTensor(目的操作数、输出Tensor)。 + + + reduce + sum + 获取最后一个维度的元素总和。 + + + transpose + confusion_transpose + 对输入数据进行数据排布及Reshape操作。 + pad broadcast diff --git a/examples/reduce/sum/CMakeLists.txt b/examples/reduce/sum/CMakeLists.txt new file mode 100644 index 00000000..db2524fc --- /dev/null +++ b/examples/reduce/sum/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/sum_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(sum_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/sum_custom_tiling.cpp +) + +target_compile_options(sum_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(sum_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(sum_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(sum_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS sum_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/README.md b/examples/reduce/sum/README.md new file mode 100644 index 00000000..28be735a --- /dev/null +++ b/examples/reduce/sum/README.md @@ -0,0 +1,90 @@ + + +## 概述 + +本样例介绍了调用 Sum 高阶 API 实现 sum 算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于 Sum 高阶 API 的具体内容请参考 [Ascend C 算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi) 中的 " 高阶 API > 归约操作 > Sum " 章节。 + +- 直调:使用核函数直调 sum 自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +sum 算子,获取最后一个维度的元素总和。如果输入是向量,则在向量中对各元素相加;如果输入是矩阵,则沿最后一个维度对每行中元素求和。 + +- 算子规格: + + + + + + + + + + +
+| 算子类型(OpType) | SumCustom | | | |
+| ------------------ | ---------- | ------ | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 7*2023 | float | ND |
+| 算子输出 | y | 8*1 | float | ND |
+| 核函数名 | sum_custom | | | |
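+
+以下为一段简短的 NumPy 示意(仅作原理说明,非本样例的实际实现),展示输出 shape 为 8\*1 的由来:7 个 float32 的求和结果共 28 字节,按 32 字节向上对齐后输出补齐为 8 个元素,其中前 7 个为有效值,最后一个为 padding。
+
+```python
+import numpy as np
+
+# 与上表规格一致的输入:7 行、每行 2023 个 float32
+x = np.random.uniform(-100, 100, (7, 2023)).astype(np.float32)
+row_sum = np.sum(x, axis=1)                       # 有效结果,shape (7,)
+
+ALIGN = 32                                        # 输出按 32 字节向上对齐
+out_elems = (row_sum.nbytes + ALIGN - 1) // ALIGN * ALIGN // row_sum.itemsize
+y = np.zeros(out_elems, dtype=np.float32)         # out_elems == 8
+y[:row_sum.size] = row_sum                        # 第 8 个元素为 padding
+print(y.shape)                                    # (8,)
+```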
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[7][2023],输出y[8]的sum算子,其中y中的有效值数量为7,最后一个为padding。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Sum高阶API接口完成sum计算,得到最终结果,再搬出到外部存储上。 + + sum算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行sum计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + sum算子的tiling实现流程如下:根据输入数据 x 的内轴及内轴实际长度和外轴确定所需tiling参数例如输出内轴补齐后长度等,并调用GetSumMaxMinTmpSize接口获取tmpBuf的大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/reduce/sum/cmake/cpu_lib.cmake b/examples/reduce/sum/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/reduce/sum/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/cmake/npu_lib.cmake b/examples/reduce/sum/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/reduce/sum/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/reduce/sum/main.cpp b/examples/reduce/sum/main.cpp new file mode 100644 index 00000000..b1ecc6fc --- /dev/null +++ b/examples/reduce/sum/main.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/sum_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_sum_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t M = 7; // 输入数据的外轴长度 +constexpr uint32_t N = 2023; // 输入数据内轴的实际元素个数 + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + uint8_t *tiling = nullptr; + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) +#endif + GenerateTilingData(tiling, M, N); + + auto tilingData = reinterpret_cast(tiling); + + size_t inputSize = tilingData->outter * tilingData->inner * sizeof(uint32_t); + size_t outputSize = tilingData->out_inner * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(sum_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); 
+#else + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(sum_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp new file mode 100644 index 00000000..b0150f5a --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "sum_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N) { + uint32_t minValue{}; + uint32_t maxValue{}; + + AscendC::GetSumMaxMinTmpSize(N, 4, false, maxValue, minValue); + + SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->outter = M; + tiling->inner = (N * sizeof(uint32_t) + 32 - 1) / 32 * 32 / sizeof(uint32_t); + tiling->n = N; + tiling->tmpBufSize = minValue; + + // 输出如果未遵循框架对内存开辟的要求(开辟内存的大小满足31Byte对齐),即outter * + // sizeof(T)不是31Byte对齐时,需要向上进行32Byte对齐 + // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha001/API/ascendcopapi/atlasascendc_api_07_0826.html + tiling->out_inner = (M * sizeof(uint32_t) + 32 - 1) / 32 * 32 / sizeof(uint32_t); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.h b/examples/reduce/sum/op_host/sum_custom_tiling.h new file mode 100644 index 00000000..f952161a --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_REDUCE_SUM_COSTOM_TILING_H +#define EXAMPLES_REDUCE_SUM_COSTOM_TILING_H + +#include + +struct SumCustomTilingData { + uint32_t inner; // 输入内轴补齐后的长度 + uint32_t outter; // 输入外轴长度 + uint32_t n; // 输入内轴实际长度 + uint32_t tmpBufSize; // 临时空间大小 + uint32_t out_inner; // 输出内轴补齐后的长度 +}; + +#endif // EXAMPLES_REDUCE_SUM_COSTOM_TILING_H \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom.cpp b/examples/reduce/sum/op_kernel/sum_custom.cpp new file mode 100644 index 00000000..ad2999b2 --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "sum_custom_impl.h" + +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + KernelSum op; + op.Init(srcGm, dstGm, tiling); + op.Process(); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom_impl.h b/examples/reduce/sum/op_kernel/sum_custom_impl.h new file mode 100644 index 00000000..8ea2260a --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom_impl.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_REDUCE_SUM_COSTOM_H +#define EXAMPLES_REDUCE_SUM_COSTOM_H + +#include "../op_host/sum_custom_tiling.h" +#include "kernel_operator.h" + +template class KernelSum { +public: + __aicore__ inline KernelSum() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, SumCustomTilingData tilingData) { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + inner = tilingData.inner; + outter = tilingData.outter; + n = tilingData.n; + tmpBufSize = tilingData.tmpBufSize; + out_inner = tilingData.out_inner; + + params.inner = inner; + params.outter = outter; + params.n = n; + + xGm.SetGlobalBuffer((__gm__ T *)x); + yGm.SetGlobalBuffer((__gm__ T *)y); + pipe.InitBuffer(inQueue, 1, inner * outter * sizeof(T)); + pipe.InitBuffer(outQueue, 1, out_inner * sizeof(T)); + pipe.InitBuffer(tmpBuf, tmpBufSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor xLocal = inQueue.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, inner * outter); + inQueue.EnQue(xLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor xLocal = inQueue.DeQue(); + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + T scalar(0); + AscendC::Duplicate(yLocal, scalar, out_inner); + AscendC::Sum(yLocal, xLocal, sharedTmpBuffer, params); + + outQueue.EnQue(yLocal); + inQueue.FreeTensor(xLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm, yLocal, out_inner); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t inner = 0; + uint32_t outter = 0; + uint32_t n = 0; + uint32_t tmpBufSize = 0; + uint32_t out_inner = 0; + AscendC::SumParams params; +}; + +#endif \ No newline at end of file diff --git a/examples/reduce/sum/run.sh b/examples/reduce/sum/run.sh new file mode 100644 index 00000000..796c0fd7 --- /dev/null +++ b/examples/reduce/sum/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./sum_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/reduce/sum/scripts/gen_data.py b/examples/reduce/sum/scripts/gen_data.py new file mode 100644 index 00000000..ea012879 --- /dev/null +++ b/examples/reduce/sum/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [7, 2024] + input_x = np.random.uniform(-100, 100, input_shape).astype(dtype) + # 对 input_x 进行切片,排除每行最后一个元素(保留前 2023 个) + golden = np.sum(input_x[:, :-1], axis=1).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/transpose/confusion_transpose/CMakeLists.txt b/examples/transpose/confusion_transpose/CMakeLists.txt new file mode 100644 index 00000000..ef7fd4a9 --- /dev/null +++ b/examples/transpose/confusion_transpose/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/confusion_transpose_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(confusion_transpose_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/confusion_transpose_custom_tiling.cpp +) + +target_compile_options(confusion_transpose_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(confusion_transpose_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(confusion_transpose_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(confusion_transpose_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS confusion_transpose_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/README.md b/examples/transpose/confusion_transpose/README.md new file mode 100644 index 00000000..7f27f20a --- /dev/null +++ b/examples/transpose/confusion_transpose/README.md @@ -0,0 +1,93 @@ + + +## 概述 + +本样例介绍了调用 ConfusionTranspose 高阶 API 实现 confusionTranspose 算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于 ConfusionTranspose 高阶 API 的具体内容请参考 [Ascend C 算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi) 中的 " 高阶 API > 变形 > ConfusionTranspose " 章节。 + +- 直调:使用核函数直调 confusionTranspose 自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| 
[scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +confusionTranspose 算子,对输入数据进行数据排布及Reshape操作。 + +- 算子规格: + + + + + + + + + + +
+| 算子类型(OpType) | ConfusionTransposeCustom | | | |
+| ------------------ | -------------------------- | --------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 1*2*64*32 | half | ND |
+| 算子输出 | y | 1*64*2*32 | half | ND |
+| 核函数名 | confusion_transpose_custom | | | |
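+
+以下为一段简短的 NumPy 示意(仅作原理说明,与 scripts/gen_data.py 的数据构造方式一致),展示输入的 NZ 数据排布与输出 ND 结果之间的关系:
+
+```python
+import numpy as np
+
+B, N, S, hn_div = 1, 2, 64, 32
+# 原始 ND 数据,origin_shape 为 [B, N, S, H/N]
+origin = np.random.uniform(-1000.0, 1000.0, (B, N, S, hn_div)).astype(np.float16)
+
+# ND -> NZ:按 16x16 分形重排,得到 kernel 输入,shape 为 [B, N, H/N/16, S/16, 16, 16]
+nz_input = origin.reshape(B, N, S // 16, 16, hn_div // 16, 16).transpose(0, 1, 4, 2, 3, 5)
+
+# NZ2ND_0213:kernel 输出等价于对原始 ND 数据交换 1、2 轴,shape 为 [B, S, N, H/N]
+golden = origin.transpose(0, 2, 1, 3)
+print(nz_input.shape, golden.shape)               # (1, 2, 2, 4, 16, 16) (1, 64, 2, 32)
+```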
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[1][2][64][32],输出y[1][64][2][32]的confusionTranspose算子,针对NZ2ND场景,实现1、2轴互换。 + +输入Tensor { shape:[B, N, H/N/16, S/16, 16, 16], origin_shape:[B, N, S, H/N], format:"NZ", origin_format:"ND"} +输出Tensor { shape:[B, S, N, H/N], origin_shape:[B, S, N, H/N], format:"ND", origin_format:"ND"} + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用ConfusionTranspose高阶API接口完成confusionTranspose计算,得到最终结果,再搬出到外部存储上。 + + ConfusionTranspose算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行confusionTranspose计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + confusionTranspose算子的tiling实现流程如下:使用Ascend C提供 API GetConfusionTransposeTilingInfo,获取所需的Tiling参数,并调用GetConfusionTransposeMaxMinTmpSize接口获取tmpBuf的大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/npu_lib.cmake b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} 
STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/main.cpp b/examples/transpose/confusion_transpose/main.cpp new file mode 100644 index 00000000..3fcf6b75 --- /dev/null +++ b/examples/transpose/confusion_transpose/main.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_confusion_transpose_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 22; +constexpr uint32_t B = 1; +constexpr uint32_t N = 2; +constexpr uint32_t S = 64; +constexpr uint32_t hnDiv = 32; + +extern uint8_t *GenerateTiling(uint32_t B, uint32_t N, uint32_t S, uint32_t HNDiv); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t inputSize = B * S * N * hnDiv * sizeof(uint16_t); + size_t outputSize = B * S * N * hnDiv * sizeof(uint16_t); + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, hnDiv); + + memcpy_s(tiling, tilingSize, buf, tilingSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(confusion_transpose_custom, USED_CORE_NUM, x, y, tiling); + + WriteFile("../output/output.bin", y, outputSize); + + free(buf); + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void 
*)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice, *tilingDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, hnDiv); + + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, buf, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(confusion_transpose_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, tilingDevice); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Clean up memory + free(buf); + + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp new file mode 100644 index 00000000..64cd8736 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp @@ -0,0 +1,24 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "confusion_transpose_custom_tiling.h" + +uint8_t *GetTilingBuf(optiling::ConfusionTransposeCustomTilingData *tilingData) { + uint32_t tilingSize = sizeof(optiling::ConfusionTransposeCustomTilingData); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(uint32_t B, uint32_t N, uint32_t S, uint32_t HNDiv) { + optiling::ConfusionTransposeCustomTilingData tiling; + ComputeTiling(B, N, S, HNDiv, tiling); + return GetTilingBuf(&tiling); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h new file mode 100644 index 00000000..396c7dbc --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/platform/platform_ascendc.h" +#include "tiling/tiling_api.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(ConfusionTransposeCustomTilingData) + TILING_DATA_FIELD_DEF(uint32_t, B); + TILING_DATA_FIELD_DEF(uint32_t, N); + TILING_DATA_FIELD_DEF(uint32_t, S); + TILING_DATA_FIELD_DEF(uint32_t, HNDiv); + TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(ConfusionTransposeCustom, ConfusionTransposeCustomTilingData) +} // namespace optiling + +void ComputeTiling(uint32_t B, uint32_t N, uint32_t S, uint32_t hnDiv, + optiling::ConfusionTransposeCustomTilingData &tiling) { + tiling.set_B(B); + tiling.set_N(N); + tiling.set_S(S); + tiling.set_HNDiv(hnDiv); + + std::vector shapeVec = {B, N, S, hnDiv}; + ge::Shape srcShape(shapeVec); + + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, sizeof(uint16_t), 1, maxValue, minValue); + const uint32_t stackBufferSize = minValue; + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, sizeof(uint16_t), 1, tiling.confusionTransposeTilingData); +} + +#endif // EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp new file mode 100644 index 00000000..c5f01523 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp @@ -0,0 +1,32 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "confusion_transpose_custom_impl.h" +#include "kernel_operator.h" + +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling) { + if ASCEND_IS_AIC { + return; + } + MyCustomKernel::KernelConfusionTranspose op; + MyCustomKernel::VecTiling tilingData; + CopyTiling(&tilingData, tiling); + op.Init(x, y, tilingData); + op.Process(); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h new file mode 100644 index 00000000..dc7bdbd8 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_COSTOM_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_COSTOM_H + +#include "kernel_operator.h" + +namespace MyCustomKernel { +struct VecTiling { + uint32_t B; + uint32_t N; + uint32_t S; + uint32_t HNDiv; + ConfusionTransposeTiling confusionTransposeTilingData; +}; + +template class KernelConfusionTranspose { +public: + __aicore__ inline KernelConfusionTranspose() {} + __aicore__ inline void Init(__gm__ uint8_t *srcGm, __gm__ uint8_t *dstGm, const VecTiling &tilingData) { + this->B = tilingData.B; + this->N = tilingData.N; + this->S = tilingData.S; + this->hnDiv = tilingData.HNDiv; + + srcGlobal.SetGlobalBuffer((__gm__ T *)srcGm, B * N * S * hnDiv); + dstGlobal.SetGlobalBuffer((__gm__ T *)dstGm, B * N * S * hnDiv); + pipe.InitBuffer(inQueueSrcVecIn, 1, B * N * S * hnDiv * sizeof(T)); + pipe.InitBuffer(inQueueSrcVecOut, 1, B * N * S * hnDiv * sizeof(T)); + this->tiling = tilingData.confusionTransposeTilingData; + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.AllocTensor(); + AscendC::DataCopy(srcLocal, srcGlobal, B * N * S * hnDiv); + inQueueSrcVecIn.EnQue(srcLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.DeQue(); + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.AllocTensor(); + AscendC::ConfusionTranspose(dstLocal, srcLocal, AscendC::TransposeType::TRANSPOSE_NZ2ND_0213, this->tiling); + inQueueSrcVecOut.EnQue(dstLocal); + inQueueSrcVecIn.FreeTensor(srcLocal); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.DeQue(); + AscendC::DataCopy(dstGlobal, dstLocal, B * N * S * hnDiv); + inQueueSrcVecOut.FreeTensor(dstLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueSrcVecIn; + AscendC::TQue inQueueSrcVecOut; + AscendC::GlobalTensor srcGlobal; + AscendC::GlobalTensor dstGlobal; + uint32_t B = 0; + uint32_t N = 0; + uint32_t S = 0; + uint32_t hnDiv = 0; + ConfusionTransposeTiling tiling; +}; +} // namespace MyCustomKernel +#endif \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/run.sh b/examples/transpose/confusion_transpose/run.sh new file mode 100644 index 00000000..cddce28f --- /dev/null +++ b/examples/transpose/confusion_transpose/run.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./confusion_transpose_direct_kernel_op +fi + +cd .. +python3 scripts/verify_data.py output/output.bin output/golden.bin diff --git a/examples/transpose/confusion_transpose/scripts/gen_data.py b/examples/transpose/confusion_transpose/scripts/gen_data.py new file mode 100644 index 00000000..d323b2c6 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/gen_data.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float16 + + B, N, S, hnDiv = 1, 2, 64, 32 + + origin_shape = (B, N, S, hnDiv) + + origin_data = np.random.uniform(-1000.0, 1000.0, origin_shape).astype(dtype) + + reshape_shape = (B, N, S // 16, 16, hnDiv // 16, 16) + reshaped_data = origin_data.reshape(reshape_shape) + + transposed_data = np.transpose(reshaped_data, (0, 1, 4, 2, 3, 5)) + + golden = np.transpose(origin_data, (0, 2, 1, 3)) + + os.system("mkdir -p ./input") + transposed_data.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/scripts/verify_data.py b/examples/transpose/confusion_transpose/scripts/verify_data.py new file mode 100644 index 00000000..da072f53 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/verify_data.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import sys +import numpy as np + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float16).reshape(-1) + golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) \ No newline at end of file -- Gitee