diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/CMakeLists.txt b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e31108ae16cf7c722a9d758e17b90610ad940db7 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/CMakeLists.txt @@ -0,0 +1,49 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
+# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v3.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v4.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bf4099816e4b9d8a2a8ba52d67c30417ab12dd88 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/README.md @@ -0,0 +1,96 @@ +## 目录结构介绍 + +``` +├── KernelLaunch +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_tiling.h // tiling结构体 +│ ├── add_custom_v1.cpp // 算子kernel实现1:未优化前实现 +│ ├── add_custom_v2.cpp // 算子kernel实现2:基于实现1,实现double buffer +│ ├── add_custom_v3.cpp // 算子kernel实现3:优化double buffer实现,简化判断逻辑,并使用LocalMemAllocator简化代码 +│ ├── add_custom_v4.cpp // 算子kernel实现4:基于add_custom_v3,修改地址分配逻辑,消除bank冲突 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── 
data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` + +## 代码实现介绍 + +本样例中实现的是固定shape为72*4096的Add算子。 + +- kernel实现 + + Add算子的数学表达式为: + + ``` + z = x + y + ``` + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 + + 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),使用静态Tensor编程方法,进行add算子的编程。 + + 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),优化性能,使用double buffer进行流水排布。 + + 实现3:请参考[add_custom_v3.cpp](./add_custom_v3.cpp),优化add_custom_v2中反向同步,替换为MTE2等待MTE3执行结束。减少分支判断的同时,算子性能因为double buffer的原因不受影响。另外使用LocalMemAllocator进行线性内存分配,Bank冲突不敏感场景可以使用这种方式简化分配。 + + 实现4:请参考[add_custom_v4.cpp](./add_custom_v4.cpp),基于add_custom_v3的实现,优化地址分配消除Bank冲突。 + +- 调用实现 + + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + +- 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch + ``` +- 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` +- 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu /sim / npu] + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/06 | 新增本readme | diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_tiling.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..278a6e336f07522ffd4c66ce1fb0640494607629 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_tiling.h @@ -0,0 +1,17 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +struct AddCustomTilingData { + uint32_t singleCoreLength; +}; +#endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..831ae4c3c6a9623281a6140810d8d3abb54974d8 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v1.cpp @@ -0,0 +1,88 @@ +/** + * @file add_custom_v1.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAddV1 { +public: + __aicore__ inline KernelAddV1() = default; + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; + } + __aicore__ inline void Process() + { + AscendC::LocalTensor xLocal(AscendC::TPosition::VECCALC, xAddr, TILE_LENGTH); + AscendC::LocalTensor yLocal(AscendC::TPosition::VECCALC, yAddr, TILE_LENGTH); + AscendC::LocalTensor zLocal(AscendC::TPosition::VECCALC, zAddr, TILE_LENGTH); + + // one buffer + for (uint32_t i = 0; i < loopCount; i++) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + if (i != 
(loopCount - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + } + } + +private: + static constexpr uint32_t xAddr = 0; + static constexpr uint32_t yAddr = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddr = TILE_LENGTH * sizeof(float) * 2; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + AscendC::InitSocState(); + KernelAddV1 op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v1<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c4525d553c2ceb5b37f3f2044df2418c1c5a26b --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v2.cpp @@ -0,0 +1,145 @@ +/** + * @file add_custom_v2.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAddV2 { +public: + __aicore__ inline KernelAddV2() = default; + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; + } + __aicore__ inline void Process() + { + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + + // double buffer + for (uint32_t i = 0; i < loopCount / 2; i++) { + // ping part + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::DataCopy(xLocalPing, xGm[2 * i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPing, yGm[2 * i * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::Add(zLocalPing, xLocalPing, yLocalPing, 
TILE_LENGTH); + if (i != (loopCount / 2 - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPing in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(zGm[2 * i * TILE_LENGTH], zLocalPing, TILE_LENGTH); + if (i != (loopCount / 2 - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + + // pong part + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID1); + } + AscendC::DataCopy(xLocalPong, xGm[(2 * i + 1) * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPong, yGm[(2 * i + 1) * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPong/yLocalPong in one single loop + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops + AscendC::WaitFlag(EVENT_ID1); + } + AscendC::Add(zLocalPong, xLocalPong, yLocalPong, TILE_LENGTH); + if (i != (loopCount / 2 - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops + AscendC::SetFlag(EVENT_ID1); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPong in one single loop + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + AscendC::DataCopy(zGm[(2 * i + 1) * TILE_LENGTH], zLocalPong, TILE_LENGTH); + if (i != (loopCount / 2 - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops + AscendC::SetFlag(EVENT_ID1); + } + } + + // tail block + if (loopCount % 2 != 0) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing with the previous for loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + 
AscendC::DataCopy(xLocalPing, xGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPing, yGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing with the previous for loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPing in one loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(zGm[(loopCount - 1) * TILE_LENGTH], zLocalPing, TILE_LENGTH); + } + } + +private: + static constexpr uint32_t xAddrPing = 0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; + static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + AscendC::InitSocState(); + KernelAddV2 op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v2<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v3.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..d424b54f1336823fbdbcb5f6629a4a37f5541b96 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v3.cpp @@ -0,0 +1,90 @@ +/** + * @file add_custom_v3.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAddV3 { +public: + __aicore__ inline KernelAddV3() = default; + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; + } + + __aicore__ inline void Process() + { + // use local memory allocator to simplify memor allocation + AscendC::LocalMemAllocator ubAllocator; + // ping + AscendC::LocalTensor xLocalPing = ubAllocator.Alloc(); + AscendC::LocalTensor yLocalPing = ubAllocator.Alloc(); + AscendC::LocalTensor zLocalPing = ubAllocator.Alloc(); + // pong + AscendC::LocalTensor xLocalPong = ubAllocator.Alloc(); + AscendC::LocalTensor yLocalPong = ubAllocator.Alloc(); + AscendC::LocalTensor zLocalPong = ubAllocator.Alloc(); + + // double buffer + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (uint32_t i = 0; i < loopCount; i++) { + int32_t eventID = (i % 2 == 0 ? EVENT_ID0 : EVENT_ID1); + AscendC::LocalTensor &xLocal = (i % 2 == 0 ? 
xLocalPing : xLocalPong); + AscendC::LocalTensor &yLocal = (i % 2 == 0 ? yLocalPing : yLocalPong); + AscendC::LocalTensor &zLocal = (i % 2 == 0 ? zLocalPing : zLocalPong); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + AscendC::WaitFlag(eventID); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by zLocal between 2 sequential loops + AscendC::SetFlag(eventID); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + } + +private: + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + AscendC::InitSocState(); + KernelAddV3 op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v3<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v4.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v4.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad45b3b44fb055c33a89398a6f49d82e3820281d --- /dev/null +++ 
b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/add_custom_v4.cpp @@ -0,0 +1,99 @@ +/** + * @file add_custom_v4.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +constexpr uint32_t DST_START_ADDRESS = 0x20000; +} + +class KernelAddV4 { +public: + __aicore__ inline KernelAddV4() = default; + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; + } + + __aicore__ inline void Process() + { + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + + // double buffer + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (uint32_t i = 0; i < loopCount; i++) { + int32_t eventID = (i % 2 == 0 ? EVENT_ID0 : EVENT_ID1); + AscendC::LocalTensor &xLocal = (i % 2 == 0 ? 
xLocalPing : xLocalPong); + AscendC::LocalTensor &yLocal = (i % 2 == 0 ? yLocalPing : yLocalPong); + AscendC::LocalTensor &zLocal = (i % 2 == 0 ? zLocalPing : zLocalPong); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + AscendC::WaitFlag(eventID); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by zLocal between 2 sequential loops + AscendC::SetFlag(eventID); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + } + +private: + // according to bank conflict rule: + // rr conflict happened when 2 read requests are in the same bank group + // rw conflict happened when read and write requests are in the same bank + // so we adjust the address to avoid bank conflicts + static constexpr uint32_t xAddrPing = 0x0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float) + 256; + static constexpr uint32_t zAddrPing = DST_START_ADDRESS; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 2 + 256; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 3 + 512; + static constexpr uint32_t zAddrPong = DST_START_ADDRESS + TILE_LENGTH * sizeof(float); + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + AscendC::InitSocState(); + KernelAddV4 op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData 
*)tiling)->singleCoreLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v4(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v4<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5362c8b5a53b1f730ac6fe542ee226a42dff40ff --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f92b095d1f4f258af274b98dfcba2dccf6165b30 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/cmake/npu_lib.cmake @@ -0,0 +1,11 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR 
${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/data_utils.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..9d34457807dda2d0c79ce19eadc7fcbd4667ece2 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + 
break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..caf5653e8d65d315d6221fb21f7725194cf5b50d --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/main.cpp @@ -0,0 +1,148 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v4(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +using KernelEntry = void (*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *, uint8_t *); +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ 
__aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +using KernelEntry = void (*)(GM_ADDR, GM_ADDR, GM_ADDR, GM_ADDR); + +#endif + +struct ArgInfo { + std::string fileName; + size_t length; +}; + +#ifndef ASCENDC_CPU_DEBUG + +void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::vector &inputsInfo, + std::vector &outputsInfo, uint8_t *tiling) +{ + std::vector inputHost(inputsInfo.size()); + std::vector inputDevice(inputsInfo.size()); + std::vector outputHost(outputsInfo.size()); + std::vector outputDevice(outputsInfo.size()); + uint8_t *tilingDevice; + + CHECK_ACL(aclrtMalloc((void **)(&tilingDevice), sizeof(AddCustomTilingData), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy( + tilingDevice, sizeof(AddCustomTilingData), tiling, sizeof(AddCustomTilingData), ACL_MEMCPY_HOST_TO_DEVICE)); + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&inputDevice[i]), inputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, inputHost[i], inputsInfo[i].length); + CHECK_ACL(aclrtMemcpy( + inputDevice[i], inputsInfo[i].length, inputHost[i], inputsInfo[i].length, ACL_MEMCPY_HOST_TO_DEVICE)); + } + + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMallocHost((void **)(&outputHost[i]), outputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + } + + kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0], tilingDevice); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtFree(tilingDevice)); + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMemcpy( + outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile(outputsInfo[i].fileName, outputHost[i], 
outputsInfo[i].length); + CHECK_ACL(aclrtFree(outputDevice[i])); + CHECK_ACL(aclrtFreeHost(outputHost[i])); + } + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + CHECK_ACL(aclrtFree(inputDevice[i])); + CHECK_ACL(aclrtFreeHost(inputHost[i])); + } +} + +#else + +#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo, tiling) \ + do { \ + std::vector input(inputsInfo.size()); \ + std::vector output(outputsInfo.size()); \ + \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + input[i] = (uint8_t *)AscendC::GmAlloc(inputsInfo[i].length); \ + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, input[i], inputsInfo[i].length); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + output[i] = (uint8_t *)AscendC::GmAlloc(outputsInfo[i].length); \ + } \ + \ + AscendC::SetKernelMode(KernelMode::AIV_MODE); \ + ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0], tiling); \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + AscendC::GmFree((void *)input[i]); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + WriteFile(outputsInfo[i].fileName, output[i], outputsInfo[i].length); \ + AscendC::GmFree((void *)output[i]); \ + } \ + } while (0) + +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 8; + // set data length, in this case we use 8 cores and length of each core is 4096 * 9 + uint32_t dataLen = 4096 * 9 * blockDim; + size_t inputByteSize = dataLen * sizeof(float); + size_t outputByteSize = dataLen * sizeof(float); + AddCustomTilingData tiling; + tiling.singleCoreLength = dataLen / blockDim; + + std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}}; + std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; + std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; + std::vector outputsV3Info = {{"./output/output_z_v3.bin", outputByteSize}}; + std::vector outputsV4Info = 
{{"./output/output_z_v4.bin", outputByteSize}}; + +#ifndef ASCENDC_CPU_DEBUG + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v3, blockDim, stream, inputsInfo, outputsV3Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v4, blockDim, stream, inputsInfo, outputsV4Info, (uint8_t *)&tiling); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#else + KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info, (uint8_t *)&tiling); + KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info, (uint8_t *)&tiling); + KernelCall(add_custom_v3, blockDim, inputsInfo, outputsV3Info, (uint8_t *)&tiling); + KernelCall(add_custom_v4, blockDim, inputsInfo, outputsV4Info, (uint8_t *)&tiling); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..6c691801e909f79a5b61fa41e6d632d6ec308191 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/run.sh @@ -0,0 +1,121 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" 
+ shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" + exit -1 +fi + +VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ 
+rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --launch-count=4 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --launch-count=4 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +# tidy folder by delete log files +if [ "${RUN_MODE}" = "sim" ]; then + rm -f *.log *.dump *.vcd *.toml *_log +fi +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v3.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v4.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/scripts/gen_data.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..b8f7ccb5b94e8dcbf285c6000e0418a2ee7f22e8 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/scripts/gen_data.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [8 * 9, 4096]).astype(np.float32) + input_y = np.random.uniform(1, 100, [8 * 9, 4096]).astype(np.float32) + golden = (input_x + input_y).astype(np.float32) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/scripts/verify_result.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a5019f30fdf1e34188f6f777e5ef3e4aad3491c2 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/KernelLaunch/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-4 +absolute_tol = 1e-5 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6a82a0b1039e68c804b7662f607ea2b8522b01a --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernellaunch/README.md @@ -0,0 +1,69 @@ +## 概述 + +本样例介绍基于静态Tensor方式编程的场景下Add算子的实现方法,并提供核函数直调方法。 + +## 目录结构介绍 + +``` +├── 23_static_tensor_programming_kernellaunch // 使用核函数直调的方式调用Add自定义算子 +│ └── KernelLaunch // Kernel Launch方式调用核函数样例 +``` + +## 算子描述 + +算子实现的是固定shape为72×4096的Add算子。 + +Add的计算公式为: + +```python 
+z = x + y +``` + +- x:输入,形状为\[72, 4096],数据类型为float; +- y:输入,形状为\[72, 4096],数据类型为float; +- z:输出,形状为\[72, 4096],数据类型为float; + +## 算子规格描述 + + + + + + + + + + + + +
+<table>
+<tr><th align="center">算子类型(OpType)</th><th colspan="4" align="center">Add</th></tr>
+<tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">72 * 4096</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">y</td><td align="center">72 * 4096</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">算子输出</td><td align="center">z</td><td align="center">72 * 4096</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">核函数名</td><td colspan="4" align="center">add_custom_v1 / add_custom_v2 / add_custom_v3 / add_custom_v4</td></tr>
+</table>
+ +## 支持的产品型号 + +本样例支持如下产品型号: + +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 编译运行样例算子 + +针对自定义算子工程,编译运行包含如下步骤: + +- 编译自定义算子工程; +- 调用执行自定义算子; + +详细操作如下所示。 + +### 1. 获取源码包 + +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 编译运行样例工程 + +- [KernelLaunch样例运行](./KernelLaunch/README.md) + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ---------------- | +| 2025/09/06 | 新增直调方式样例 | diff --git a/operator/ascendc/0_introduction/README.md b/operator/ascendc/0_introduction/README.md index 2f95f076d558c2eb0a59fca318a10fb2b1910e26..44a722d77609bdc2bc4063125b904e61b561d38a 100644 --- a/operator/ascendc/0_introduction/README.md +++ b/operator/ascendc/0_introduction/README.md @@ -37,6 +37,7 @@ | [20_mmad_kernellaunch](./20_mmad_kernellaunch) | 基于Ascend C基础API的Matmul自定义Cube算子及KernelLaunch调用样例 | Atlas 推理系列产品AI Core
Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [21_vectoradd_kernellaunch](./21_vectoradd_kernellaunch) | 基于Ascend C的Add多场景自定义Vector算子的KernelLaunch调用样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | [22_baremix_kernellaunch](./22_baremix_kernellaunch) | 通过更底层的编码方式,实现MatmulLeayrelu融合算子的样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 +| [23_static_tensor_programming_kernellaunch](./23_static_tensor_programming_kernellaunch) | 通过静态Tensor编程方式,实现Add算子的样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 ## 获取样例代码