From afda895226cd2f8ad83fa82e4dfefe8cd0817447 Mon Sep 17 00:00:00 2001 From: "zhanghao (AS)" Date: Fri, 22 Aug 2025 16:27:00 +0800 Subject: [PATCH 01/13] add static_tensor_programming testcases --- .../KernelLaunch/CMakeLists.txt | 47 ++++ .../KernelLaunch/README.md | 88 ++++++++ .../KernelLaunch/add_custom_v1.cpp | 85 ++++++++ .../KernelLaunch/add_custom_v2.cpp | 90 ++++++++ .../KernelLaunch/cmake/cpu_lib.cmake | 9 + .../KernelLaunch/cmake/npu_lib.cmake | 11 + .../KernelLaunch/data_utils.h | 203 ++++++++++++++++++ .../KernelLaunch/main.cpp | 127 +++++++++++ .../KernelLaunch/run.sh | 113 ++++++++++ .../KernelLaunch/scripts/gen_data.py | 25 +++ .../KernelLaunch/scripts/verify_result.py | 53 +++++ .../README.md | 69 ++++++ 12 files changed, 920 insertions(+) create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py create mode 
100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt new file mode 100644 index 000000000..392189fe1 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
+# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md new file mode 100644 index 000000000..f72b521cd --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md @@ -0,0 +1,88 @@ +## 目录结构介绍 + +``` +├── KernelLaunch +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_v1.cpp // 算子kernel实现1:未优化前实现 +│ ├── add_custom_v2.cpp // 算子kernel实现2:优化地址分配,消除Bank冲突后的实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` + +## 代码实现介绍 + +本样例中实现的是固定shape为1*4096的Add算子。 + +- kernel实现 + + Add算子的数学表达式为: + + ``` + z = x + y + ``` + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global 
Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 + + 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),xLocal地址为0,yLocal地址为0x4000,zLocal地址为0x8000。xLocal与yLocal存在读读冲突,xLocal与zLocal存在读写冲突。 + + 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),为了避免Bank冲突,通过配置InitBuffer时的bufferSize来调整Tensor地址,xLocal地址为0,yLocal地址为0x4100,zLocal地址为0x10000。 +- 调用实现 + + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + +- 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch + ``` +- 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` +- 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu /sim / npu] + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/07/01 | 新增本readme | diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp new file mode 100644 index 
000000000..471821d26 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -0,0 +1,85 @@ +/** + * @file add_custom_v1.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TOTAL_LENGTH = 4096 * 16; // total length of data +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); + yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); + zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); + } + __aicore__ inline void Process() + { + uint32_t loopCount = TOTAL_LENGTH / TILE_LENGTH; + AscendC::LocalTensor xLocal(AscendC::TPosition::VECCALC, xAddr, TILE_LENGTH); + AscendC::LocalTensor yLocal(AscendC::TPosition::VECCALC, yAddr, TILE_LENGTH); + AscendC::LocalTensor zLocal(AscendC::TPosition::VECCALC, zAddr, TILE_LENGTH); + + for (int i = 0; i < loopCount; i++) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID0); + } + DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by 
xLocal/yLocal between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + } + } + +private: + static constexpr uint32_t xAddr = 0; + static constexpr uint32_t yAddr = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddr = TILE_LENGTH * sizeof(float) * 2; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom_v1<<>>(x, y, z); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp new file mode 100644 index 000000000..65e7dd7e5 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -0,0 +1,90 @@ +/** + * @file add_custom_v2.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr int32_t TOTAL_LENGTH = 4096; // total length of data +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue +constexpr int32_t BANKGROUP_SIZE = 1024 * 64; // one bank size is 4KB, with 16 banks +constexpr int32_t ONE_REPEAT_SIZE = 256; // 256 bytes per repeat +} // namespace + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); + yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); + zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); + // xLocal size add 256 to avoid rr conflict + pipe.InitBuffer(inQueueX, BUFFER_NUM, TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE); + // yLocal size adjust to 64KB - xLocal size to avoid rw conflict + pipe.InitBuffer(inQueueY, BUFFER_NUM, BANKGROUP_SIZE - (TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TOTAL_LENGTH * sizeof(float)); + } + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH); + AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TOTAL_LENGTH); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe 
pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom_v2<<>>(x, y, z); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake new file mode 100644 index 000000000..5362c8b5a --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake new file mode 100644 index 000000000..f92b095d1 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake @@ -0,0 +1,11 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS 
${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h new file mode 100644 index 000000000..09d906371 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + 
break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp new file mode 100644 index 000000000..8a65f8fa6 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -0,0 +1,127 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); +extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); +using KernelEntry = void(*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *); +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z); +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z); +using KernelEntry = void(*)(GM_ADDR, GM_ADDR, GM_ADDR); + +#endif + +struct ArgInfo { + std::string fileName; + size_t length; +}; + +#ifndef ASCENDC_CPU_DEBUG + +void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::vector &inputsInfo, + std::vector &outputsInfo) +{ + std::vector inputHost(inputsInfo.size()); + std::vector inputDevice(inputsInfo.size()); + std::vector outputHost(outputsInfo.size()); + std::vector outputDevice(outputsInfo.size()); + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + 
CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&inputDevice[i]), inputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, inputHost[i], inputsInfo[i].length); + CHECK_ACL(aclrtMemcpy(inputDevice[i], inputsInfo[i].length, inputHost[i], inputsInfo[i].length, + ACL_MEMCPY_HOST_TO_DEVICE)); + } + + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMallocHost((void **)(&outputHost[i]), outputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + } + + kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0]); + CHECK_ACL(aclrtSynchronizeStream(stream)); + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMemcpy(outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length, + ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile(outputsInfo[i].fileName, outputHost[i], outputsInfo[i].length); + CHECK_ACL(aclrtFree(outputDevice[i])); + CHECK_ACL(aclrtFreeHost(outputHost[i])); + } + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + CHECK_ACL(aclrtFree(inputDevice[i])); + CHECK_ACL(aclrtFreeHost(inputHost[i])); + } +} + +#else + +#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo) \ + { \ + std::vector input(inputsInfo.size()); \ + std::vector output(outputsInfo.size()); \ + \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + input[i] = (uint8_t *)AscendC::GmAlloc(inputsInfo[i].length); \ + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, input[i], inputsInfo[i].length); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + output[i] = (uint8_t *)AscendC::GmAlloc(outputsInfo[i].length); \ + } \ + \ + AscendC::SetKernelMode(KernelMode::AIV_MODE); \ + ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0]); \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + 
AscendC::GmFree((void *)input[i]); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + WriteFile(outputsInfo[i].fileName, output[i], outputsInfo[i].length); \ + AscendC::GmFree((void *)output[i]); \ + } \ + } + +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 1; + uint32_t dataLen = 4096; + size_t inputByteSize = dataLen * sizeof(float); + size_t outputByteSize = dataLen * sizeof(float); + + std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}}; + std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; + std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; + +#ifndef ASCENDC_CPU_DEBUG + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info); + KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#else + KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info); + KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh new file mode 100644 index 000000000..10a011174 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -0,0 +1,113 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG 
-- "$@") +eval set -- "$OPTS" +SOC_VERSION="Ascend310P3" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" + exit -1 +fi + +VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +echo "Current compile soc version is ${SOC_VERSION}" +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + 
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --launch-count=2 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --launch-count=2 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py new file mode 100644 index 000000000..86bbba89d --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) + input_y = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) + golden = (input_x + input_y).astype(np.float32) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py new file mode 100644 index 000000000..6a38a3b2b --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-4 +absolute_tol = 1e-5 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md new file mode 100644 index 000000000..4ea8f29b8 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md @@ -0,0 +1,69 @@ +## 概述 + +本样例介绍基于静态Tensor方式编程的场景下Add算子的实现方法,并提供核函数直调方法。 + +## 目录结构介绍 + +``` +├── 23_static_tensor_programming_kernel_launch // 使用核函数直调的方式调用Add自定义算子 +│ └── KernelLaunch // Kernel Launch方式调用核函数样例 +``` + +## 算子描述 + +算子实现的是固定shape为1×4096的Add算子。 + +Add的计算公式为: + +```python +z = x + y +``` + +- x:输入,形状为\[1, 4096],数据类型为float; +- 
y:输入,形状为\[1, 4096],数据类型为float; +- z:输出,形状为\[1, 4096],数据类型为float; + +## 算子规格描述 + + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x1 * 4096floatND
y1 * 4096floatND
算子输出z1 * 4096floatND
核函数名add_custom_v1 / add_custom_v2
+ +## 支持的产品型号 + +本样例支持如下产品型号: + +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 编译运行样例算子 + +针对自定义算子工程,编译运行包含如下步骤: + +- 编译自定义算子工程; +- 调用执行自定义算子; + +详细操作如下所示。 + +### 1. 获取源码包 + +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 编译运行样例工程 + +- [KernelLaunch样例运行](./KernelLaunch/README.md) + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ---------------- | +| 2025/07/01 | 新增直调方式样例 | -- Gitee From dfa886a322e104e497c882f8b1ebd96cc5614488 Mon Sep 17 00:00:00 2001 From: "zhanghao (AS)" Date: Tue, 26 Aug 2025 14:03:57 +0800 Subject: [PATCH 02/13] add cases for low level programming --- .../KernelLaunch/CMakeLists.txt | 1 + .../KernelLaunch/add_custom_tiling.h | 17 ++ .../KernelLaunch/add_custom_v1.cpp | 28 ++-- .../KernelLaunch/add_custom_v2.cpp | 156 ++++++++++++------ .../KernelLaunch/add_custom_v3.cpp | 93 +++++++++++ .../KernelLaunch/main.cpp | 45 +++-- .../KernelLaunch/run.sh | 4 +- 7 files changed, 264 insertions(+), 80 deletions(-) create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt index 392189fe1..ab71257d0 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt @@ -18,6 +18,7 @@ endif() file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v3.cpp ) if("${RUN_MODE}" STREQUAL "cpu") diff --git 
a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h new file mode 100644 index 000000000..7c6d310b7 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h @@ -0,0 +1,17 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +struct AddCustomTilingData { + uint32_t totalLength; +}; +#endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index 471821d26..bb6906014 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -7,37 +7,38 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
*/ +#include "add_custom_tiling.h" #include "kernel_operator.h" using AscendC::TPosition; namespace { -constexpr uint32_t TOTAL_LENGTH = 4096 * 16; // total length of data constexpr uint32_t TILE_LENGTH = 4096; } class KernelAdd { public: __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); - yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); - zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); + xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); + yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); + zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); + loopCount = totalLength / TILE_LENGTH; } __aicore__ inline void Process() { - uint32_t loopCount = TOTAL_LENGTH / TILE_LENGTH; AscendC::LocalTensor xLocal(AscendC::TPosition::VECCALC, xAddr, TILE_LENGTH); AscendC::LocalTensor yLocal(AscendC::TPosition::VECCALC, yAddr, TILE_LENGTH); AscendC::LocalTensor zLocal(AscendC::TPosition::VECCALC, zAddr, TILE_LENGTH); + // one buffer for (int i = 0; i < loopCount; i++) { // dependency of PIPE_V & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops if (i != 0) { AscendC::WaitFlag(EVENT_ID0); } - DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -53,7 +54,7 @@ public: // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); if (i != (loopCount - 1)) { // dependency 
of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops AscendC::SetFlag(EVENT_ID0); @@ -68,18 +69,19 @@ private: AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; + uint32_t loopCount; }; -extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z) +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAdd op; - op.Init(x, y, z); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } #ifndef ASCENDC_CPU_DEBUG -void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) { - add_custom_v1<<>>(x, y, z); + add_custom_v1<<>>(x, y, z, tiling); } #endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 65e7dd7e5..1f6f4df17 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -7,84 +7,138 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
*/ +#include "add_custom_tiling.h" #include "kernel_operator.h" using AscendC::TPosition; namespace { -constexpr int32_t TOTAL_LENGTH = 4096; // total length of data -constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue -constexpr int32_t BANKGROUP_SIZE = 1024 * 64; // one bank size is 4KB, with 16 banks -constexpr int32_t ONE_REPEAT_SIZE = 256; // 256 bytes per repeat -} // namespace +constexpr uint32_t TILE_LENGTH = 4096; +} class KernelAdd { public: __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); - yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); - zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); - // xLocal size add 256 to avoid rr conflict - pipe.InitBuffer(inQueueX, BUFFER_NUM, TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE); - // yLocal size adjust to 64KB - xLocal size to avoid rw conflict - pipe.InitBuffer(inQueueY, BUFFER_NUM, BANKGROUP_SIZE - (TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TOTAL_LENGTH * sizeof(float)); + xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); + yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); + zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); + loopCount = totalLength / TILE_LENGTH; } __aicore__ inline void Process() { - CopyIn(); - Compute(); - CopyOut(); - } + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor 
zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); -private: - __aicore__ inline void CopyIn() - { - AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); - AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH); - AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH); - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute() - { - AscendC::LocalTensor xLocal = inQueueX.DeQue(); - AscendC::LocalTensor yLocal = inQueueY.DeQue(); - AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::Add(zLocal, xLocal, yLocal, TOTAL_LENGTH); - outQueueZ.EnQue(zLocal); - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut() - { - AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH); - outQueueZ.FreeTensor(zLocal); + // double buffer + for (int i = 0; i < loopCount / 2; i++) { + // ping part + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::DataCopy(xLocalPing, xGm[2 * i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPing, yGm[2 * i * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPing in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(zGm[2 * i * TILE_LENGTH], zLocalPing, TILE_LENGTH); + if (i 
!= (loopCount - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + + // pong part + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID1); + } + AscendC::DataCopy(xLocalPong, xGm[(2 * i + 1) * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPong, yGm[(2 * i + 1) * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPong/yLocalPong in one single loop + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops + AscendC::WaitFlag(EVENT_ID1); + } + AscendC::Add(zLocalPong, xLocalPong, yLocalPong, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops + AscendC::SetFlag(EVENT_ID1); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPong in one single loop + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + AscendC::DataCopy(zGm[(2 * i + 1) * TILE_LENGTH], zLocalPong, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops + AscendC::SetFlag(EVENT_ID1); + } + } + + // tail block + if (loopCount % 2 != 0) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing with the front for loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(xLocalPing, xGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPing, yGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing with the front for loop + AscendC::SetFlag(EVENT_ID0); + 
AscendC::WaitFlag(EVENT_ID0); + AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPing in one loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(zGm[(loopCount - 1) * TILE_LENGTH], zLocalPing, TILE_LENGTH); + } } private: - AscendC::TPipe pipe; - AscendC::TQue inQueueX; - AscendC::TQue inQueueY; - AscendC::TQue outQueueZ; + static constexpr uint32_t xAddrPing = 0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; + static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; + uint32_t loopCount; }; -extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z) +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAdd op; - op.Init(x, y, z); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } #ifndef ASCENDC_CPU_DEBUG -void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) { - add_custom_v2<<>>(x, y, z); + add_custom_v2<<>>(x, y, z, tiling); } #endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp new file mode 100644 index 000000000..f28463396 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -0,0 +1,93 @@ +/** + * @file 
add_custom_v3.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); + yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); + zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); + loopCount = totalLength / TILE_LENGTH; + } + + __aicore__ inline void Process() + { + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + + // double buffer + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int i = 0; i < loopCount; i++) { + int32_t eventID = (i % 2 == 0 ? EVENT_ID0 : EVENT_ID1); + AscendC::LocalTensor &xLocal = (i % 2 == 0 ? xLocalPing : xLocalPong); + AscendC::LocalTensor &yLocal = (i % 2 == 0 ? yLocalPing : yLocalPong); + AscendC::LocalTensor &zLocal = (i % 2 == 0 ? 
zLocalPing : zLocalPong); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + AscendC::WaitFlag(eventID); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by zLocal between 2 sequential loops + AscendC::SetFlag(eventID); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + } + +private: + static constexpr uint32_t xAddrPing = 0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; + static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + KernelAdd op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v3<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp 
index 8a65f8fa6..287c33418 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -7,17 +7,20 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ +#include "add_custom_tiling.h" #include "data_utils.h" #ifndef ASCENDC_CPU_DEBUG #include "acl/acl.h" -extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); -extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); -using KernelEntry = void(*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *); +extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +using KernelEntry = void (*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *, uint8_t *); #else #include "tikicpulib.h" -extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z); -extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z); -using KernelEntry = void(*)(GM_ADDR, GM_ADDR, GM_ADDR); +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +using KernelEntry = void (*)(GM_ADDR, GM_ADDR, GM_ADDR, GM_ADDR); #endif @@ -29,12 +32,17 @@ struct ArgInfo { #ifndef ASCENDC_CPU_DEBUG void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, 
std::vector &inputsInfo, - std::vector &outputsInfo) + std::vector &outputsInfo, uint8_t *tiling) { std::vector inputHost(inputsInfo.size()); std::vector inputDevice(inputsInfo.size()); std::vector outputHost(outputsInfo.size()); std::vector outputDevice(outputsInfo.size()); + uint8_t *tilingDevice; + + CHECK_ACL(aclrtMalloc((void **)(&tilingDevice), sizeof(AddCustomTilingData), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, sizeof(AddCustomTilingData), tiling, sizeof(AddCustomTilingData), + ACL_MEMCPY_HOST_TO_DEVICE)); for (uint32_t i = 0; i < inputsInfo.size(); i++) { CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length)); @@ -49,8 +57,10 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); } - kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0]); + kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0], tilingDevice); CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtFree(tilingDevice)); for (uint32_t i = 0; i < outputsInfo.size(); i++) { CHECK_ACL(aclrtMemcpy(outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length, ACL_MEMCPY_DEVICE_TO_HOST)); @@ -67,7 +77,7 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v #else -#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo) \ +#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo, tiling) \ { \ std::vector input(inputsInfo.size()); \ std::vector output(outputsInfo.size()); \ @@ -82,7 +92,7 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v } \ \ AscendC::SetKernelMode(KernelMode::AIV_MODE); \ - ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0]); \ + ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0], tiling); \ for (uint32_t i = 0; i < 
inputsInfo.size(); i++) { \ AscendC::GmFree((void *)input[i]); \ } \ @@ -98,13 +108,16 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v int32_t main(int32_t argc, char *argv[]) { uint32_t blockDim = 1; - uint32_t dataLen = 4096; + uint32_t dataLen = 4096 * 3; size_t inputByteSize = dataLen * sizeof(float); size_t outputByteSize = dataLen * sizeof(float); + AddCustomTilingData tiling; + tiling.totalLength = dataLen; std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}}; std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; + std::vector outputsV3Info = {{"./output/output_z_v3.bin", outputByteSize}}; #ifndef ASCENDC_CPU_DEBUG CHECK_ACL(aclInit(nullptr)); @@ -113,15 +126,17 @@ int32_t main(int32_t argc, char *argv[]) aclrtStream stream = nullptr; CHECK_ACL(aclrtCreateStream(&stream)); - KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info); - KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info); + KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v3, blockDim, stream, inputsInfo, outputsV3Info, (uint8_t *)&tiling); CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtResetDevice(deviceId)); CHECK_ACL(aclFinalize()); #else - KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info); - KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info); + KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info, (uint8_t *)&tiling); + KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info, (uint8_t *)&tiling); + KernelCall(add_custom_v3, blockDim, inputsInfo, outputsV3Info, (uint8_t *)&tiling); #endif return 0; } diff --git 
a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index 10a011174..eb5f327ea 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -101,7 +101,8 @@ python3 scripts/gen_data.py ( export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH if [ "${RUN_MODE}" = "npu" ]; then - msprof op --launch-count=2 --output=./prof ./ascendc_kernels_bbit + # msprof op --launch-count=3 --output=./prof ./ascendc_kernels_bbit + ./ascendc_kernels_bbit elif [ "${RUN_MODE}" = "sim" ]; then msprof op simulator --launch-count=2 --output=./prof ./ascendc_kernels_bbit elif [ "${RUN_MODE}" = "cpu" ]; then @@ -111,3 +112,4 @@ python3 scripts/gen_data.py md5sum output/*.bin python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v3.bin output/golden.bin -- Gitee From a25c4cc9926336476c6bf58de37a7525ac276033 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 4 Sep 2025 11:18:03 +0800 Subject: [PATCH 03/13] fix code --- .../KernelLaunch/add_custom_v1.cpp | 6 +++--- .../KernelLaunch/add_custom_v2.cpp | 6 +++--- .../KernelLaunch/add_custom_v3.cpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index bb6906014..77dbc459e 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ 
b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -15,9 +15,9 @@ namespace { constexpr uint32_t TILE_LENGTH = 4096; } -class KernelAdd { +class KernelAddV1 { public: - __aicore__ inline KernelAdd() {} + __aicore__ inline KernelAddV1() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); @@ -74,7 +74,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { - KernelAdd op; + KernelAddV1 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 1f6f4df17..c60601cc9 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -15,9 +15,9 @@ namespace { constexpr uint32_t TILE_LENGTH = 4096; } -class KernelAdd { +class KernelAddV2 { public: - __aicore__ inline KernelAdd() {} + __aicore__ inline KernelAddV2() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); @@ -131,7 +131,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { - KernelAdd op; + KernelAddV2 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index 
f28463396..4a51f78dd 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -15,9 +15,9 @@ namespace { constexpr uint32_t TILE_LENGTH = 4096; } -class KernelAdd { +class KernelAddV3 { public: - __aicore__ inline KernelAdd() {} + __aicore__ inline KernelAddV3() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); @@ -80,7 +80,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { - KernelAdd op; + KernelAddV3 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } -- Gitee From a868a6a06d024d06dfce83f197b7f8733b6f7eba Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 4 Sep 2025 11:47:18 +0800 Subject: [PATCH 04/13] fix loop event --- .../KernelLaunch/add_custom_v2.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index c60601cc9..831a873dd 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -53,7 +53,7 @@ public: AscendC::WaitFlag(EVENT_ID0); } AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops AscendC::SetFlag(EVENT_ID0); } @@ -61,7 +61,7 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); 
AscendC::DataCopy(zGm[2 * i * TILE_LENGTH], zLocalPing, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops AscendC::SetFlag(EVENT_ID0); } @@ -81,7 +81,7 @@ public: AscendC::WaitFlag(EVENT_ID1); } AscendC::Add(zLocalPong, xLocalPong, yLocalPong, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops AscendC::SetFlag(EVENT_ID1); } @@ -89,7 +89,7 @@ public: AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); AscendC::DataCopy(zGm[(2 * i + 1) * TILE_LENGTH], zLocalPong, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops AscendC::SetFlag(EVENT_ID1); } @@ -97,7 +97,7 @@ public: // tail block if (loopCount % 2 != 0) { - // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing with the front for loop + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing with the previous for loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); AscendC::DataCopy(xLocalPing, xGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); @@ -105,7 +105,7 @@ public: // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing with the front for loop + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing with the previous for loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); -- Gitee From 77e1ed5aee8a6d75866e1f2c1caa3df666f37df6 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 10:54:02 +0800 Subject: [PATCH 05/13] add multi cores --- .../KernelLaunch/add_custom_tiling.h | 2 +- 
.../KernelLaunch/add_custom_v1.cpp | 12 ++++++------ .../KernelLaunch/add_custom_v2.cpp | 12 ++++++------ .../KernelLaunch/add_custom_v3.cpp | 12 ++++++------ .../KernelLaunch/main.cpp | 6 +++--- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h index 7c6d310b7..278a6e336 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h @@ -12,6 +12,6 @@ #include struct AddCustomTilingData { - uint32_t totalLength; + uint32_t singleCoreLength; }; #endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index 77dbc459e..c7aa904f3 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -18,12 +18,12 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV1 { public: __aicore__ inline KernelAddV1() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); - yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); - zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); - loopCount = totalLength / TILE_LENGTH; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + 
yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; } __aicore__ inline void Process() { @@ -75,7 +75,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAddV1 op; - op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 831a873dd..6871ba70d 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -18,12 +18,12 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV2 { public: __aicore__ inline KernelAddV2() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); - yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); - zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); - loopCount = totalLength / TILE_LENGTH; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; } __aicore__ inline void Process() { @@ -132,7 
+132,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAddV2 op; - op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index 4a51f78dd..e91fd9bdb 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -18,12 +18,12 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV3 { public: __aicore__ inline KernelAddV3() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); - yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); - zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); - loopCount = totalLength / TILE_LENGTH; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; } __aicore__ inline void Process() @@ -81,7 +81,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAddV3 op; - op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); 
} diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp index 287c33418..8b6b2f1e0 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -107,12 +107,12 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v int32_t main(int32_t argc, char *argv[]) { - uint32_t blockDim = 1; - uint32_t dataLen = 4096 * 3; + uint32_t blockDim = 8; + uint32_t dataLen = 4096 * 9 * blockDim; size_t inputByteSize = dataLen * sizeof(float); size_t outputByteSize = dataLen * sizeof(float); AddCustomTilingData tiling; - tiling.totalLength = dataLen; + tiling.singleCoreLength = dataLen / blockDim; std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}}; std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; -- Gitee From f341d2207f71f223b5faf40b103449be75571df7 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 10:59:58 +0800 Subject: [PATCH 06/13] fix typos --- .../KernelLaunch/main.cpp | 1 + .../KernelLaunch/scripts/gen_data.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp index 8b6b2f1e0..6591e50da 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -108,6 +108,7 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v int32_t main(int32_t argc, char *argv[]) 
{ uint32_t blockDim = 8; + // set data length, in this case we use 8 cores and length of each core is 4096 * 9 uint32_t dataLen = 4096 * 9 * blockDim; size_t inputByteSize = dataLen * sizeof(float); size_t outputByteSize = dataLen * sizeof(float); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py index 86bbba89d..a77774b9a 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py @@ -12,8 +12,8 @@ import numpy as np def gen_golden_data_simple(): - input_x = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) - input_y = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) + input_x = np.random.uniform(1, 100, [8 * 9, 4096]).astype(np.float32) + input_y = np.random.uniform(1, 100, [8 * 9, 4096]).astype(np.float32) golden = (input_x + input_y).astype(np.float32) input_x.tofile("./input/input_x.bin") -- Gitee From a900629f4d4a1935c958154fcdb5879454324313 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 11:31:10 +0800 Subject: [PATCH 07/13] add local mem allocator --- .../KernelLaunch/CMakeLists.txt | 1 + .../KernelLaunch/add_custom_v3.cpp | 20 ++-- .../KernelLaunch/add_custom_v4.cpp | 93 +++++++++++++++++++ .../KernelLaunch/main.cpp | 5 + 4 files changed, 107 insertions(+), 12 deletions(-) create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt index ab71257d0..e31108ae1 100644 --- 
a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt @@ -19,6 +19,7 @@ file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v3.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v4.cpp ) if("${RUN_MODE}" STREQUAL "cpu") diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index e91fd9bdb..0f9aa7777 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -28,14 +28,16 @@ public: __aicore__ inline void Process() { + // use local memory allocator to simplify memor allocation + AscendC::LocalMemAllocator ubAllocator; // ping - AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); - AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); - AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + AscendC::LocalTensor xLocalPing = ubAllocator.Alloc(); + AscendC::LocalTensor yLocalPing = ubAllocator.Alloc(); + AscendC::LocalTensor zLocalPing = ubAllocator.Alloc(); // pong - AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); - AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); - AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + AscendC::LocalTensor xLocalPong = ubAllocator.Alloc(); + AscendC::LocalTensor yLocalPong = ubAllocator.Alloc(); + AscendC::LocalTensor zLocalPong = 
ubAllocator.Alloc(); // double buffer AscendC::SetFlag(EVENT_ID0); @@ -66,12 +68,6 @@ public: } private: - static constexpr uint32_t xAddrPing = 0; - static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); - static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; - static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; - static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; - static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp new file mode 100644 index 000000000..ef4aa0cc2 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp @@ -0,0 +1,93 @@ +/** + * @file add_custom_v4.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAddV4 { +public: + __aicore__ inline KernelAddV4() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; + } + + __aicore__ inline void Process() + { + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + + // double buffer + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int i = 0; i < loopCount; i++) { + int32_t eventID = (i % 2 == 0 ? EVENT_ID0 : EVENT_ID1); + AscendC::LocalTensor &xLocal = (i % 2 == 0 ? xLocalPing : xLocalPong); + AscendC::LocalTensor &yLocal = (i % 2 == 0 ? yLocalPing : yLocalPong); + AscendC::LocalTensor &zLocal = (i % 2 == 0 ? 
zLocalPing : zLocalPong); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + AscendC::WaitFlag(eventID); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by zLocal between 2 sequential loops + AscendC::SetFlag(eventID); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + } + +private: + static constexpr uint32_t xAddrPing = 0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; + static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + KernelAddV4 op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v4(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v4<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp 
b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp index 6591e50da..a64889808 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -14,12 +14,14 @@ extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); extern void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v4(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); using KernelEntry = void (*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *, uint8_t *); #else #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); using KernelEntry = void (*)(GM_ADDR, GM_ADDR, GM_ADDR, GM_ADDR); #endif @@ -119,6 +121,7 @@ int32_t main(int32_t argc, char *argv[]) std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; std::vector outputsV3Info = {{"./output/output_z_v3.bin", outputByteSize}}; + std::vector outputsV4Info = {{"./output/output_z_v4.bin", outputByteSize}}; #ifndef ASCENDC_CPU_DEBUG CHECK_ACL(aclInit(nullptr)); @@ -130,6 +133,7 @@ int32_t main(int32_t argc, char *argv[]) KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info, (uint8_t 
*)&tiling); KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info, (uint8_t *)&tiling); KernelCall(add_custom_do_v3, blockDim, stream, inputsInfo, outputsV3Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v4, blockDim, stream, inputsInfo, outputsV4Info, (uint8_t *)&tiling); CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtResetDevice(deviceId)); @@ -138,6 +142,7 @@ int32_t main(int32_t argc, char *argv[]) KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info, (uint8_t *)&tiling); KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info, (uint8_t *)&tiling); KernelCall(add_custom_v3, blockDim, inputsInfo, outputsV3Info, (uint8_t *)&tiling); + KernelCall(add_custom_v4, blockDim, inputsInfo, outputsV4Info, (uint8_t *)&tiling); #endif return 0; } -- Gitee From 6d9ad98008e3b0de81a3b946767b94a6221317bb Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 11:40:09 +0800 Subject: [PATCH 08/13] add case 4 in run.sh --- .../KernelLaunch/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index eb5f327ea..e5ab94cab 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -113,3 +113,4 @@ md5sum output/*.bin python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v3.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v4.bin output/golden.bin -- Gitee From e9196989785fc9456163dc15129022e449b88ef5 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 14:53:19 +0800 Subject: [PATCH 09/13] edit case4 --- 
.../KernelLaunch/add_custom_v4.cpp | 17 +++++++++++------ .../KernelLaunch/run.sh | 19 +++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp index ef4aa0cc2..708a303f7 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp @@ -13,6 +13,7 @@ using AscendC::TPosition; namespace { constexpr uint32_t TILE_LENGTH = 4096; +constexpr uint32_t DST_START_ADDRESS = 0x20000; } class KernelAddV4 { @@ -66,12 +67,16 @@ public: } private: - static constexpr uint32_t xAddrPing = 0; - static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); - static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; - static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; - static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; - static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; + // according to bank conflict rule: + // rr conflict happened when 2 read requests are in the same bank group + // rw conflict happened when read and write requests are in the same bank + // so we adjust the address to avoid bank conflicts + static constexpr uint32_t xAddrPing = 0x0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float) + 256; + static constexpr uint32_t zAddrPing = DST_START_ADDRESS; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 2 + 256; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 3 + 512; + static constexpr uint32_t zAddrPong = DST_START_ADDRESS + TILE_LENGTH * sizeof(float); AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor 
zGm; diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index e5ab94cab..435f708fa 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -100,15 +100,22 @@ mkdir -p input output python3 scripts/gen_data.py ( export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH - if [ "${RUN_MODE}" = "npu" ]; then - # msprof op --launch-count=3 --output=./prof ./ascendc_kernels_bbit - ./ascendc_kernels_bbit - elif [ "${RUN_MODE}" = "sim" ]; then - msprof op simulator --launch-count=2 --output=./prof ./ascendc_kernels_bbit - elif [ "${RUN_MODE}" = "cpu" ]; then + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --launch-count=4 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --launch-count=4 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else ./ascendc_kernels_bbit fi ) +# tidy folder by delete log files +if [ "${RUN_MODE}" = "sim" ]; then + rm -f *.log *.dump *.vcd *.toml *_log +fi md5sum output/*.bin python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin -- Gitee From 54ef230855865a7bc34e4e17bc50ccdac31d819f Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 15:41:46 +0800 Subject: [PATCH 10/13] fix typos and readme --- .../KernelLaunch/README.md | 19 ++++++++++++------ .../KernelLaunch/data_utils.h | 2 +- .../KernelLaunch/run.sh | 2 -- .../KernelLaunch/scripts/gen_data.py | 2 +- .../KernelLaunch/scripts/verify_result.py | 2 +- .../README.md | 20 +++++++++---------- 
6 files changed, 26 insertions(+), 21 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md index f72b521cd..84edca8e7 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md @@ -6,8 +6,11 @@ │ ├── scripts │ │ ├── gen_data.py // 输入数据和真值数据生成脚本 │ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_tiling.h // tiling结构体 │ ├── add_custom_v1.cpp // 算子kernel实现1:未优化前实现 -│ ├── add_custom_v2.cpp // 算子kernel实现2:优化地址分配,消除Bank冲突后的实现 +│ ├── add_custom_v2.cpp // 算子kernel实现2:基于实现1,实现double buffer +│ ├── add_custom_v3.cpp // 算子kernel实现3:优化double buffer实现,简化判断逻辑,并使用LocalMemAllocator简化代码 +│ ├── add_custom_v4.cpp // 算子kernel实现4:基于add_custom_v3,修改地址分配逻辑,消除bank冲突 │ ├── CMakeLists.txt // 编译工程文件 │ ├── data_utils.h // 数据读入写出函数 │ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 @@ -16,7 +19,7 @@ ## 代码实现介绍 -本样例中实现的是固定shape为1*4096的Add算子。 +本样例中实现的是固定shape为8*9*4096的Add算子。 - kernel实现 @@ -30,9 +33,13 @@ Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 - 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),xLocal地址为0,yLocal地址为0x4000,zLocal地址为0x8000。xLocal与yLocal存在读读冲突,xLocal与zLocal存在读写冲突。 + 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),使用静态Tensor编程方法,进行add算子的编程。 - 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),为了避免Bank冲突,通过配置InitBuffer时的bufferSize来调整Tensor地址,xLocal地址为0,yLocal地址为0x4100,zLocal地址为0x10000。 + 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),优化性能,使用double buffer进行流水排布。 + + 实现3:请参考[add_custom_v3.cpp](./add_custom_v3.cpp),优化add_custom_v2中的反向同步及判断逻辑,同时使用LocalMemAllocator进行线性内存分配。 + + 
实现4:请参考[add_custom_v4.cpp](./add_custom_v4.cpp),基于add_custom_v3的实现,优化地址分配消除Bank冲突。 - 调用实现 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; @@ -46,7 +53,7 @@ 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch ``` - 配置环境变量 @@ -85,4 +92,4 @@ | 时间 | 更新事项 | | ---------- | ------------ | -| 2025/07/01 | 新增本readme | +| 2025/09/06 | 新增本readme | diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h index 09d906371..9d3445780 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h @@ -1,7 +1,7 @@ /** * @file data_utils.h * - * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index 435f708fa..6c691801e 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -11,7 +11,6 @@ SHORT=r:,v:,i:,b:,p:, LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" -SOC_VERSION="Ascend310P3" while :; do case "$1" in @@ -72,7 +71,6 @@ fi export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} -echo "Current compile soc version is ${SOC_VERSION}" source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash if [ "${RUN_MODE}" = "sim" ]; then # in case of running op in simulator, use stub .so instead diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py index a77774b9a..b8f7ccb5b 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # coding=utf-8 # -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py index 6a38a3b2b..a5019f30f 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # coding=utf-8 # -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md index 4ea8f29b8..c3448a15f 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md @@ -6,12 +6,12 @@ ``` ├── 23_static_tensor_programming_kernel_launch // 使用核函数直调的方式调用Add自定义算子 -│ └── KernelLaunch // Kernel Launch方式调用核函数样例 +│ └── KernelLaunch // Kernel Launch方式调用核函数样例 ``` ## 算子描述 -算子实现的是固定shape为1×4096的Add算子。 +算子实现的是固定shape为8x9×4096的Add算子。 Add的计算公式为: @@ -19,9 +19,9 @@ Add的计算公式为: z = x + y ``` -- x:输入,形状为\[1, 4096],数据类型为float; -- y:输入,形状为\[1, 4096],数据类型为float; -- z:输出,形状为\[1, 4096],数据类型为float; +- x:输入,形状为\[72, 4096],数据类型为float; +- y:输入,形状为\[72, 4096],数据类型为float; +- z:输出,形状为\[72, 4096],数据类型为float; ## 算子规格描述 @@ -29,13 +29,13 @@ z = x + y 算子类型(OpType)Add 算子输入nameshapedata typeformat -x1 * 
4096floatND -y1 * 4096floatND +x72 * 4096floatND +y72 * 4096floatND -算子输出y1 * 4096floatND +算子输出y72 * 4096floatND -核函数名add_custom_v1 / add_custom_v2 +核函数名add_custom_v1 / add_custom_v2 / add_custom_v3 / add_custom_v4 ## 支持的产品型号 @@ -66,4 +66,4 @@ z = x + y | 时间 | 更新事项 | | ---------- | ---------------- | -| 2025/07/01 | 新增直调方式样例 | +| 2025/09/06 | 新增直调方式样例 | -- Gitee From 51f72166cf627e3246cda15600b28bb49673339e Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 15:52:45 +0800 Subject: [PATCH 11/13] fix typos --- .../KernelLaunch/README.md | 2 +- .../23_static_tensor_programming_kernel_launch/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md index 84edca8e7..f58858378 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md @@ -19,7 +19,7 @@ ## 代码实现介绍 -本样例中实现的是固定shape为8*9*4096的Add算子。 +本样例中实现的是固定shape为72*4096的Add算子。 - kernel实现 diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md index c3448a15f..818d7960f 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md @@ -11,7 +11,7 @@ ## 算子描述 -算子实现的是固定shape为8x9×4096的Add算子。 +算子实现的是固定shape为72×4096的Add算子。 Add的计算公式为: -- Gitee From 832a76e8737c523ccbb76ce0ed025af0af840536 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 17:35:13 +0800 Subject: [PATCH 12/13] add soc state init --- .../KernelLaunch/add_custom_v1.cpp | 1 + .../KernelLaunch/add_custom_v2.cpp | 1 + 
.../KernelLaunch/add_custom_v3.cpp | 1 + .../KernelLaunch/add_custom_v4.cpp | 1 + 4 files changed, 4 insertions(+) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index c7aa904f3..c642b9b86 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -74,6 +74,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV1 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 6871ba70d..857b2bbbc 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -131,6 +131,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV2 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index 0f9aa7777..ba088efd7 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ 
b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -76,6 +76,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV3 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp index 708a303f7..e3c05bfd2 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp @@ -85,6 +85,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV4 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); -- Gitee From 1e2c2fe6670123b53ac536216a058545be57cf44 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 8 Sep 2025 09:43:24 +0800 Subject: [PATCH 13/13] fix construction func --- .../KernelLaunch/add_custom_v1.cpp | 2 +- .../KernelLaunch/add_custom_v2.cpp | 2 +- .../KernelLaunch/add_custom_v3.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index c642b9b86..e670aa2a4 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -17,7 +17,7 @@ constexpr uint32_t
TILE_LENGTH = 4096; class KernelAddV1 { public: - __aicore__ inline KernelAddV1() {} + __aicore__ inline KernelAddV1() = default; __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 857b2bbbc..fca2299b1 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -17,7 +17,7 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV2 { public: - __aicore__ inline KernelAddV2() {} + __aicore__ inline KernelAddV2() = default; __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index ba088efd7..4369fe980 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -17,7 +17,7 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV3 { public: - __aicore__ inline KernelAddV3() {} + __aicore__ inline KernelAddV3() = default; __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, 
singleCoreLength); -- Gitee