From 5a2a674bf30e23eaa2372208159d85a9198542e2 Mon Sep 17 00:00:00 2001
From: ApeiriaNode_Booker <renjie88@huawei.com>
Date: Mon, 22 Sep 2025 10:39:43 +0800
Subject: [PATCH 1/2] add simple matmul leakyrelu

---
 .../24_simple_hello_world/README.md           |   6 +-
 .../0_introduction/25_simple_add/README.md    |   6 +-
 .../26_simple_matmulleakyrelu/CMakeLists.txt  |  22 ++
 .../26_simple_matmulleakyrelu/README.md       | 101 +++++
 .../26_simple_matmulleakyrelu/data_utils.h    |  96 +++++
 .../matmul_leakyrelu.cpp                      | 350 ++++++++++++++++++
 .../26_simple_matmulleakyrelu/run.sh          |   8 +
 .../scripts/gen_data.py                       |  35 ++
 .../scripts/verify_result.py                  |  55 +++
 9 files changed, 673 insertions(+), 6 deletions(-)
 create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt
 create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md
 create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h
 create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp
 create mode 100755 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh
 create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py
 create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py

diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
index 53ab1d8cc..20c1ee4c8 100644
--- a/operator/ascendc/0_introduction/24_simple_hello_world/README.md
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
@@ -10,14 +10,14 @@
 
 ## 支持的产品型号
 本样例支持如下产品型号：
-- Atlas A2训练系列产品/Atlas 800I A2推理产品
-
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
+- Atlas A3 训练系列产品/Atlas A3 推理系列产品
 
 ## 运行样例算子
   - 打开样例目录
     以命令行方式下载样例代码，master分支为例。
     ```bash
-    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_helloworld/
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_helloworld
     ```
   - 配置环境变量
 
diff --git a/operator/ascendc/0_introduction/25_simple_add/README.md b/operator/ascendc/0_introduction/25_simple_add/README.md
index ffa237fc2..6f6600a15 100644
--- a/operator/ascendc/0_introduction/25_simple_add/README.md
+++ b/operator/ascendc/0_introduction/25_simple_add/README.md
@@ -41,14 +41,14 @@ z = x + y
 
 ## 支持的产品型号
 本样例支持如下产品型号：
-- Atlas A2训练系列产品/Atlas 800I A2推理产品
-
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
+- Atlas A3 训练系列产品/Atlas A3 推理系列产品
 
 ## 运行样例算子
   - 打开样例目录   
     以命令行方式下载样例代码，master分支为例。
     ```bash
-    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/25_simple_add/
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/25_simple_add
     ```
   - 配置环境变量
 
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt
new file mode 100644
index 000000000..c833ed1fd
--- /dev/null
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.16)
+
+set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version")
+
+find_package(ASC REQUIRED)
+
+project(kernel_samples LANGUAGES ASC CXX)
+
+set_source_files_properties(
+    matmul_leakyrelu.cpp PROPERTIES LANGUAGE ASC
+)
+
+add_executable(demo
+    matmul_leakyrelu.cpp
+)
+
+target_link_libraries(demo PRIVATE
+    tiling_api
+    register
+    platform
+    m
+)
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md
new file mode 100644
index 000000000..a8b9dbd70
--- /dev/null
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md
@@ -0,0 +1,101 @@
+## 简化MatmulLeakyRelu算子直调样例
+本样例以MatmulLeakyRelu算子为示例，展示了一种更为简单的算子编译流程，支持main函数和Kernel函数在同一个cpp文件中实现。
+> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。
+
+## 目录结构介绍
+```
+├── 26_simple_matmulleakyrelu
+│   ├── CMakeLists.txt          // 编译工程文件
+│   ├── data_utils.h            // 数据读入写出函数
+│   ├── matmul_leakyrelu.cpp    // 算子实现及测试
+│   ├── run.sh                  // 运行脚本
+│   └── scripts
+│       ├── gen_data.py         // 输入数据和真值数据生成脚本文件
+│       └── verify_result.py    // 真值对比文件
+```
+
+## 算子描述
+算子使用了MatmulLeakyRelu高阶API，实现了快速的MatmulLeakyRelu矩阵乘法的运算操作。
+
+MatmulLeakyRelu的计算公式为：
+
+```
+C = A * B + Bias
+C = C > 0 ? C : C * 0.001
+```
+
+- A、B为源操作数，A为左矩阵，形状为\[M, K]；B为右矩阵，形状为\[K, N]。
+- C为目的操作数，存放矩阵乘结果的矩阵，形状为\[M, N]。
+- Bias为矩阵乘偏置，形状为\[N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。
+
+## 算子规格描述
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">MatmulLeakyRelu</td></tr>
+</tr>
+<tr><td rowspan="4" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">a</td><td align="center">1024 * 256</td><td align="center">float16</td><td align="center">ND</td></tr>
+<tr><td align="center">b</td><td align="center">256 * 640</td><td align="center">float16</td><td align="center">ND</td></tr>
+<tr><td align="center">bias</td><td align="center">640</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">c</td><td align="center">1024 * 640</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">matmul_leakyrelu_custom</td></tr>
+</table>
+
+## 代码实现介绍
+本样例中实现的是[m, n, k]固定为[1024, 640, 256]的MatmulLeakyRelu算子。
+- kernel实现  
+  MatmulLeakyRelu算子的数学表达式为：
+  ```
+  C = A * B + Bias
+  C = C > 0 ? C : C * 0.001
+  ```
+  其中A的形状为[1024, 256]，B的形状为[256, 640]，C的形状为[1024, 640]，Bias的形状为[640]。具体请参考[matmul_leakyrelu.cpp](./matmul_leakyrelu.cpp)。
+
+- 调用实现  
+  使用内核调用符<<<>>>调用核函数。
+
+## 支持的产品型号
+本样例支持如下产品型号：
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
+- Atlas A3 训练系列产品/Atlas A3 推理系列产品
+
+## 运行样例算子
+  - 打开样例目录   
+    以命令行方式下载样例代码，master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/26_simple_matmulleakyrelu
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量的命令。
+    - 默认路径，root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+    配置按安装径后，执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+  - 样例执行
+    ```bash
+    bash run.sh  # 编译并执行样例
+    ```
+
+
+## 更新说明
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/09/22 | 新增本readme |
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h
new file mode 100644
index 000000000..582fbf68f
--- /dev/null
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h
@@ -0,0 +1,96 @@
+/**
+ * @file data_utils.h
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#ifndef DATA_UTILS_H
+#define DATA_UTILS_H
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstdio>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO]  " fmt "\n", ##args)
+#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN]  " fmt "\n", ##args)
+#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR]  " fmt "\n", ##args)
+
+bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize)
+{
+    struct stat sBuf;
+    int fileStatus = stat(filePath.data(), &sBuf);
+    if (fileStatus == -1) {
+        ERROR_LOG("failed to get file");
+        return false;
+    }
+    if (S_ISREG(sBuf.st_mode) == 0) {
+        ERROR_LOG("%s is not a file, please enter a file", filePath.c_str());
+        return false;
+    }
+
+    std::ifstream file;
+    file.open(filePath, std::ios::binary);
+    if (!file.is_open()) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    std::filebuf *buf = file.rdbuf();
+    size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in);
+    if (size == 0) {
+        ERROR_LOG("file size is 0");
+        file.close();
+        return false;
+    }
+    if (size > bufferSize) {
+        ERROR_LOG("file size is larger than buffer size");
+        file.close();
+        return false;
+    }
+    buf->pubseekpos(0, std::ios::in);
+    buf->sgetn(static_cast<char *>(buffer), size);
+    fileSize = size;
+    file.close();
+    return true;
+}
+
+/**
+ * @brief Write data to file
+ * @param [in] filePath: file path
+ * @param [in] buffer: data to write to file
+ * @param [in] size: size to write
+ * @return write result
+ */
+bool WriteFile(const std::string &filePath, const void *buffer, size_t size)
+{
+    if (buffer == nullptr) {
+        ERROR_LOG("Write file failed. buffer is nullptr");
+        return false;
+    }
+
+    int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE);
+    if (fd < 0) {
+        ERROR_LOG("Open file failed. path = %s", filePath.c_str());
+        return false;
+    }
+
+    size_t writeSize = write(fd, buffer, size);
+    (void)close(fd);
+    if (writeSize != size) {
+        ERROR_LOG("Write file Failed.");
+        return false;
+    }
+
+    return true;
+}
+#endif // DATA_UTILS_H
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp
new file mode 100644
index 000000000..6c1e6136a
--- /dev/null
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp
@@ -0,0 +1,350 @@
+/**
+ * @file matmul_leakyrelu_custom.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "data_utils.h"
+#include "kernel_tiling/kernel_tiling.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "acl/acl.h"
+#include "tiling/tiling_api.h"
+#include "kernel_operator.h"
+#include "lib/matmul_intf.h"
+
+using namespace matmul;
+
+__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b)
+{
+    return (a + b - 1) / b;
+}
+
+/**
+  * @brief  Copy tiling data to TCubeTiling ptr from tiling gm addr.
+  * @param  tiling: TCubeTiling ptr which needs to copy tiling data.
+  * @param  tilingGM: tiling gm addr.
+  * @retval None
+  */
+__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM)
+{
+    uint32_t *ptr = reinterpret_cast<uint32_t *>(tiling);
+    auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM);
+
+    for (uint32_t i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) {
+        *ptr = *(tiling32 + i);
+    }
+    return;
+}
+
+template <typename aType, typename bType, typename cType, typename biasType> class MatmulLeakyKernel {
+public:
+    __aicore__ inline MatmulLeakyKernel(){};
+    __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace,
+                                const TCubeTiling &tiling, AscendC::TPipe *pipe);
+    __aicore__ inline void Process(AscendC::TPipe *pipe);
+
+    __aicore__ inline void MatmulCompute();
+    __aicore__ inline void LeakyReluCompute();
+    __aicore__ inline void CopyOut(uint32_t count);
+    __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB,
+                                      int32_t &offsetC, int32_t &offsetBias);
+
+    Matmul<MatmulType<AscendC::TPosition::GM, CubeFormat::ND, aType>, MatmulType<AscendC::TPosition::GM, CubeFormat::ND, bType>,
+           MatmulType<AscendC::TPosition::VECIN, CubeFormat::ND, cType>, MatmulType<AscendC::TPosition::GM, CubeFormat::ND, biasType>>
+        matmulObj;
+
+    AscendC::GlobalTensor<aType> aGlobal;
+    AscendC::GlobalTensor<bType> bGlobal;
+    AscendC::GlobalTensor<cType> cGlobal;
+    AscendC::GlobalTensor<biasType> biasGlobal;
+    AscendC::LocalTensor<cType> reluOutLocal;
+    TCubeTiling tiling;
+    AscendC::TQue<AscendC::TPosition::VECOUT, 1> reluOutQueue_;
+};
+
+/**
+  * @brief  Set matmulLeaky input and output gm addr of current core.
+  * @param  a: A matrix gm addr.
+  * @param  b: B matrix gm addr.
+  * @param  bias: Bias gm addr.
+  * @param  c: C matrix gm addr.
+  * @param  workspace: Temporary gm space addr required by matmul calc.
+  * @param  tiling: matmul tiling data.
+  * @param  pipe: Global memory and sync management TPipe object.
+  * @retval None
+  */
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulLeakyKernel<aType, bType, cType, biasType>::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias,
+                                                                              GM_ADDR c, GM_ADDR workspace,
+                                                                              const TCubeTiling &tiling, AscendC::TPipe *pipe)
+{
+    this->tiling = tiling;
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N);
+    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N);
+
+    int32_t offsetA, offsetB, offsetC, offsetBias;
+    CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); // Calculate the gm offset based on the blockidx.
+    aGlobal = aGlobal[offsetA];
+    bGlobal = bGlobal[offsetB];
+    cGlobal = cGlobal[offsetC];
+    biasGlobal = biasGlobal[offsetBias];
+    pipe->InitBuffer(reluOutQueue_, 1, tiling.baseM * tiling.baseN * sizeof(cType)); // Init output buffer.
+}
+
+/**
+  * @brief  Main process of matmul calculation
+  * @param  pipe: Global memory and sync management TPipe object.
+  * @retval None
+  */
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulLeakyKernel<aType, bType, cType, biasType>::Process(AscendC::TPipe *pipe)
+{
+    uint32_t computeRound = 0;
+
+    matmulObj.SetTensorA(aGlobal);
+    matmulObj.SetTensorB(bGlobal);
+    matmulObj.SetBias(biasGlobal);
+    while (matmulObj.template Iterate<true>()) { // Once Iterate, compute baseM * baseN, sync is set true here.
+        MatmulCompute(); // Get matmul compute result.
+        LeakyReluCompute(); // Compute leakyRelu.
+        CopyOut(computeRound); // Copy leakyRelu out result to GM.
+        computeRound++;
+    }
+    matmulObj.End();
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulLeakyKernel<aType, bType, cType, biasType>::MatmulCompute()
+{
+    reluOutLocal = reluOutQueue_.AllocTensor<cType>();
+    matmulObj.template GetTensorC<true>(reluOutLocal, false, true);
+}
+
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulLeakyKernel<aType, bType, cType, biasType>::LeakyReluCompute()
+{
+    LeakyRelu(reluOutLocal, reluOutLocal, (cType)0.001, tiling.baseM * tiling.baseN);
+    reluOutQueue_.EnQue(reluOutLocal);
+}
+
+/**
+  * @brief  Copy leakyRelu out result to GM.
+  * @param  count: Iterate count(once Iterate, compute baseM * baseN).
+  * @retval None
+  */
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void MatmulLeakyKernel<aType, bType, cType, biasType>::CopyOut(uint32_t count)
+{
+    reluOutQueue_.DeQue<cType>();
+    const uint32_t roundM = tiling.singleCoreM / tiling.baseM;
+    const uint32_t roundN = tiling.singleCoreN / tiling.baseN;
+    uint32_t startOffset = (count % roundM * tiling.baseM * tiling.N + count / roundM * tiling.baseN);
+    AscendC::DataCopyParams copyParam = {(uint16_t)tiling.baseM, (uint16_t)(tiling.baseN * sizeof(cType) / AscendC::DEFAULT_C0_SIZE), 0,
+                                (uint16_t)((tiling.N - tiling.baseN) * sizeof(cType) / AscendC::DEFAULT_C0_SIZE)};
+    DataCopy(cGlobal[startOffset], reluOutLocal, copyParam);
+    reluOutQueue_.FreeTensor(reluOutLocal);
+}
+
+/**
+  * @brief  Calculate the gm offset based on the blockidx.
+  * @param  blockIdx: Current Core blockidx.
+  * @param  tiling: Matmul tiling data.
+  * @param  offsetA: Gm offset of A matrix.
+  * @param  offsetB: Gm offset of B matrix.
+  * @param  offsetC: Gm offset of C matrix.
+  * @param  offsetBias: Gm offset of Bias matrix.
+  * @retval None
+  */
+template <typename aType, typename bType, typename cType, typename biasType>
+__aicore__ inline void
+MatmulLeakyKernel<aType, bType, cType, biasType>::CalcOffset(int32_t blockIdx, const TCubeTiling &tiling,
+                                                             int32_t &offsetA, int32_t &offsetB, int32_t &offsetC,
+                                                             int32_t &offsetBias)
+{
+    auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM);
+    auto mCoreIndx = blockIdx % mSingleBlocks;
+    auto nCoreIndx = blockIdx / mSingleBlocks;
+
+    offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM;
+    offsetB = nCoreIndx * tiling.singleCoreN;
+    offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN;
+    offsetBias = nCoreIndx * tiling.singleCoreN;
+}
+
+/**
+  * @brief  matmul_leakyrelu kernel function entry
+  * @param  a: A matrix gm addr.
+  * @param  b: B matrix gm addr.
+  * @param  bias: Bias gm addr.
+  * @param  c: Out gm addr.
+  * @param  workspace: Temporary gm space addr required by matmul calc.
+  * @param  tilingGm: Tiling data addr. 
+  * @retval None
+  */
+__global__ __aicore__ void matmul_leakyrelu_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c,
+                                                              GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    AscendC::TPipe pipe;
+    TCubeTiling tiling;
+    CopyTiling(&tiling, tilingGm);
+
+    MatmulLeakyKernel<half, half, float, float> matmulLeakyKernel;
+    matmulLeakyKernel.Init(a, b, bias, c, workspace, tiling, &pipe);
+    REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulLeakyKernel.matmulObj, &matmulLeakyKernel.tiling); // Initialize the matmul object.
+    matmulLeakyKernel.Process(&pipe);
+}
+
+/**
+  * @brief  Generate matmul tiling.
+  * @param  socVersion: Platform socversion.
+  * @param  tilingBuf data buffer.
+  */
+void GenerateTiling(const char *socVersion, uint8_t *tilingBuf)
+{
+    using TPosition = matmul_tiling::TPosition;
+    using CubeFormat = matmul_tiling::CubeFormat;
+    using DataType = matmul_tiling::DataType;
+    using namespace std;
+    int M = 1024;
+    int N = 640;
+    int K = 256;
+
+    TPosition leftPosition = TPosition::GM;
+    CubeFormat leftFormat = CubeFormat::ND;
+    DataType leftDtype = DataType::DT_FLOAT16;
+    bool isTransA = false;
+
+    TPosition rightPosition = TPosition::GM;
+    CubeFormat rightFormat = CubeFormat::ND;
+    DataType rightDtype = DataType::DT_FLOAT16;
+    bool isTransB = false;
+
+    TPosition resultPosition = TPosition::GM;
+    CubeFormat resultFormat = CubeFormat::ND;
+    DataType resultDtype = DataType::DT_FLOAT;
+
+    TPosition biasPosition = TPosition::GM;
+    CubeFormat biasFormat = CubeFormat::ND;
+    DataType biasDtype = DataType::DT_FLOAT;
+    bool isBias = true;
+
+    int usedCoreNum = 2;
+    int baseM = 256;
+    int baseN = 128;
+
+    optiling::TCubeTiling tilingData;
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion);
+    matmul_tiling::MultiCoreMatmulTiling tilingApi(*ascendcPlatform);
+
+    tilingApi.SetDim(usedCoreNum); // Set the number of cores that participate in multi-core computaion is 2.
+    tilingApi.SetAType(leftPosition, leftFormat, leftDtype, isTransA);
+    tilingApi.SetBType(rightPosition, rightFormat, rightDtype, isTransB);
+    tilingApi.SetCType(resultPosition, resultFormat, resultDtype);
+    tilingApi.SetBiasType(biasPosition, biasFormat, biasDtype);
+
+    tilingApi.SetOrgShape(M, N, K);
+    tilingApi.SetShape(M, N, K);
+    tilingApi.SetBias(isBias);
+    tilingApi.SetTraverse(matmul_tiling::MatrixTraverse::FIRSTM); // Set the matmul travse is FIRSTM.
+    tilingApi.SetFixSplit(baseM, baseN, -1); // Set the fixed baseM=128, baseN=256.
+    tilingApi.SetBufferSpace(-1, -1, -1);
+
+    int64_t res = tilingApi.GetTiling(tilingData); // Get matmul tiling data.
+    tilingData.set_stepM(1); // Set the matmul tiling stepM=1.
+    tilingData.set_stepN(1); // Set the matmul tiling stepN=1.
+    if (res == -1) {
+        std::cout << "gen tiling failed" << std::endl;
+    }
+    uint32_t tcubeTilingSize = tilingData.GetDataSize();
+    tilingData.SaveToBuffer(tilingBuf, tcubeTilingSize);
+    return;
+}
+
+int32_t main(int32_t argc, char *argv[])
+{
+    const char *socVersion = "Ascend910B1";
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion);
+    size_t aFileSize = 262144 * sizeof(int16_t);
+    size_t bFileSize = 163840 * sizeof(int16_t);
+    size_t cFileSize = 655360 * sizeof(float);
+    size_t biasFileSize = 640 * sizeof(float);
+    size_t tilingFileSize = sizeof(TCubeTiling);
+    size_t userWorkspaceSize = 0;
+    size_t systemWorkspaceSize = static_cast<size_t>(ascendcPlatform->GetLibApiWorkSpaceSize());
+    size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize;
+    uint8_t *tilingBuf = (uint8_t *)malloc(tilingFileSize);
+    GenerateTiling(socVersion, tilingBuf);
+    uint32_t blockDim = 1;
+
+    aclInit(nullptr);
+    int32_t deviceId = 0;
+    aclrtSetDevice(deviceId);
+    aclrtStream stream = nullptr;
+    aclrtCreateStream(&stream);
+
+    uint8_t *inputAHost;
+    uint8_t *inputADevice;
+    aclrtMallocHost((void **)(&inputAHost), aFileSize);
+    aclrtMalloc((void **)&inputADevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    ReadFile("./input/x1_gm.bin", aFileSize, inputAHost, aFileSize);
+    aclrtMemcpy(inputADevice, aFileSize, inputAHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+    uint8_t *inputBHost;
+    uint8_t *inputBDevice;
+    aclrtMallocHost((void **)(&inputBHost), bFileSize);
+    aclrtMalloc((void **)&inputBDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    ReadFile("./input/x2_gm.bin", bFileSize, inputBHost, bFileSize);
+    aclrtMemcpy(inputBDevice, bFileSize, inputBHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+    uint8_t *outputCHost;
+    uint8_t *outputCDevice;
+    aclrtMallocHost((void **)(&outputCHost), cFileSize);
+    aclrtMalloc((void **)&outputCDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+    uint8_t *inputBiasHost;
+    uint8_t *inputBiasDevice;
+    aclrtMallocHost((void **)(&inputBiasHost), biasFileSize);
+    aclrtMalloc((void **)&inputBiasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    ReadFile("./input/bias.bin", biasFileSize, inputBiasHost, biasFileSize);
+    aclrtMemcpy(inputBiasDevice, biasFileSize, inputBiasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+    uint8_t *tilingHost;
+    uint8_t *tilingDevice;
+    aclrtMallocHost((void **)(&tilingHost), tilingFileSize);
+    aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMemcpy(tilingHost, tilingFileSize, tilingBuf, tilingFileSize, ACL_MEMCPY_HOST_TO_HOST);
+    aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+    uint8_t *workspaceDevice;
+    aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+    matmul_leakyrelu_custom<<<blockDim, nullptr, stream>>>(inputADevice, inputBDevice, inputBiasDevice, outputCDevice,
+                                                           workspaceDevice, tilingDevice);
+
+    aclrtSynchronizeStream(stream);
+
+    aclrtFree(inputADevice);
+    aclrtFreeHost(inputAHost);
+    aclrtFree(inputBDevice);
+    aclrtFreeHost(inputBHost);
+    aclrtMemcpy(outputCHost, cFileSize, outputCDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST);
+    WriteFile("./output/output.bin", outputCHost, cFileSize);
+    aclrtFree(outputCDevice);
+    aclrtFreeHost(outputCHost);
+    aclrtFree(inputBiasDevice);
+    aclrtFreeHost(inputBiasHost);
+    aclrtFree(tilingDevice);
+    aclrtFreeHost(tilingHost);
+    aclrtFree(workspaceDevice);
+
+    aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+    free(tilingBuf);
+    return 0;
+}
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh
new file mode 100755
index 000000000..be900eb02
--- /dev/null
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh
@@ -0,0 +1,8 @@
+rm -rf build
+mkdir -p build
+cd build
+cmake ..
+make -j
+python3 ../scripts/gen_data.py
+./demo
+python3 ../scripts/verify_result.py output/output.bin output/golden.bin
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py
new file mode 100644
index 000000000..e03d4359c
--- /dev/null
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py
@@ -0,0 +1,35 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import numpy as np
+import os
+
+
+def gen_golden_data():
+    M = 1024
+    N = 640
+    K = 256
+
+    input_a = np.random.randint(1, 10, [M, K]).astype(np.float16)
+    input_b = np.random.randint(1, 10, [K, N]).astype(np.float16)
+    input_bias = np.random.randint(1, 10, [N]).astype(np.float32)
+    alpha = 0.001
+    golden = (np.matmul(input_a.astype(np.float32), input_b.astype(np.float32)) + input_bias).astype(np.float32)
+    golden = np.where(golden >= 0, golden, golden * alpha)
+    os.system("mkdir -p input")
+    os.system("mkdir -p output")
+    input_a.tofile("./input/x1_gm.bin")
+    input_b.tofile("./input/x2_gm.bin")
+    input_bias.tofile("./input/bias.bin")
+    golden.tofile("./output/golden.bin")
+
+
+if __name__ == "__main__":
+    gen_golden_data()
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py
new file mode 100644
index 000000000..7a7e27ffa
--- /dev/null
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py
@@ -0,0 +1,55 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import sys
+import numpy as np
+
+# for float32
+relative_tol = 1e-6
+absolute_tol = 1e-9
+error_tol = 1e-4
+
+
+def verify_result(output, golden):
+    output = np.fromfile(output, dtype=np.float32).reshape(-1)
+    golden = np.fromfile(golden, dtype=np.float32).reshape(-1)
+    print("golden : ", golden)
+    print("output : ", output)
+    different_element_results = np.isclose(output,
+                                           golden,
+                                           rtol=relative_tol,
+                                           atol=absolute_tol,
+                                           equal_nan=True)
+    different_element_indexes = np.where(different_element_results == False)[0]
+    for index in range(len(different_element_indexes)):
+        real_index = different_element_indexes[index]
+        golden_data = golden[real_index]
+        output_data = output[real_index]
+        print(
+            "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" %
+            (real_index, golden_data, output_data,
+             abs(output_data - golden_data) / golden_data))
+        if index == 100:
+            break
+    error_ratio = float(different_element_indexes.size) / golden.size
+    print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol))
+    return error_ratio <= error_tol
+
+
+if __name__ == '__main__':
+    try:
+        res = verify_result(sys.argv[1], sys.argv[2])
+        if not res:
+            raise ValueError("[ERROR] result error")
+        else:
+            print("test pass")
+    except Exception as e:
+        print(e)
+        sys.exit(1)
-- 
Gitee


From d8a10e40b7d8849cf93ef9a9a8f7338373140271 Mon Sep 17 00:00:00 2001
From: ApeiriaNode_Booker <renjie88@huawei.com>
Date: Mon, 22 Sep 2025 14:27:12 +0800
Subject: [PATCH 2/2] add simple pybind samples

---
 .../24_simple_hello_world/CMakeLists.txt      |   6 +-
 .../24_simple_hello_world/README.md           |   5 +-
 .../{hello_world.cpp => hello_world.asc}      |   2 +-
 .../25_simple_add/CMakeLists.txt              |   6 +-
 .../0_introduction/25_simple_add/README.md    |   5 +-
 .../{add_custom.cpp => add_custom.asc}        |   2 +-
 .../26_simple_matmulleakyrelu/CMakeLists.txt  |   6 +-
 .../26_simple_matmulleakyrelu/README.md       |   5 +-
 ...mul_leakyrelu.cpp => matmul_leakyrelu.asc} |   2 +-
 .../26_simple_matmulleakyrelu/run.sh          |   8 +-
 .../CMakeLists.txt                            |  63 ++++++++++
 .../27_simple_add_cpp_extensions/README.md    | 102 ++++++++++++++++
 .../add_custom.asc                            | 111 ++++++++++++++++++
 .../add_custom_test.py                        |  38 ++++++
 operator/ascendc/0_introduction/README.md     |   7 +-
 15 files changed, 334 insertions(+), 34 deletions(-)
 rename operator/ascendc/0_introduction/24_simple_hello_world/{hello_world.cpp => hello_world.asc} (97%)
 rename operator/ascendc/0_introduction/25_simple_add/{add_custom.cpp => add_custom.asc} (99%)
 rename operator/ascendc/0_introduction/26_simple_matmulleakyrelu/{matmul_leakyrelu.cpp => matmul_leakyrelu.asc} (99%)
 create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt
 create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md
 create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc
 create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py

diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
index 590f26516..559929405 100644
--- a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
@@ -6,10 +6,6 @@ find_package(ASC REQUIRED)
 
 project(kernel_samples LANGUAGES ASC CXX)
 
-set_source_files_properties(
-    hello_world.cpp PROPERTIES LANGUAGE ASC
-)
-
 add_executable(demo
-    hello_world.cpp
+    hello_world.asc
 )
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
index 20c1ee4c8..ddcaf96e9 100644
--- a/operator/ascendc/0_introduction/24_simple_hello_world/README.md
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
@@ -5,13 +5,12 @@
 ```
 ├── 24_simple_helloworld
 │   ├── CMakeLists.txt      // 编译工程文件
-│   └── hello_world.cpp     // 算子实现及测试
+│   └── hello_world.asc     // AscendC算子实现 & 调用样例
 ```
 
 ## 支持的产品型号
 本样例支持如下产品型号：
-- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
-- Atlas A3 训练系列产品/Atlas A3 推理系列产品
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品
 
 ## 运行样例算子
   - 打开样例目录
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.asc
similarity index 97%
rename from operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
rename to operator/ascendc/0_introduction/24_simple_hello_world/hello_world.asc
index cb28dec3d..e67f32663 100644
--- a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.asc
@@ -1,5 +1,5 @@
 /**
- * @file hello_world.cpp
+ * @file hello_world.asc
  *
  * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
  *
diff --git a/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt
index b3e88f157..08689321b 100644
--- a/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt
+++ b/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt
@@ -6,10 +6,6 @@ find_package(ASC REQUIRED)
 
 project(kernel_samples LANGUAGES ASC CXX)
 
-set_source_files_properties(
-    add_custom.cpp PROPERTIES LANGUAGE ASC
-)
-
 add_executable(demo
-    add_custom.cpp
+    add_custom.asc
 )
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/25_simple_add/README.md b/operator/ascendc/0_introduction/25_simple_add/README.md
index 6f6600a15..470c3edef 100644
--- a/operator/ascendc/0_introduction/25_simple_add/README.md
+++ b/operator/ascendc/0_introduction/25_simple_add/README.md
@@ -5,7 +5,7 @@
 ```
 ├── 25_simple_add
 │   ├── CMakeLists.txt      // 编译工程文件
-│   └── add_custom.cpp      // 算子实现及测试
+│   └── add_custom.asc      // AscendC算子实现 & 调用样例
 ```
 
 ## 算子描述
@@ -41,8 +41,7 @@ z = x + y
 
 ## 支持的产品型号
 本样例支持如下产品型号：
-- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
-- Atlas A3 训练系列产品/Atlas A3 推理系列产品
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品
 
 ## 运行样例算子
   - 打开样例目录   
diff --git a/operator/ascendc/0_introduction/25_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/25_simple_add/add_custom.asc
similarity index 99%
rename from operator/ascendc/0_introduction/25_simple_add/add_custom.cpp
rename to operator/ascendc/0_introduction/25_simple_add/add_custom.asc
index d2b5cb112..544e134f3 100644
--- a/operator/ascendc/0_introduction/25_simple_add/add_custom.cpp
+++ b/operator/ascendc/0_introduction/25_simple_add/add_custom.asc
@@ -1,5 +1,5 @@
 /**
- * @file add_custom.cpp
+ * @file add_custom.asc
  *
  * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
  *
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt
index c833ed1fd..2958d3a02 100644
--- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt
@@ -6,12 +6,8 @@ find_package(ASC REQUIRED)
 
 project(kernel_samples LANGUAGES ASC CXX)
 
-set_source_files_properties(
-    matmul_leakyrelu.cpp PROPERTIES LANGUAGE ASC
-)
-
 add_executable(demo
-    matmul_leakyrelu.cpp
+    matmul_leakyrelu.asc
 )
 
 target_link_libraries(demo PRIVATE
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md
index a8b9dbd70..da5b62293 100644
--- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md
@@ -7,7 +7,7 @@
 ├── 26_simple_matmulleakyrelu
 │   ├── CMakeLists.txt          // 编译工程文件
 │   ├── data_utils.h            // 数据读入写出函数
-│   ├── matmul_leakyrelu.cpp    // 算子实现及测试
+│   ├── matmul_leakyrelu.asc    // AscendC算子实现 & 调用样例
 │   ├── run.sh                  // 运行脚本
 │   └── scripts
 │       ├── gen_data.py         // 输入数据和真值数据生成脚本文件
@@ -58,8 +58,7 @@ C = C > 0 ? C : C * 0.001
 
 ## 支持的产品型号
 本样例支持如下产品型号：
-- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件
-- Atlas A3 训练系列产品/Atlas A3 推理系列产品
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品
 
 ## 运行样例算子
   - 打开样例目录   
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.asc
similarity index 99%
rename from operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp
rename to operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.asc
index 6c1e6136a..430d62328 100644
--- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.asc
@@ -1,5 +1,5 @@
 /**
- * @file matmul_leakyrelu_custom.cpp
+ * @file matmul_leakyrelu_custom.asc
  *
  * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
  *
diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh
index be900eb02..fe03bb21f 100755
--- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh
+++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh
@@ -1,8 +1,6 @@
-rm -rf build
-mkdir -p build
-cd build
-cmake ..
-make -j
+#!/bin/bash
+rm -rf build; mkdir -p build; cd build
+cmake ..; make -j
 python3 ../scripts/gen_data.py
 ./demo
 python3 ../scripts/verify_result.py output/output.bin output/golden.bin
diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt
new file mode 100644
index 000000000..96c61cc34
--- /dev/null
+++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt
@@ -0,0 +1,63 @@
+cmake_minimum_required(VERSION 3.16)
+
+set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version")
+
+find_package(ASC REQUIRED)
+
+execute_process(COMMAND python3 -c "import os; import torch; print(os.path.dirname(torch.__file__))"
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE TORCH_PATH
+)
+message("TORCH_PATH is ${TORCH_PATH}")
+
+execute_process(COMMAND python3 -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))"
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE TORCH_NPU_PATH
+)
+message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
+
+execute_process(COMMAND python3 -m pybind11 --includes
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE PYBIND11_INC
+)
+string(REPLACE " " ";" PYBIND11_INC ${PYBIND11_INC})
+
+execute_process(COMMAND python3-config --extension-suffix
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    OUTPUT_VARIABLE PYBIND11_SUFFIX
+)
+
+project(kernel_samples LANGUAGES ASC CXX)
+
+add_library(pybind11_lib SHARED
+    add_custom.asc
+)
+
+target_link_libraries(pybind11_lib PRIVATE
+    torch_npu
+)
+
+target_link_directories(pybind11_lib PRIVATE
+    ${TORCH_PATH}/lib
+    ${TORCH_NPU_PATH}/lib
+)
+
+target_include_directories(pybind11_lib PRIVATE
+    ${TORCH_NPU_PATH}/include
+    ${TORCH_PATH}/include
+    ${TORCH_PATH}/include/torch/csrc/api/include
+)
+
+target_compile_definitions(pybind11_lib PRIVATE
+    _GLIBCXX_USE_CXX11_ABI=0
+)
+
+target_compile_options(pybind11_lib PRIVATE
+    ${PYBIND11_INC}
+    -fPIC
+)
+
+set_target_properties(pybind11_lib PROPERTIES
+    OUTPUT_NAME add_custom${PYBIND11_SUFFIX}
+    PREFIX "" SUFFIX ""
+)
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md
new file mode 100644
index 000000000..3769caeb1
--- /dev/null
+++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md
@@ -0,0 +1,102 @@
+## 简化Pybind算子直调样例
+本样例使用pybind方式调用核函数，以带有Tiling的Add算子为示例，展示了一种更为简单的算子编译流程，支持main函数和Kernel函数在同一个cpp文件中实现。
+> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。
+
+## 目录结构介绍
+```
+├── 27_simple_add_cpp_extensions
+│   ├── CMakeLists.txt        // 编译工程文件
+│   ├── add_custom_test.py    // python调用脚本
+│   ├── add_custom.asc        // AscendC算子实现 & Pybind封装
+│   └── run.sh                // 编译运行算子的脚本
+```
+## 代码实现介绍
+- kernel实现  
+  Add算子的数学表达式为：
+  ```
+  z = x + y
+  ```
+  计算逻辑是：Ascend C提供的矢量计算接口的操作元素都为LocalTensor，输入数据需要先搬运进片上存储，然后使用计算接口完成两个输入参数相加，得到最终结果，再搬出到外部存储上。
+
+  Add算子的实现流程分为3个基本任务：CopyIn，Compute，CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory，分别存储在xLocal、yLocal，Compute任务负责对xLocal、yLocal执行加法操作，计算结果存储在zLocal中，CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.asc](./add_custom.asc)。
+
+- 调用实现  
+  通过PyTorch框架进行模型的训练、推理时，会调用到很多算子进行计算，调用方式也和kernel编译流程相关。对于自定义算子工程，需要使用PyTorch Ascend Adapter中的OP-Plugin算子插件对功能进行扩展，让torch可以直接调用自定义算子包中的算子；对于简化KernelLaunch开放式算子编程的方式，也可以使用pytorch调用，此样例演示的就是这种算子调用方式。
+
+  pybind11.cpp文件是一个C++的代码示例，使用了pybind11库来将C++代码封装成Python模块。该代码实现中定义了一个名为m的pybind11模块，其中包含一个名为run_add_custom的函数。该函数与my_add::run_add_custom函数相同，用于将C++函数转成Python函数。在函数实现中，通过c10_npu::getCurrentNPUStream() 的函数获取当前NPU上的流，通过内核调用符<<<>>>调用自定义的Kernel函数add_custom，在NPU上执行算子。
+
+  在add_custom_test.py调用脚本中，通过导入自定义模块add_custom，调用自定义模块add_custom中的run_add_custom函数，在NPU上执行x和y的加法操作，并将结果保存在变量z中。
+
+## 支持的产品型号
+本样例支持如下产品型号：
+- Atlas A2 训练系列产品/Atlas 800I A2 推理产品
+
+## 运行样例算子
+  - 安装pytorch (这里使用2.1.0版本为例)
+
+    **aarch64:**
+
+    ```bash
+    pip3 install torch==2.1.0
+    ```
+
+    **x86:**
+
+    ```bash
+    pip3 install torch==2.1.0+cpu  --index-url https://download.pytorch.org/whl/cpu
+    ```
+
+  - 安装torch-npu （以Pytorch2.1.0、python3.9、CANN版本8.0.RC1.alpha002为例）
+
+    ```bash
+    git clone https://gitee.com/ascend/pytorch.git -b v6.0.rc1.alpha002-pytorch2.1.0
+    cd pytorch/
+    bash ci/build.sh --python=3.9
+    pip3 install dist/*.whl
+    ```
+
+    安装pybind11
+    ```bash
+    pip3 install pybind11
+    ```
+
+  - 打开样例目录   
+    以命令行方式下载样例代码，master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/27_simple_add_cpp_extensions
+    ```
+
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量的命令。
+    - 默认路径，root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+    配置按安装径后，执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+
+  - 样例执行
+    ```bash
+    rm -rf build; mkdir -p build; cd build  # 创建并进入build目录
+    cmake ..; make -j                       # 编译算子so
+    python3 ../add_custom_test.py           # 执行样例
+    ```
+
+## 更新说明
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/09/22 | 新增本readme |
diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc
new file mode 100644
index 000000000..d4a076832
--- /dev/null
+++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc
@@ -0,0 +1,111 @@
+/**
+ * @file add_custom.asc
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include <pybind11/pybind11.h>
+#include <torch/extension.h>
+
+#include "torch_npu/csrc/core/npu/NPUStream.h"
+#include "kernel_operator.h"
+
+constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength)
+    {
+        this->blockLength = totalLength / AscendC::GetBlockNum();
+        this->tileNum = 8;
+        this->tileLength = this->blockLength / this->tileNum / BUFFER_NUM;
+        xGm.SetGlobalBuffer((__gm__ half *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        yGm.SetGlobalBuffer((__gm__ half *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        zGm.SetGlobalBuffer((__gm__ half *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(half));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(half));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(half));
+    }
+    __aicore__ inline void Process()
+    {
+        int32_t loopCount = this->tileNum * BUFFER_NUM;
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        AscendC::LocalTensor<half> xLocal = inQueueX.AllocTensor<half>();
+        AscendC::LocalTensor<half> yLocal = inQueueY.AllocTensor<half>();
+        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
+        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
+        AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
+        AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();
+        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
+        outQueueZ.EnQue<half>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        AscendC::LocalTensor<half> zLocal = outQueueZ.DeQue<half>();
+        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<half> xGm;
+    AscendC::GlobalTensor<half> yGm;
+    AscendC::GlobalTensor<half> zGm;
+    uint32_t blockLength;
+    uint32_t tileNum;
+    uint32_t tileLength;
+};
+
+__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength)
+{
+    KernelAdd op;
+    op.Init(x, y, z, totalLength);
+    op.Process();
+}
+
+namespace my_add {
+at::Tensor run_add_custom(const at::Tensor &x, const at::Tensor &y)
+{
+    auto aclStream = c10_npu::getCurrentNPUStream().stream(false);
+    at::Tensor z = at::empty_like(x);
+    uint32_t blockDim = 8;
+    uint32_t totalLength = 1;
+    for (uint32_t size : x.sizes()) {
+        totalLength *= size;
+    }
+    auto xGm = static_cast<uint8_t *>(const_cast<void *>(x.storage().data()));
+    auto yGm = static_cast<uint8_t *>(const_cast<void *>(y.storage().data()));
+    auto zGm = static_cast<uint8_t *>(const_cast<void *>(z.storage().data()));
+    add_custom<<<blockDim, nullptr, aclStream>>>(xGm, yGm, zGm, totalLength);
+    return z;
+}
+} // namespace my_add
+
+PYBIND11_MODULE(add_custom, m)
+{
+    m.doc() = "add_custom pybind11 interfaces"; // optional module docstring
+    m.def("run_add_custom", &my_add::run_add_custom, "");
+}
diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py
new file mode 100644
index 000000000..b5d63dee7
--- /dev/null
+++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+# coding=utf-8
+#
+# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# ===============================================================================
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+import sys, os
+
+sys.path.append(os.getcwd())
+import add_custom
+
+torch.npu.config.allow_internal_format = False
+
+
+class TestCustomAdd(TestCase):
+
+    def test_add_custom_ops(self):
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float16)
+        y = torch.rand(length, device='cpu', dtype=torch.float16)
+
+        x_npu = x.npu()
+        y_npu = y.npu()
+        output = add_custom.run_add_custom(x_npu, y_npu)
+        cpuout = torch.add(x, y)
+
+        self.assertRtolEqual(output, cpuout)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/operator/ascendc/0_introduction/README.md b/operator/ascendc/0_introduction/README.md
index 44a722d77..0b9383e71 100644
--- a/operator/ascendc/0_introduction/README.md
+++ b/operator/ascendc/0_introduction/README.md
@@ -38,7 +38,10 @@
 | [21_vectoradd_kernellaunch](./21_vectoradd_kernellaunch) | 基于Ascend C的Add多场景自定义Vector算子的KernelLaunch调用样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品
 | [22_baremix_kernellaunch](./22_baremix_kernellaunch) | 通过更底层的编码方式，实现MatmulLeayrelu融合算子的样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品
 | [23_static_tensor_programming_kernellaunch](./23_static_tensor_programming_kernellaunch) | 通过静态Tensor编程方式，实现Add算子的样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品
-
+| [24_simple_hello_world](./24_simple_hello_world) | Ascend C异构混合编程样例, 实现Hello World算子及调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品
+| [25_simple_add](./25_simple_add) | Ascend C异构混合编程样例, 实现Add自定义Vector算子及调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品
+| [26_simple_matmulleakyrelu](./26_simple_matmulleakyrelu) | Ascend C异构混合编程样例, 实现MatmulLeakyRelu自定义Cube+Vector算子及调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品
+| [27_simple_add_cpp_extensions](./27_simple_add_cpp_extensions) | Ascend C异构混合编程样例, 实现Add自定义Vector算子动态库及pybind调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品
 ## 获取样例代码<a name="codeready"></a>
 
  可以使用以下两种方式下载，请选择其中一种进行源码准备。
@@ -72,4 +75,4 @@
 | 2025/01/06 | 新增21_vectoradd_kernellaunch样例  |
 | 2025/07/22 | 新增8_library_frameworklaunch样例       |
 | 2025/7/28 | 新增22_baremix_kernellaunch                   |
-
+| 2025/9/22 | 新增Ascend C异构混合编程样例24-27                   |
\ No newline at end of file
-- 
Gitee