diff --git a/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b3e88f1572b89f6f9d2a2862accd54baf0951847
--- /dev/null
+++ b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.16)
+
+set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version")
+
+find_package(ASC REQUIRED)
+
+project(kernel_samples LANGUAGES ASC CXX)
+
+set_source_files_properties(
+    add_custom.cpp PROPERTIES LANGUAGE ASC
+)
+
+add_executable(demo
+    add_custom.cpp
+)
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/23_simple_add/README.md b/operator/ascendc/0_introduction/23_simple_add/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b3dce29fda7257aabe0a2bdee6b8c0d806faefb
--- /dev/null
+++ b/operator/ascendc/0_introduction/23_simple_add/README.md
@@ -0,0 +1,86 @@
+## 简化Add算子直调样例
+本样例以Add算子为示例，展示了一种更为简单的算子编译流程，支持main函数和Kernel函数在同一个cpp文件中实现。
+> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。
+## 目录结构介绍
+```
+├── 23_simple_add
+│   ├── CMakeLists.txt      // 编译工程文件
+│   └── add_custom.cpp      // 算子实现及测试
+```
+
+## 算子描述
+Add算子实现了两个数据相加，返回相加结果的功能。对应的数学表达式为：  
+```
+z = x + y
+```
+## 算子规格描述
+<table>
+<tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="4" align="center">Add</td></tr>
+</tr>
+<tr><td rowspan="3" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
+<tr><td align="center">x</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">y</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+</tr>
+<tr><td rowspan="1" align="center">算子输出</td><td align="center">z</td><td align="center">8 * 2048</td><td align="center">float</td><td align="center">ND</td></tr>
+</tr>
+<tr><td rowspan="1" align="center">核函数名</td><td colspan="4" align="center">add_custom</td></tr>
+</table>
+
+## 代码实现介绍
+- kernel实现  
+  Add算子的数学表达式为：
+  计算逻辑是：Ascend C提供的矢量计算接口的操作元素都为LocalTensor，输入数据需要先搬运进片上存储，然后使用计算接口完成两个输入参数相加，得到最终结果，再搬出到外部存储上。
+
+  Add算子的实现流程分为3个基本任务：CopyIn，Compute，CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory，分别存储在xLocal、yLocal，Compute任务负责对xLocal、yLocal执行加法操作，计算结果存储在zLocal中，CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。
+- tiling实现  
+  TilingData参数设计，TilingData参数本质上是和并行数据切分相关的参数，本示例算子使用了2个tiling参数：totalLength、tileNum。totalLength是指需要计算的数据量大小，tileNum是指每个核上总计算数据分块个数。比如，totalLength这个参数传递到kernel侧后，可以通过除以参与计算的核数，得到每个核上的计算量，这样就完成了多核数据的切分。
+
+- 调用实现  
+  使用内核调用符<<<>>>调用核函数。
+
+## 支持的产品型号
+本样例支持如下产品型号：
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+
+## 运行样例算子
+  - 打开样例目录   
+    以命令行方式下载样例代码，master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_simple_add/
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量的命令。
+    - 默认路径，root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+    配置按安装径后，执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+
+  - 样例执行
+    ```bash
+    mkdir -p build && cd build;   # 创建并进入build目录
+    cmake ..;make -j;             # 编译工程
+    ./demo                        # 执行样例
+    ```
+
+## 更新说明
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/09/15 | 新增本readme |
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d2b5cb112b1457f6a9a007b66ff3683e0dc09704
--- /dev/null
+++ b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp
@@ -0,0 +1,180 @@
+/**
+ * @file add_custom.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <cstdint>
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <iterator>
+#include "acl/acl.h"
+#include "kernel_operator.h"
+
+constexpr uint32_t BUFFER_NUM = 2; // tensor num for each queue
+
+struct AddCustomTilingData
+{
+    uint32_t totalLength;
+    uint32_t tileNum;
+};
+
+class KernelAdd {
+public:
+    __aicore__ inline KernelAdd() {}
+    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum)
+    {
+        this->blockLength = totalLength / AscendC::GetBlockNum();
+        this->tileNum = tileNum;
+        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
+        xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
+        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float));
+        pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float));
+        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float));
+    }
+    __aicore__ inline void Process()
+    {
+        int32_t loopCount = this->tileNum * BUFFER_NUM;
+        for (int32_t i = 0; i < loopCount; i++) {
+            CopyIn(i);
+            Compute(i);
+            CopyOut(i);
+        }
+    }
+
+private:
+    __aicore__ inline void CopyIn(int32_t progress)
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.AllocTensor<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.AllocTensor<float>();
+        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
+        AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength);
+        inQueueX.EnQue(xLocal);
+        inQueueY.EnQue(yLocal);
+    }
+    __aicore__ inline void Compute(int32_t progress)
+    {
+        AscendC::LocalTensor<float> xLocal = inQueueX.DeQue<float>();
+        AscendC::LocalTensor<float> yLocal = inQueueY.DeQue<float>();
+        AscendC::LocalTensor<float> zLocal = outQueueZ.AllocTensor<float>();
+        AscendC::Add(zLocal, xLocal, yLocal, this->tileLength);
+        outQueueZ.EnQue<float>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+        inQueueY.FreeTensor(yLocal);
+    }
+    __aicore__ inline void CopyOut(int32_t progress)
+    {
+        AscendC::LocalTensor<float> zLocal = outQueueZ.DeQue<float>();
+        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
+        outQueueZ.FreeTensor(zLocal);
+    }
+
+private:
+    AscendC::TPipe pipe;
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
+    AscendC::GlobalTensor<float> xGm;
+    AscendC::GlobalTensor<float> yGm;
+    AscendC::GlobalTensor<float> zGm;
+    uint32_t blockLength;
+    uint32_t tileNum;
+    uint32_t tileLength;
+};
+
+__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    KernelAdd op;
+    op.Init(x, y, z, tiling.totalLength, tiling.tileNum);
+    op.Process();
+}
+
+std::vector<float> kernel_add(std::vector<float> &x, std::vector<float> &y)
+{
+    constexpr uint32_t blockDim = 8;
+    uint32_t totalLength = x.size();
+    size_t totalByteSize = totalLength * sizeof(float);
+    int32_t deviceId = 0;
+    aclrtStream stream = nullptr;
+    AddCustomTilingData tiling = {/*totalLength:*/totalLength, /*tileNum:*/8};
+    uint8_t *xHost = reinterpret_cast<uint8_t *>(x.data());
+    uint8_t *yHost = reinterpret_cast<uint8_t *>(y.data());
+    uint8_t *zHost = nullptr;
+    uint8_t *xDevice = nullptr;
+    uint8_t *yDevice = nullptr;
+    uint8_t *zDevice = nullptr;
+
+    aclInit(nullptr);
+    aclrtSetDevice(deviceId);
+    aclrtCreateStream(&stream);
+
+    aclrtMallocHost((void **)(&zHost), totalByteSize);
+    aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST);
+
+    aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
+    aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE);
+
+    add_custom<<<blockDim, nullptr, stream>>>(xDevice, yDevice, zDevice, tiling);
+    aclrtSynchronizeStream(stream);
+
+    aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST);
+    std::vector<float> z((float *)zHost, (float *)(zHost + totalLength));
+
+    aclrtFree(xDevice);
+    aclrtFree(yDevice);
+    aclrtFree(zDevice);
+    aclrtFreeHost(zHost);
+
+    aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    return z;
+}
+
+uint32_t VerifyResult(std::vector<float> &output, std::vector<float> &golden)
+{
+    auto printTensor = [](std::vector<float> &tensor, const char *name) {
+        constexpr size_t maxPrintSize = 20;
+        std::cout << name << ": ";
+        std::copy(tensor.begin(), tensor.begin() + std::min(tensor.size(), maxPrintSize),
+            std::ostream_iterator<float>(std::cout, " "));
+        if (tensor.size() > maxPrintSize) {
+            std::cout << "...";
+        }
+        std::cout << std::endl;
+    };
+    printTensor(output, "Output");
+    printTensor(golden, "Golden");
+    if (std::equal(output.begin(), output.end(), golden.begin())) {
+        std::cout << "[Success] Case accuracy is verification passed." << std::endl;
+        return 0;
+    } else {
+        std::cout << "[Failed] Case accuracy is verification failed!" << std::endl;
+        return 1;
+    }
+    return 0;
+}
+
+int32_t main(int32_t argc, char *argv[])
+{
+    constexpr uint32_t totalLength = 8 * 2048;
+    constexpr float valueX = 1.2f;
+    constexpr float valueY = 2.3f;
+    std::vector<float> x(totalLength, valueX);
+    std::vector<float> y(totalLength, valueY);
+
+    std::vector<float> output = kernel_add(x, y);
+
+    std::vector<float> golden(totalLength, valueX + valueY);
+    return VerifyResult(output, golden);
+}
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..590f26516ae6a3fb1703bb7741bb3c9c19b15651
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.16)
+
+set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version")
+
+find_package(ASC REQUIRED)
+
+project(kernel_samples LANGUAGES ASC CXX)
+
+set_source_files_properties(
+    hello_world.cpp PROPERTIES LANGUAGE ASC
+)
+
+add_executable(demo
+    hello_world.cpp
+)
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..53ab1d8cc81ef6441bec25928132f2512a5df9a0
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
@@ -0,0 +1,54 @@
+## 简化HelloWorld算子直调样例
+本样例通过使用<<<>>>内核调用符来完成算子核函数在NPU侧运行验证的基础流程，核函数内通过printf打印输出结果。
+> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。
+## 目录结构介绍
+```
+├── 24_simple_helloworld
+│   ├── CMakeLists.txt      // 编译工程文件
+│   └── hello_world.cpp     // 算子实现及测试
+```
+
+## 支持的产品型号
+本样例支持如下产品型号：
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+
+## 运行样例算子
+  - 打开样例目录
+    以命令行方式下载样例代码，master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_helloworld/
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware)，选择对应配置环境变量的命令。
+    - 默认路径，root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+      ```
+    - 默认路径，非root用户安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+      ```
+    - 指定路径install_path，安装CANN软件包
+      ```bash
+      export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+      ```
+    配置按安装径后，执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+  - 样例执行
+    ```bash
+    mkdir -p build && cd build;   # 创建并进入build目录
+    cmake ..;make -j;             # 编译工程
+    ./demo                        # 执行样例
+    ```
+
+## 更新说明
+| 时间       | 更新事项     |
+| ---------- | ------------ |
+| 2025/09/15 | 新增本readme |
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cb28dec3d6dd069cc9d1bbfd2484045839d9e3f5
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
@@ -0,0 +1,35 @@
+/**
+ * @file hello_world.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "kernel_operator.h"
+#include "acl/acl.h"
+
+__global__ __aicore__ void hello_world()
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY);
+    AscendC::printf("Hello World!!!\n");
+}
+
+int32_t main(int argc, char const *argv[])
+{
+    aclInit(nullptr);
+    int32_t deviceId = 0;
+    aclrtSetDevice(deviceId);
+    aclrtStream stream = nullptr;
+    aclrtCreateStream(&stream);
+
+    constexpr uint32_t blockDim = 1;
+    hello_world<<<blockDim, nullptr, stream>>>();
+    aclrtSynchronizeStream(stream);
+
+    aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+    return 0;
+}
\ No newline at end of file

算子类型(OpType)	Add
算子输入	name	shape	data type	format
	x	8 * 2048	float	ND
	y	8 * 2048	float	ND
算子输出	z	8 * 2048	float	ND
核函数名	add_custom