From 7a634125028d7f7ed9e60970b480a67ed25419d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=99=86?= Date: Mon, 15 Sep 2025 11:47:35 +0000 Subject: [PATCH 1/7] =?UTF-8?q?!2762=20uniquecust=20modify=20infershape=20?= =?UTF-8?q?Merge=20pull=20request=20!2762=20from=20=E5=88=98=E9=99=86/mast?= =?UTF-8?q?er?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../4_op_dev/1_custom_op/op_proto/unique_cust.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h b/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h index 6d18e6739..5eee4e1d2 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h @@ -19,7 +19,20 @@ #include "graph/operator_reg.h" namespace ge { +/** +*@brief Finds unique elements in a 1D tensor. \n +*@par Inputs: +*x: 1D tensor. Support all types mentioned in TensorType. +*Input "x" is a 1-dimensional tensor. \n +*@par Attributes: +*out_idx: An optional DType from: "int32, int64". Defaults to "int32". \n +*@par Outputs: +*@li y: A tensor containing the unique elements of "x". +*@li idx: A tensor the same size as "x". The index of each value of "x" in the unique output "y". 
\n +*/ REG_OP(UniqueCust) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) -- Gitee From 6bf58b5ca3bc8d072ed04eb83853125561eefa33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AE=81?= Date: Mon, 15 Sep 2025 11:59:49 +0000 Subject: [PATCH 2/7] =?UTF-8?q?!2765=20sync=20pydflow=20code=20Merge=20pul?= =?UTF-8?q?l=20request=20!2765=20from=20=E6=9D=8E=E5=AE=81/dev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- inference/dataflow/py_dflow/CMakeLists.txt | 2 + inference/dataflow/py_dflow/build.sh | 21 ++++- .../py_dflow/cmake/intf_pub_linux.cmake | 7 +- .../py_dflow/cmake/modules/Finddflow.cmake | 85 +++++++++++++++++++ .../py_dflow/cmake/modules/Findmetadef.cmake | 23 +---- .../py_dflow/python/dataflow/dataflow.py | 9 +- .../python/dataflow/tools/tpl/tpl_cmake.py | 2 +- .../dataflow/py_dflow/wrapper/CMakeLists.txt | 15 ++-- .../wrapper/flow_func_wrapper/CMakeLists.txt | 8 +- 9 files changed, 134 insertions(+), 38 deletions(-) create mode 100644 inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake diff --git a/inference/dataflow/py_dflow/CMakeLists.txt b/inference/dataflow/py_dflow/CMakeLists.txt index 5323a373d..69cf67fa3 100644 --- a/inference/dataflow/py_dflow/CMakeLists.txt +++ b/inference/dataflow/py_dflow/CMakeLists.txt @@ -39,12 +39,14 @@ if (BUILD_OPEN_PROJECT) include(cmake/test_funcs.cmake) include(cmake/intf_pub_linux.cmake) include(cmake/modules/Findair.cmake) + include(cmake/modules/Finddflow.cmake) include(cmake/modules/Findmetadef.cmake) include(cmake/modules/Findparser.cmake) # 自研软件包 find_package(slog MODULE REQUIRED) find_package(air MODULE REQUIRED) + find_package(dflow MODULE REQUIRED) find_package(metadef MODULE REQUIRED) find_package(parser MODULE REQUIRED) find_package(udf MODULE REQUIRED) diff --git a/inference/dataflow/py_dflow/build.sh b/inference/dataflow/py_dflow/build.sh index b236b2872..87b29aecc 100755 --- 
a/inference/dataflow/py_dflow/build.sh +++ b/inference/dataflow/py_dflow/build.sh @@ -23,12 +23,15 @@ BUILD_RELATIVE_PATH="build" # print usage message usage() { echo "Usage:" - echo " sh build.sh [-h | --help] [-v | --verbose] [-j] [--ascend_install_path=] [--output_path=] [--python_path=]" + echo " sh build.sh [-h | --help] [-v | --verbose] [-j] [--build_type=]" + echo " [--ascend_install_path=] [--output_path=] [--python_path=]" echo "" echo "Options:" echo " -h, --help Print usage" echo " -v, --verbose Display build command" echo " -j Set the number of threads used for building DFlow, default is 8" + echo " --build_type=" + echo " Set build type, default Release" echo " --ascend_install_path=" echo " Set ascend package install path, default /usr/local/Ascend/ascend-toolkit/latest" echo " --output_path=" @@ -38,6 +41,17 @@ usage() { echo "" } +# check value of build_type option +# usage: check_build_type build_type +check_build_type() { + arg_value="$1" + if [ "X$arg_value" != "XRelease" ] && [ "X$arg_value" != "XDebug" ]; then + echo "Invalid value $arg_value for option --$2" + usage + exit 1 + fi +} + # parse and set options checkopts() { VERBOSE="" @@ -67,6 +81,11 @@ checkopts() { VERBOSE="VERBOSE=1" shift ;; + --build_type) + check_build_type "$2" build_type + CMAKE_BUILD_TYPE="$2" + shift 2 + ;; --ascend_install_path) ASCEND_INSTALL_PATH="$(realpath $2)" shift 2 diff --git a/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake b/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake index 4c9b502e9..6561a3ac6 100644 --- a/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake +++ b/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake @@ -9,13 +9,12 @@ target_compile_options(intf_pub INTERFACE -Wall -fPIC $,-fstack-protector-all,-fstack-protector-strong> - $<$:-std=c++11> + $<$:-std=c++17> ) target_compile_definitions(intf_pub INTERFACE - _GLIBCXX_USE_CXX11_ABI=0 + _GLIBCXX_USE_CXX11_ABI=0 $<$:CFG_BUILD_NDEBUG> - $<$:CFG_BUILD_DEBUG> - WIN64=1 + 
$<$:CFG_BUILD_DEBUG> LINUX=0 LOG_CPP ) diff --git a/inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake b/inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake new file mode 100644 index 000000000..aa51a2793 --- /dev/null +++ b/inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake @@ -0,0 +1,85 @@ +if (dflow_FOUND) + message(STATUS "Package dflow has been found.") + return() +endif() + +set(_cmake_targets_defined "") +set(_cmake_targets_not_defined "") +set(_cmake_expected_targets "") +foreach(_cmake_expected_target IN ITEMS flow_graph dflow_headers) + list(APPEND _cmake_expected_targets "${_cmake_expected_target}") + if(TARGET "${_cmake_expected_target}") + list(APPEND _cmake_targets_defined "${_cmake_expected_target}") + else() + list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}") + endif() +endforeach() +unset(_cmake_expected_target) + +if(_cmake_targets_defined STREQUAL _cmake_expected_targets) + unset(_cmake_targets_defined) + unset(_cmake_targets_not_defined) + unset(_cmake_expected_targets) + unset(CMAKE_IMPORT_FILE_VERSION) + cmake_policy(POP) + return() +endif() + +if(NOT _cmake_targets_defined STREQUAL "") + string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}") + string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}") + message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n") +endif() +unset(_cmake_targets_defined) +unset(_cmake_targets_not_defined) +unset(_cmake_expected_targets) + +find_path(_INCLUDE_DIR + NAMES flow_graph/data_flow.h + NO_CMAKE_SYSTEM_PATH + NO_CMAKE_FIND_ROOT_PATH) + +find_library(flow_graph_SHARED_LIBRARY + NAMES libflow_graph.so + PATH_SUFFIXES lib64 + NO_CMAKE_SYSTEM_PATH + NO_CMAKE_FIND_ROOT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(dflow + FOUND_VAR + 
dflow_FOUND + REQUIRED_VARS + _INCLUDE_DIR + flow_graph_SHARED_LIBRARY +) + +if(dflow_FOUND) + set(dflow_INCLUDE_DIR "${_INCLUDE_DIR}") + include(CMakePrintHelpers) + message(STATUS "Variables in dflow module:") + cmake_print_variables(dflow_INCLUDE_DIR) + cmake_print_variables(flow_graph_SHARED_LIBRARY) + + add_library(flow_graph SHARED IMPORTED) + set_target_properties(flow_graph PROPERTIES + INTERFACE_LINK_LIBRARIES "dflow_headers" + IMPORTED_LOCATION "${flow_graph_SHARED_LIBRARY}" + ) + + add_library(dflow_headers INTERFACE IMPORTED) + set_target_properties(dflow_headers PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${dflow_INCLUDE_DIR}" + ) + + include(CMakePrintHelpers) + cmake_print_properties(TARGETS flow_graph + PROPERTIES INTERFACE_LINK_LIBRARIES IMPORTED_LOCATION + ) + cmake_print_properties(TARGETS dflow_headers + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ) +endif() + +# Cleanup temporary variables. +set(_INCLUDE_DIR) diff --git a/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake b/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake index 750835bc7..67f9a3dfd 100644 --- a/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake +++ b/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake @@ -1,12 +1,12 @@ if (metadef_FOUND) - message(STATUS "Package air has been found.") + message(STATUS "Package metadef has been found.") return() endif() set(_cmake_targets_defined "") set(_cmake_targets_not_defined "") set(_cmake_expected_targets "") -foreach(_cmake_expected_target IN ITEMS flow_graph metadef_headers) +foreach(_cmake_expected_target IN ITEMS metadef_headers) list(APPEND _cmake_expected_targets "${_cmake_expected_target}") if(TARGET "${_cmake_expected_target}") list(APPEND _cmake_targets_defined "${_cmake_expected_target}") @@ -35,13 +35,7 @@ unset(_cmake_targets_not_defined) unset(_cmake_expected_targets) find_path(_INCLUDE_DIR - NAMES flow_graph/data_flow.h - NO_CMAKE_SYSTEM_PATH - NO_CMAKE_FIND_ROOT_PATH) - 
-find_library(flow_graph_SHARED_LIBRARY - NAMES libflow_graph.so - PATH_SUFFIXES lib64 + NAMES graph/types.h NO_CMAKE_SYSTEM_PATH NO_CMAKE_FIND_ROOT_PATH) @@ -51,7 +45,6 @@ find_package_handle_standard_args(metadef metadef_FOUND REQUIRED_VARS _INCLUDE_DIR - flow_graph_SHARED_LIBRARY ) if(metadef_FOUND) @@ -59,13 +52,6 @@ if(metadef_FOUND) include(CMakePrintHelpers) message(STATUS "Variables in metadef module:") cmake_print_variables(metadef_INCLUDE_DIR) - cmake_print_variables(flow_graph_SHARED_LIBRARY) - - add_library(flow_graph SHARED IMPORTED) - set_target_properties(flow_graph PROPERTIES - INTERFACE_LINK_LIBRARIES "metadef_headers" - IMPORTED_LOCATION "${flow_graph_SHARED_LIBRARY}" - ) add_library(metadef_headers INTERFACE IMPORTED) set_target_properties(metadef_headers PROPERTIES @@ -73,9 +59,6 @@ if(metadef_FOUND) ) include(CMakePrintHelpers) - cmake_print_properties(TARGETS flow_graph - PROPERTIES INTERFACE_LINK_LIBRARIES IMPORTED_LOCATION - ) cmake_print_properties(TARGETS metadef_headers PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ) diff --git a/inference/dataflow/py_dflow/python/dataflow/dataflow.py b/inference/dataflow/py_dflow/python/dataflow/dataflow.py index 73d70805a..8b4fb177f 100644 --- a/inference/dataflow/py_dflow/python/dataflow/dataflow.py +++ b/inference/dataflow/py_dflow/python/dataflow/dataflow.py @@ -1286,15 +1286,16 @@ class FlowGraph(object): "for details about the error information, see the ascend log.", dwrapper.INNER_ERROR, ) + flow_info.start_time = ret[2].start_time + flow_info.end_time = ret[2].end_time + flow_info.flow_flags = ret[2].flow_flags + flow_info.transaction_id = ret[2].transaction_id if ret[0].ret_code != 0 and ret[0].ret_code != dwrapper.SUBHEALTHY: log.error("failed to fetch data, error msg = %s", ret[0].error_msg) + return (outputs, flow_info, ret[0].ret_code) for output in ret[1]: outputs.append(Tensor(output)) - flow_info.start_time = ret[2].start_time - flow_info.end_time = ret[2].end_time - flow_info.flow_flags = 
ret[2].flow_flags - flow_info.transaction_id = ret[2].transaction_id return (outputs, flow_info, ret[0].ret_code) def feed( diff --git a/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py b/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py index 7c4beeab7..e1f727e19 100644 --- a/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py +++ b/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py @@ -93,7 +93,7 @@ add_library(${UDF_TARGET_LIB} SHARED target_compile_options(${UDF_TARGET_LIB} PRIVATE -O2 - -std=c++11 + -std=c++17 -ftrapv -fstack-protector-all -fPIC diff --git a/inference/dataflow/py_dflow/wrapper/CMakeLists.txt b/inference/dataflow/py_dflow/wrapper/CMakeLists.txt index 87e5b99e4..920557436 100644 --- a/inference/dataflow/py_dflow/wrapper/CMakeLists.txt +++ b/inference/dataflow/py_dflow/wrapper/CMakeLists.txt @@ -29,14 +29,17 @@ set_target_properties(dflow_wrapper PREFIX "" ) +target_compile_definitions(dflow_wrapper PRIVATE + PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF +) + target_compile_options(dflow_wrapper PRIVATE -O2 - -std=c++11 -Xlinker -export-dynamic ) target_link_options(dflow_wrapper PRIVATE - -s + $<$:-s> ) project(data_wrapper) @@ -47,6 +50,10 @@ target_include_directories(data_wrapper PRIVATE ${pybind11_INCLUDE_DIR} ) +target_compile_definitions(data_wrapper PRIVATE + PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF +) + target_link_libraries(data_wrapper PRIVATE $ $ @@ -60,13 +67,11 @@ set_target_properties(data_wrapper target_compile_options(data_wrapper PRIVATE -O2 - -std=c++11 - -s -Xlinker -export-dynamic ) target_link_options(data_wrapper PRIVATE - -s + $<$:-s> ) add_subdirectory(flow_func_wrapper) diff --git a/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt b/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt index c4b3e35cd..64800eba6 100644 --- a/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt +++ 
b/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt @@ -23,13 +23,15 @@ set_target_properties(flowfunc_wrapper PREFIX "" ) +target_compile_definitions(flowfunc_wrapper PRIVATE + PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF +) + target_compile_options(flowfunc_wrapper PRIVATE -O2 - -std=c++11 - -s -Xlinker -export-dynamic ) target_link_options(flowfunc_wrapper PRIVATE - -s + $<$:-s> ) \ No newline at end of file -- Gitee From 408eccf6434fe07c606b47646bd5db489b2fc40b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E6=99=BA=E6=85=A7?= Date: Tue, 16 Sep 2025 06:58:26 +0000 Subject: [PATCH 3/7] =?UTF-8?q?!2759=20add=20xPyD=20sample=20Merge=20pull?= =?UTF-8?q?=20request=20!2759=20from=20=E8=B5=B5=E6=99=BA=E6=85=A7/zzh=5Fa?= =?UTF-8?q?dd=5Fllm=5Fdatadist?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../10_llm_data_dist/README.md | 10 ++ .../pull_blocks_xpyd_sample.py | 150 ++++++++++++++++++ .../push_blocks_sample.py | 7 +- 3 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py diff --git a/python/level1_single_api/10_llm_data_dist/README.md b/python/level1_single_api/10_llm_data_dist/README.md index 6dc0a99d7..8b1a3f6b7 100644 --- a/python/level1_single_api/10_llm_data_dist/README.md +++ b/python/level1_single_api/10_llm_data_dist/README.md @@ -122,5 +122,15 @@ # Decoder主机: GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python switch_role_sample.py --device_id 1 --role d --local_host_ip 10.170.10.1 --remote_host_ip 10.170.10.0 ``` + - pull_blocks_xpyd_sample.py:此样例程序支持xPyD测试场景,使用单侧建链方式,每个进程申请内存并注册blocks, 每个decoder和所有的prompt发起建链, 并pull blocks到本地,local_ip_port指定本地host ip和端口, + 分别在Prompt主机与Decoder主机,执行样例程序: + ``` + # 任意个Prompt主机: + GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 0 --role p --local_ip_port 10.170.10.0:26000 + 
GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 1 --role p --local_ip_port 10.170.10.0:26001 + # 任意个Decoder主机: + GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 2 --role d --local_ip_port 10.170.10.0:26002 --remote_ip_port '10.170.10.0:26000;10.170.10.0:26001' + GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 3 --role d --local_ip_port 10.170.10.0:26003 --remote_ip_port '10.170.10.0:26000;10.170.10.0:26001' + ``` **注**:**GLOO_SOCKET_IFNAME**为本地网卡名,可通过ifconfig查询;**HCCL_INTRA_ROCE_ENABLE=1**代表使用roce方式进行通信; diff --git a/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py new file mode 100644 index 000000000..aeaf76871 --- /dev/null +++ b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py @@ -0,0 +1,150 @@ +""" +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import argparse +import time +import logging +from llm_datadist import LLMDataDist, LLMRole, LLMConfig, CacheDesc, DataType, BlocksCacheKey, \ + Placement, LLMClusterInfo, LLMStatusCode +import torch +import torch_npu +import torchair +import socket +import struct + +logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) + +NUM_TENSORS = 1 +BLOCKS_NUM = 3 +KV_SHAPE = 10 + +def ip_port_to_int(ip_port): + ip, port_str = ip_port.split(':') + port = int(port_str) + if not (0 <= port <= 65535): + raise ValueError("端口号必须在0-65535之间") + # 将IP转换为4字节二进制 + ip_bytes = socket.inet_aton(ip) + + # 将4字节IP转换为32位整数 + ip_int = struct.unpack('!I', ip_bytes)[0] + + # 组合IP整数(32位)和端口(16位)为一个48位整数 + result = (ip_int << 16) | port + return result + + +def init_llm_datadist(args) -> LLMDataDist: + datadist = LLMDataDist(role, ip_port_to_int(args.local_ip_port)) + llm_config = LLMConfig() + llm_config.device_id = args.device_id + llm_config.local_comm_res = "" + if args.role == 'p': + llm_config.listen_ip_info = args.local_ip_port + llm_options = llm_config.generate_options() + datadist.init(llm_options) + logging.info(f"init {role} success, cluster_id={ip_port_to_int(args.local_ip_port)}") + return datadist + + +def run_prompt_sample(datadist, args): + # 1. 
注册内存 + cache_manager = datadist.cache_manager + cache_desc = CacheDesc(num_tensors=NUM_TENSORS, shape=[BLOCKS_NUM, KV_SHAPE], data_type=DataType.DT_FLOAT, + placement=Placement.DEVICE) + tensor = torch.full((BLOCKS_NUM, KV_SHAPE), ip_port_to_int(args.local_ip_port), dtype=torch.float).npu() + + addr = int(tensor.data_ptr()) + cache = cache_manager.register_blocks_cache(cache_desc, [addr], + BlocksCacheKey(ip_port_to_int(args.local_ip_port), 0)) + logging.info('register_blocks_cache success') + logging.info(f'before decoder pull, tensor={tensor}') + + time.sleep(30) + cache_manager.unregister_cache(cache.cache_id) + datadist.finalize() + logging.info('[finalize] success') + + +def run_decoder_sample(datadist, args): + # 1. 注册内存 + cache_manager = datadist.cache_manager + cache_desc = CacheDesc(num_tensors=NUM_TENSORS, shape=[BLOCKS_NUM, KV_SHAPE], data_type=DataType.DT_FLOAT, + placement=Placement.DEVICE) + remote_list = args.remote_ip_port.split(';') + + tensor = torch.full((BLOCKS_NUM, KV_SHAPE), 0, dtype=torch.float).npu() + addr = int(tensor.data_ptr()) + cache = cache_manager.register_blocks_cache(cache_desc, [addr], + BlocksCacheKey(ip_port_to_int(args.local_ip_port), 0)) + logging.info('register_blocks_cache success') + + time.sleep(5) # register end + + # 2. 向所有prompt建链 + cluster_list = [] + for remote in remote_list: + cluster = LLMClusterInfo() + cluster.remote_cluster_id = ip_port_to_int(remote) + cluster.append_local_ip_info(args.local_ip_port.split(':')[0], 0) + cluster.append_remote_ip_info(remote.split(':')[0], int(remote.split(':')[1])) + cluster_list.append(cluster) + ret, _ = datadist.link_clusters(cluster_list, 5000) + if ret != LLMStatusCode.LLM_SUCCESS: + raise Exception("link failed") + + # 3. 
向prompt pull blocks + for i, remote in enumerate(remote_list): + cache_manager.pull_blocks(BlocksCacheKey(ip_port_to_int(remote), 0), + cache, src_blocks=[0, 1], dst_blocks=[0, 2]) + logging.info(f'after decoder pull from {ip_port_to_int(remote)}, tensor={tensor}') + + # 4. 断链 + ret, _ = datadist.unlink_clusters(cluster_list, 5000) + if ret != LLMStatusCode.LLM_SUCCESS: + raise Exception("unlink failed") + + cache_manager.unregister_cache(cache.cache_id) + datadist.finalize() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--device_id", type=int, default=0, help='device id') + parser.add_argument("--role", type=str, default=1, help='role type, support p/d') + parser.add_argument("--local_ip_port", type=str, help='local ip port, eg:10.10.10.1:26000') + parser.add_argument("--remote_ip_port", type=str, + help='remote host ip list, eg:10.10.10.2:26000;10.10.10.3:26000') + args = parser.parse_args() + if args.role not in ['p', 'd']: + raise RuntimeError("Not supported cluster id") + if args.device_id not in [0, 1, 2, 3, 4, 5, 6, 7]: + raise RuntimeError("Not supported device id") + if args.local_ip_port is None: + raise RuntimeError("local_ip_port is not set") + if args.role == 'd': + if args.remote_ip_port is None: + raise RuntimeError("remote_ip_port is not set") + logging.info(f'Sample start, device_id = {args.device_id}, role = {args.role}') + + torch.npu.set_device(args.device_id) + role = LLMRole.PROMPT if args.role == 'p' else LLMRole.DECODER + datadist = init_llm_datadist(args) + if role == LLMRole.PROMPT: + run_prompt_sample(datadist, args) + else: + run_decoder_sample(datadist, args) + logging.info('Sample end') diff --git a/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py index d4c971629..074c62533 100644 --- 
a/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py +++ b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py @@ -73,6 +73,9 @@ def run_prompt_sample(datadist): # 2. 等decoder pull blocks dist.barrier() # decoder push blocks end + logging.info(f'after decoder push, {tensor=}') + logging.info(f'after decoder push, {tensor2=}') + # 3. 解链 cluster = LLMClusterInfo() cluster.remote_cluster_id = DECODER_CLUSTER_ID @@ -108,9 +111,7 @@ def run_decoder_sample(datadist, local_host_ip, remote_host_ip): raise Exception("link failed") # 3. 向prompt push blocks - cache_manager.pull_blocks(BlocksCacheKey(PROMPT_CLUSTER_ID, 0), cache, src_blocks=[0, 1], dst_blocks=[0, 2]) - logging.info(f'after decoder pull, {tensor=}') - logging.info(f'after decoder pull, {tensor2=}') + cache_manager.push_blocks(BlocksCacheKey(PROMPT_CLUSTER_ID, 0), cache, src_blocks=[0, 1], dst_blocks=[0, 2]) dist.barrier() # push_blocks end -- Gitee From e2cab24744795a89d969bccd709ab11405b23a18 Mon Sep 17 00:00:00 2001 From: renjie Date: Tue, 16 Sep 2025 13:28:24 +0000 Subject: [PATCH 4/7] !2761 simple add hello world samples Merge pull request !2761 from renjie/master --- .../23_simple_add/CMakeLists.txt | 15 ++ .../0_introduction/23_simple_add/README.md | 86 +++++++++ .../23_simple_add/add_custom.cpp | 180 ++++++++++++++++++ .../24_simple_hello_world/CMakeLists.txt | 15 ++ .../24_simple_hello_world/README.md | 54 ++++++ .../24_simple_hello_world/hello_world.cpp | 35 ++++ 6 files changed, 385 insertions(+) create mode 100644 operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/23_simple_add/README.md create mode 100644 operator/ascendc/0_introduction/23_simple_add/add_custom.cpp create mode 100644 operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/24_simple_hello_world/README.md create mode 100644 
operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp diff --git a/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt new file mode 100644 index 000000000..b3e88f157 --- /dev/null +++ b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.16) + +set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version") + +find_package(ASC REQUIRED) + +project(kernel_samples LANGUAGES ASC CXX) + +set_source_files_properties( + add_custom.cpp PROPERTIES LANGUAGE ASC +) + +add_executable(demo + add_custom.cpp +) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_simple_add/README.md b/operator/ascendc/0_introduction/23_simple_add/README.md new file mode 100644 index 000000000..4b3dce29f --- /dev/null +++ b/operator/ascendc/0_introduction/23_simple_add/README.md @@ -0,0 +1,86 @@ +## 简化Add算子直调样例 +本样例以Add算子为示例,展示了一种更为简单的算子编译流程,支持main函数和Kernel函数在同一个cpp文件中实现。 +> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 +## 目录结构介绍 +``` +├── 23_simple_add +│ ├── CMakeLists.txt // 编译工程文件 +│ └── add_custom.cpp // 算子实现及测试 +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输出z8 * 2048floatND
核函数名add_custom
+ +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 +- tiling实现 + TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数,本示例算子使用了2个tiling参数:totalLength、tileNum。totalLength是指需要计算的数据量大小,tileNum是指每个核上总计算数据分块个数。比如,totalLength这个参数传递到kernel侧后,可以通过除以参与计算的核数,得到每个核上的计算量,这样就完成了多核数据的切分。 + +- 调用实现 + 使用内核调用符<<<>>>调用核函数。 + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_simple_add/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 配置按安装径后,执行以下命令统一配置环境变量。 + ```bash + # 配置CANN环境变量 + source ${ASCEND_INSTALL_PATH}/bin/setenv.bash + # 添加AscendC CMake Module搜索路径至环境变量 + export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH + ``` + + - 样例执行 + ```bash + mkdir -p build && cd build; # 创建并进入build目录 + cmake ..;make -j; # 编译工程 + ./demo # 执行样例 + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/15 | 新增本readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp new file mode 100644 index 000000000..d2b5cb112 --- /dev/null +++ 
b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp @@ -0,0 +1,180 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "kernel_operator.h" + +constexpr uint32_t BUFFER_NUM = 2; // tensor num for each queue + +struct AddCustomTilingData +{ + uint32_t totalLength; + uint32_t tileNum; +}; + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = tileNum; + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * 
this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + KernelAdd op; + op.Init(x, y, z, tiling.totalLength, tiling.tileNum); + op.Process(); +} + +std::vector kernel_add(std::vector &x, std::vector &y) +{ + constexpr uint32_t blockDim = 8; + uint32_t totalLength = x.size(); + size_t totalByteSize = totalLength * sizeof(float); + int32_t deviceId = 0; + aclrtStream stream = nullptr; + AddCustomTilingData tiling = {/*totalLength:*/totalLength, /*tileNum:*/8}; + uint8_t *xHost = reinterpret_cast(x.data()); + uint8_t *yHost = reinterpret_cast(y.data()); + uint8_t *zHost = nullptr; + uint8_t *xDevice = nullptr; + uint8_t *yDevice = nullptr; + uint8_t *zDevice = nullptr; + + aclInit(nullptr); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + aclrtMallocHost((void **)(&zHost), totalByteSize); + aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); 
+ aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); + + aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); + + add_custom<<>>(xDevice, yDevice, zDevice, tiling); + aclrtSynchronizeStream(stream); + + aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST); + std::vector z((float *)zHost, (float *)(zHost + totalLength)); + + aclrtFree(xDevice); + aclrtFree(yDevice); + aclrtFree(zDevice); + aclrtFreeHost(zHost); + + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return z; +} + +uint32_t VerifyResult(std::vector &output, std::vector &golden) +{ + auto printTensor = [](std::vector &tensor, const char *name) { + constexpr size_t maxPrintSize = 20; + std::cout << name << ": "; + std::copy(tensor.begin(), tensor.begin() + std::min(tensor.size(), maxPrintSize), + std::ostream_iterator(std::cout, " ")); + if (tensor.size() > maxPrintSize) { + std::cout << "..."; + } + std::cout << std::endl; + }; + printTensor(output, "Output"); + printTensor(golden, "Golden"); + if (std::equal(output.begin(), output.end(), golden.begin())) { + std::cout << "[Success] Case accuracy is verification passed." << std::endl; + return 0; + } else { + std::cout << "[Failed] Case accuracy is verification failed!" 
<< std::endl;
+        return 1;
+    }
+    return 0;
+}
+
+int32_t main(int32_t argc, char *argv[])
+{
+    constexpr uint32_t totalLength = 8 * 2048;
+    constexpr float valueX = 1.2f;
+    constexpr float valueY = 2.3f;
+    std::vector x(totalLength, valueX);
+    std::vector y(totalLength, valueY);
+
+    std::vector output = kernel_add(x, y);
+
+    std::vector golden(totalLength, valueX + valueY);
+    return VerifyResult(output, golden);
+}
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
new file mode 100644
index 000000000..590f26516
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.16)
+
+set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version")
+
+find_package(ASC REQUIRED)
+
+project(kernel_samples LANGUAGES ASC CXX)
+
+set_source_files_properties(
+    hello_world.cpp PROPERTIES LANGUAGE ASC
+)
+
+add_executable(demo
+    hello_world.cpp
+)
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
new file mode 100644
index 000000000..53ab1d8cc
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
@@ -0,0 +1,54 @@
+## 简化HelloWorld算子直调样例
+本样例通过使用<<<>>>内核调用符来完成算子核函数在NPU侧运行验证的基础流程,核函数内通过printf打印输出结果。
+> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。
+## 目录结构介绍
+```
+├── 24_simple_hello_world
+│   ├── CMakeLists.txt    // 编译工程文件
+│   └── hello_world.cpp   // 算子实现及测试
+```
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+
+## 运行样例算子
+  - 打开样例目录
+    以命令行方式下载样例代码,master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_hello_world/
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。
+    - 默认路径,root用户安装CANN软件包
+    ```bash
+    export
ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    ```
+    - 默认路径,非root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    ```
+    - 指定路径install_path,安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+    ```
+    配置好安装路径后,执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+  - 样例执行
+    ```bash
+    mkdir -p build && cd build;    # 创建并进入build目录
+    cmake ..;make -j;              # 编译工程
+    ./demo                         # 执行样例
+    ```
+
+## 更新说明
+| 时间 | 更新事项 |
+| ---------- | ------------ |
+| 2025/09/15 | 新增本readme |
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
new file mode 100644
index 000000000..cb28dec3d
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
@@ -0,0 +1,35 @@
+/**
+ * @file hello_world.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */ +#include "kernel_operator.h" +#include "acl/acl.h" + +__global__ __aicore__ void hello_world() +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY); + AscendC::printf("Hello World!!!\n"); +} + +int32_t main(int argc, char const *argv[]) +{ + aclInit(nullptr); + int32_t deviceId = 0; + aclrtSetDevice(deviceId); + aclrtStream stream = nullptr; + aclrtCreateStream(&stream); + + constexpr uint32_t blockDim = 1; + hello_world<<>>(); + aclrtSynchronizeStream(stream); + + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return 0; +} \ No newline at end of file -- Gitee From 7a7d686259ef4fd27dcf874e4db8da0d2cd86ee3 Mon Sep 17 00:00:00 2001 From: renjie Date: Wed, 17 Sep 2025 00:59:06 +0000 Subject: [PATCH 5/7] !2766 change case idx Merge pull request !2766 from renjie/master --- .../{23_simple_add => 25_simple_add}/CMakeLists.txt | 0 .../0_introduction/{23_simple_add => 25_simple_add}/README.md | 4 ++-- .../{23_simple_add => 25_simple_add}/add_custom.cpp | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename operator/ascendc/0_introduction/{23_simple_add => 25_simple_add}/CMakeLists.txt (100%) rename operator/ascendc/0_introduction/{23_simple_add => 25_simple_add}/README.md (99%) rename operator/ascendc/0_introduction/{23_simple_add => 25_simple_add}/add_custom.cpp (100%) diff --git a/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt similarity index 100% rename from operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt rename to operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt diff --git a/operator/ascendc/0_introduction/23_simple_add/README.md b/operator/ascendc/0_introduction/25_simple_add/README.md similarity index 99% rename from operator/ascendc/0_introduction/23_simple_add/README.md rename to operator/ascendc/0_introduction/25_simple_add/README.md index 4b3dce29f..ffa237fc2 100644 --- 
a/operator/ascendc/0_introduction/23_simple_add/README.md +++ b/operator/ascendc/0_introduction/25_simple_add/README.md @@ -3,7 +3,7 @@ > ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 ## 目录结构介绍 ``` -├── 23_simple_add +├── 25_simple_add │ ├── CMakeLists.txt // 编译工程文件 │ └── add_custom.cpp // 算子实现及测试 ``` @@ -48,7 +48,7 @@ z = x + y - 打开样例目录 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_simple_add/ + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/25_simple_add/ ``` - 配置环境变量 diff --git a/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/25_simple_add/add_custom.cpp similarity index 100% rename from operator/ascendc/0_introduction/23_simple_add/add_custom.cpp rename to operator/ascendc/0_introduction/25_simple_add/add_custom.cpp -- Gitee From a104fad229ee46a836a038ae5b354af87e3e7891 Mon Sep 17 00:00:00 2001 From: SeaElm Date: Wed, 17 Sep 2025 12:27:02 +0000 Subject: [PATCH 6/7] !2760 add AddCustomTiny sample for frameworklaunch Merge pull request !2760 from SeaElm/seaelm --- .../AddCustomTiny/CMakeLists.txt | 45 ++++++++++ .../AddCustomTiny/README.md | 86 +++++++++++++++++++ .../AddCustomTiny/add_custom_host.cpp | 56 ++++++++++++ .../AddCustomTiny/add_custom_kernel.cpp | 86 +++++++++++++++++++ .../AddCustomTiny/add_custom_tiling.h | 19 ++++ .../1_add_frameworklaunch/README.md | 2 + 6 files changed, 294 insertions(+) create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h diff --git 
a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt
new file mode 100644
index 000000000..38e13a85e
--- /dev/null
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 3.16.0)
+project(opp)
+
+set(ASCEND_COMPUTE_UNIT ascend910b)
+find_package(ASC REQUIRED)
+
+npu_op_package(${vendor_name}
+    TYPE RUN
+)
+
+file(GLOB host_ops_srcs ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_host.cpp)
+npu_op_code_gen(
+    SRC ${host_ops_srcs}
+    PACKAGE ${vendor_name}
+    OUT_DIR ${ASCEND_AUTOGEN_PATH}
+    OPTIONS
+        OPS_PRODUCT_NAME ${ASCEND_COMPUTE_UNIT}
+)
+
+file(GLOB autogen_aclnn_srcs ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
+set_source_files_properties(${autogen_aclnn_srcs} PROPERTIES GENERATED TRUE)
+npu_op_library(cust_opapi ACLNN
+    ${autogen_aclnn_srcs}
+)
+
+npu_op_library(cust_optiling TILING
+    ${host_ops_srcs}
+)
+
+npu_op_kernel_library(ascendc_kernels
+    SRC_BASE ${CMAKE_SOURCE_DIR}/
+    TILING_LIBRARY cust_optiling
+)
+
+npu_op_kernel_sources(ascendc_kernels
+    OP_NAME AddCustom
+    KERNEL_FILE add_custom_kernel.cpp
+)
+
+npu_op_package_add(${vendor_name}
+    LIBRARY
+        cust_opapi
+        cust_optiling
+        ascendc_kernels
+)
diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md
new file mode 100644
index 000000000..d0e4472c1
--- /dev/null
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md
@@ -0,0 +1,86 @@
+## 简化Add算子直调样例
+本样例以Add算子为示例,展示了简单、灵活的算子编译流程。
+**注意:本样例仅支持CANN8.3及以上版本。**
+## 目录结构介绍
+```
+├── AddCustomTiny
+│   ├── add_custom_host.cpp    // host侧编译源码文件
+│   ├── add_custom_tiling.h    // host侧编译tiling头文件
+│   ├── add_custom_kernel.cpp  // kernel侧编译源码文件
+│   ├── CMakeLists.txt         // 编译工程文件
+│   └── readme.md              // 本样例的说明文档
+```
+
+## 算子描述
+Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为:
+``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输出z8 * 2048floatND
核函数名add_custom
+
+## 代码实现介绍
+- kernel实现
+  Add算子的数学表达式为:
+  计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。
+
+  Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。
+- tiling实现
+  TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数,本示例算子使用了2个tiling参数:totalLength、tileNum。totalLength是指需要计算的数据量大小,tileNum是指每个核上总计算数据分块个数。比如,totalLength这个参数传递到kernel侧后,可以通过除以参与计算的核数,得到每个核上的计算量,这样就完成了多核数据的切分。
+
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+
+## 编译样例算子
+  - 打开样例目录
+    以命令行方式下载样例代码,master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。
+    - 默认路径,root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    ```
+    - 默认路径,非root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    ```
+    - 指定路径install_path,安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+    ```
+    配置好安装路径后,执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+
+  - 样例执行
+    ```bash
+    rm -rf build && mkdir build && cd build
+    cmake ..
&& make -j binary package + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/16 | 新增readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp new file mode 100644 index 000000000..1cc87f4fb --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp @@ -0,0 +1,56 @@ +/** + * @file add_custom_host.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/tiling_api.h" + +namespace optiling { +const uint32_t BLOCK_DIM = 8; +const uint32_t TILE_NUM = 8; +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + AddCustomTilingData *tiling = context->GetTilingData(); + uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); + context->SetBlockDim(BLOCK_DIM); + tiling->totalLength = totalLength; + tiling->tileNum = TILE_NUM; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + + +namespace ops { +class AddCustom : public OpDef { +public: + explicit AddCustom(const char *name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Input("y") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Output("z") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + + this->AICore() + .SetTiling(optiling::TilingFunc) + .AddConfig("ascend910") + .AddConfig("ascend310p") + .AddConfig("ascend310b") + .AddConfig("ascend910b"); + } +}; +OP_ADD(AddCustom); +} // 
namespace ops diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp new file mode 100644 index 000000000..22a9876fe --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp @@ -0,0 +1,86 @@ +/** + * @file add_custom_kernel.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +#include "add_custom_tiling.h" +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = tileNum; + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = 
inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(AddCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + KernelAdd op; + op.Init(x, y, z, tilingData.totalLength, tilingData.tileNum); + op.Process(); +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h new file mode 100644 index 000000000..d80ecbee4 --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h @@ -0,0 +1,19 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +struct AddCustomTilingData { + uint32_t totalLength; + uint32_t tileNum; +}; + +#endif // ADD_CUSTOM_TILING_H diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md b/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md index de724d24f..5f1223f77 100644 --- a/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md @@ -9,6 +9,7 @@ │ ├── AclOfflineModel // 通过aclopExecuteV2调用的方式调用AddCustom算子 │ ├── AclOnlineModel // 通过aclopCompile调用的方式调用AddCustom算子 │ ├── AddCustom // AddCustom算子工程 +│ ├── AddCustomTiny // AddCustom自定义算子工程极简样例 │ ├── PytorchInvocation // 通过pytorch调用的方式调用AddCustom算子 │ ├── TensorflowInvocation // 通过tensorflow调用的方式调用AddCustom算子 │ ├── CppExtensionInvocation // 通过CppExtension调用的方式调用AddCustom算子 @@ -150,3 +151,4 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 | 2024/11/11 | 样例目录调整 | | 2024/11/18 | 算子工程改写为由msOpGen生成 | | 2025/01/17 | 新增CppExtensionInvocation样例 | +| 2025/9/17 | 新增AddCustomTiny极简工程样例 | \ No newline at end of file -- Gitee From d059db50fde2e76fe99336e65c1a17913e769081 Mon Sep 17 00:00:00 2001 From: Chen Ning Date: Thu, 18 Sep 2025 11:05:38 +0000 Subject: [PATCH 7/7] add namespace AscendC::tiling to sample Signed-off-by: Chen Ning --- .../MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h index 8f32f3418..4ea1394d9 100644 --- 
a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h @@ -15,7 +15,7 @@ struct MatmulCustomTilingData { uint64_t localMemSize; - TCubeTiling cubeTilingData; + AscendC::tiling::TCubeTiling cubeTilingData; }; #endif // MATMUL_TILING_H \ No newline at end of file -- Gitee