From 7a634125028d7f7ed9e60970b480a67ed25419d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=99=86?= Date: Mon, 15 Sep 2025 11:47:35 +0000 Subject: [PATCH 1/7] =?UTF-8?q?!2762=20uniquecust=20modify=20infershape=20?= =?UTF-8?q?Merge=20pull=20request=20!2762=20from=20=E5=88=98=E9=99=86/mast?= =?UTF-8?q?er?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../4_op_dev/1_custom_op/op_proto/unique_cust.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h b/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h index 6d18e6739..5eee4e1d2 100644 --- a/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h +++ b/cplusplus/level1_single_api/4_op_dev/1_custom_op/op_proto/unique_cust.h @@ -19,7 +19,20 @@ #include "graph/operator_reg.h" namespace ge { +/** +*@brief Finds unique elements in a 1D tensor. \n +*@par Inputs: +*x: 1D tensor. Support all types mentioned in TensorType. +*Input "x" is a 1-dimensional tensor. \n +*@par Attributes: +*out_idx: An optional DType from: "int32, int64". Defaults to "int32". \n +*@par Outputs: +*@li y: A tensor containing the unique elements of "x". +*@li idx: A tensor the same size as "x". The index of each value of "x" in the unique output "y". 
\n +*/ REG_OP(UniqueCust) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) -- Gitee From 6bf58b5ca3bc8d072ed04eb83853125561eefa33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AE=81?= Date: Mon, 15 Sep 2025 11:59:49 +0000 Subject: [PATCH 2/7] =?UTF-8?q?!2765=20sync=20pydflow=20code=20Merge=20pul?= =?UTF-8?q?l=20request=20!2765=20from=20=E6=9D=8E=E5=AE=81/dev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- inference/dataflow/py_dflow/CMakeLists.txt | 2 + inference/dataflow/py_dflow/build.sh | 21 ++++- .../py_dflow/cmake/intf_pub_linux.cmake | 7 +- .../py_dflow/cmake/modules/Finddflow.cmake | 85 +++++++++++++++++++ .../py_dflow/cmake/modules/Findmetadef.cmake | 23 +---- .../py_dflow/python/dataflow/dataflow.py | 9 +- .../python/dataflow/tools/tpl/tpl_cmake.py | 2 +- .../dataflow/py_dflow/wrapper/CMakeLists.txt | 15 ++-- .../wrapper/flow_func_wrapper/CMakeLists.txt | 8 +- 9 files changed, 134 insertions(+), 38 deletions(-) create mode 100644 inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake diff --git a/inference/dataflow/py_dflow/CMakeLists.txt b/inference/dataflow/py_dflow/CMakeLists.txt index 5323a373d..69cf67fa3 100644 --- a/inference/dataflow/py_dflow/CMakeLists.txt +++ b/inference/dataflow/py_dflow/CMakeLists.txt @@ -39,12 +39,14 @@ if (BUILD_OPEN_PROJECT) include(cmake/test_funcs.cmake) include(cmake/intf_pub_linux.cmake) include(cmake/modules/Findair.cmake) + include(cmake/modules/Finddflow.cmake) include(cmake/modules/Findmetadef.cmake) include(cmake/modules/Findparser.cmake) # 自研软件包 find_package(slog MODULE REQUIRED) find_package(air MODULE REQUIRED) + find_package(dflow MODULE REQUIRED) find_package(metadef MODULE REQUIRED) find_package(parser MODULE REQUIRED) find_package(udf MODULE REQUIRED) diff --git a/inference/dataflow/py_dflow/build.sh b/inference/dataflow/py_dflow/build.sh index b236b2872..87b29aecc 100755 --- 
a/inference/dataflow/py_dflow/build.sh +++ b/inference/dataflow/py_dflow/build.sh @@ -23,12 +23,15 @@ BUILD_RELATIVE_PATH="build" # print usage message usage() { echo "Usage:" - echo " sh build.sh [-h | --help] [-v | --verbose] [-j] [--ascend_install_path=] [--output_path=] [--python_path=]" + echo " sh build.sh [-h | --help] [-v | --verbose] [-j] [--build_type=]" + echo " [--ascend_install_path=] [--output_path=] [--python_path=]" echo "" echo "Options:" echo " -h, --help Print usage" echo " -v, --verbose Display build command" echo " -j Set the number of threads used for building DFlow, default is 8" + echo " --build_type=" + echo " Set build type, default Release" echo " --ascend_install_path=" echo " Set ascend package install path, default /usr/local/Ascend/ascend-toolkit/latest" echo " --output_path=" @@ -38,6 +41,17 @@ usage() { echo "" } +# check value of build_type option +# usage: check_build_type build_type +check_build_type() { + arg_value="$1" + if [ "X$arg_value" != "XRelease" ] && [ "X$arg_value" != "XDebug" ]; then + echo "Invalid value $arg_value for option --$2" + usage + exit 1 + fi +} + # parse and set options checkopts() { VERBOSE="" @@ -67,6 +81,11 @@ checkopts() { VERBOSE="VERBOSE=1" shift ;; + --build_type) + check_build_type "$2" build_type + CMAKE_BUILD_TYPE="$2" + shift 2 + ;; --ascend_install_path) ASCEND_INSTALL_PATH="$(realpath $2)" shift 2 diff --git a/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake b/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake index 4c9b502e9..6561a3ac6 100644 --- a/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake +++ b/inference/dataflow/py_dflow/cmake/intf_pub_linux.cmake @@ -9,13 +9,12 @@ target_compile_options(intf_pub INTERFACE -Wall -fPIC $,-fstack-protector-all,-fstack-protector-strong> - $<$:-std=c++11> + $<$:-std=c++17> ) target_compile_definitions(intf_pub INTERFACE - _GLIBCXX_USE_CXX11_ABI=0 + _GLIBCXX_USE_CXX11_ABI=0 $<$:CFG_BUILD_NDEBUG> - $<$:CFG_BUILD_DEBUG> - WIN64=1 + 
$<$:CFG_BUILD_DEBUG> LINUX=0 LOG_CPP ) diff --git a/inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake b/inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake new file mode 100644 index 000000000..aa51a2793 --- /dev/null +++ b/inference/dataflow/py_dflow/cmake/modules/Finddflow.cmake @@ -0,0 +1,85 @@ +if (dflow_FOUND) + message(STATUS "Package dflow has been found.") + return() +endif() + +set(_cmake_targets_defined "") +set(_cmake_targets_not_defined "") +set(_cmake_expected_targets "") +foreach(_cmake_expected_target IN ITEMS flow_graph dflow_headers) + list(APPEND _cmake_expected_targets "${_cmake_expected_target}") + if(TARGET "${_cmake_expected_target}") + list(APPEND _cmake_targets_defined "${_cmake_expected_target}") + else() + list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}") + endif() +endforeach() +unset(_cmake_expected_target) + +if(_cmake_targets_defined STREQUAL _cmake_expected_targets) + unset(_cmake_targets_defined) + unset(_cmake_targets_not_defined) + unset(_cmake_expected_targets) + unset(CMAKE_IMPORT_FILE_VERSION) + cmake_policy(POP) + return() +endif() + +if(NOT _cmake_targets_defined STREQUAL "") + string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}") + string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}") + message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n") +endif() +unset(_cmake_targets_defined) +unset(_cmake_targets_not_defined) +unset(_cmake_expected_targets) + +find_path(_INCLUDE_DIR + NAMES flow_graph/data_flow.h + NO_CMAKE_SYSTEM_PATH + NO_CMAKE_FIND_ROOT_PATH) + +find_library(flow_graph_SHARED_LIBRARY + NAMES libflow_graph.so + PATH_SUFFIXES lib64 + NO_CMAKE_SYSTEM_PATH + NO_CMAKE_FIND_ROOT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(dflow + FOUND_VAR + 
dflow_FOUND + REQUIRED_VARS + _INCLUDE_DIR + flow_graph_SHARED_LIBRARY +) + +if(dflow_FOUND) + set(dflow_INCLUDE_DIR "${_INCLUDE_DIR}") + include(CMakePrintHelpers) + message(STATUS "Variables in dflow module:") + cmake_print_variables(dflow_INCLUDE_DIR) + cmake_print_variables(flow_graph_SHARED_LIBRARY) + + add_library(flow_graph SHARED IMPORTED) + set_target_properties(flow_graph PROPERTIES + INTERFACE_LINK_LIBRARIES "dflow_headers" + IMPORTED_LOCATION "${flow_graph_SHARED_LIBRARY}" + ) + + add_library(dflow_headers INTERFACE IMPORTED) + set_target_properties(dflow_headers PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${dflow_INCLUDE_DIR}" + ) + + include(CMakePrintHelpers) + cmake_print_properties(TARGETS flow_graph + PROPERTIES INTERFACE_LINK_LIBRARIES IMPORTED_LOCATION + ) + cmake_print_properties(TARGETS dflow_headers + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ) +endif() + +# Cleanup temporary variables. +set(_INCLUDE_DIR) diff --git a/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake b/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake index 750835bc7..67f9a3dfd 100644 --- a/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake +++ b/inference/dataflow/py_dflow/cmake/modules/Findmetadef.cmake @@ -1,12 +1,12 @@ if (metadef_FOUND) - message(STATUS "Package air has been found.") + message(STATUS "Package metadef has been found.") return() endif() set(_cmake_targets_defined "") set(_cmake_targets_not_defined "") set(_cmake_expected_targets "") -foreach(_cmake_expected_target IN ITEMS flow_graph metadef_headers) +foreach(_cmake_expected_target IN ITEMS metadef_headers) list(APPEND _cmake_expected_targets "${_cmake_expected_target}") if(TARGET "${_cmake_expected_target}") list(APPEND _cmake_targets_defined "${_cmake_expected_target}") @@ -35,13 +35,7 @@ unset(_cmake_targets_not_defined) unset(_cmake_expected_targets) find_path(_INCLUDE_DIR - NAMES flow_graph/data_flow.h - NO_CMAKE_SYSTEM_PATH - NO_CMAKE_FIND_ROOT_PATH) - 
-find_library(flow_graph_SHARED_LIBRARY - NAMES libflow_graph.so - PATH_SUFFIXES lib64 + NAMES graph/types.h NO_CMAKE_SYSTEM_PATH NO_CMAKE_FIND_ROOT_PATH) @@ -51,7 +45,6 @@ find_package_handle_standard_args(metadef metadef_FOUND REQUIRED_VARS _INCLUDE_DIR - flow_graph_SHARED_LIBRARY ) if(metadef_FOUND) @@ -59,13 +52,6 @@ if(metadef_FOUND) include(CMakePrintHelpers) message(STATUS "Variables in metadef module:") cmake_print_variables(metadef_INCLUDE_DIR) - cmake_print_variables(flow_graph_SHARED_LIBRARY) - - add_library(flow_graph SHARED IMPORTED) - set_target_properties(flow_graph PROPERTIES - INTERFACE_LINK_LIBRARIES "metadef_headers" - IMPORTED_LOCATION "${flow_graph_SHARED_LIBRARY}" - ) add_library(metadef_headers INTERFACE IMPORTED) set_target_properties(metadef_headers PROPERTIES @@ -73,9 +59,6 @@ if(metadef_FOUND) ) include(CMakePrintHelpers) - cmake_print_properties(TARGETS flow_graph - PROPERTIES INTERFACE_LINK_LIBRARIES IMPORTED_LOCATION - ) cmake_print_properties(TARGETS metadef_headers PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ) diff --git a/inference/dataflow/py_dflow/python/dataflow/dataflow.py b/inference/dataflow/py_dflow/python/dataflow/dataflow.py index 73d70805a..8b4fb177f 100644 --- a/inference/dataflow/py_dflow/python/dataflow/dataflow.py +++ b/inference/dataflow/py_dflow/python/dataflow/dataflow.py @@ -1286,15 +1286,16 @@ class FlowGraph(object): "for details about the error information, see the ascend log.", dwrapper.INNER_ERROR, ) + flow_info.start_time = ret[2].start_time + flow_info.end_time = ret[2].end_time + flow_info.flow_flags = ret[2].flow_flags + flow_info.transaction_id = ret[2].transaction_id if ret[0].ret_code != 0 and ret[0].ret_code != dwrapper.SUBHEALTHY: log.error("failed to fetch data, error msg = %s", ret[0].error_msg) + return (outputs, flow_info, ret[0].ret_code) for output in ret[1]: outputs.append(Tensor(output)) - flow_info.start_time = ret[2].start_time - flow_info.end_time = ret[2].end_time - flow_info.flow_flags = 
ret[2].flow_flags - flow_info.transaction_id = ret[2].transaction_id return (outputs, flow_info, ret[0].ret_code) def feed( diff --git a/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py b/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py index 7c4beeab7..e1f727e19 100644 --- a/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py +++ b/inference/dataflow/py_dflow/python/dataflow/tools/tpl/tpl_cmake.py @@ -93,7 +93,7 @@ add_library(${UDF_TARGET_LIB} SHARED target_compile_options(${UDF_TARGET_LIB} PRIVATE -O2 - -std=c++11 + -std=c++17 -ftrapv -fstack-protector-all -fPIC diff --git a/inference/dataflow/py_dflow/wrapper/CMakeLists.txt b/inference/dataflow/py_dflow/wrapper/CMakeLists.txt index 87e5b99e4..920557436 100644 --- a/inference/dataflow/py_dflow/wrapper/CMakeLists.txt +++ b/inference/dataflow/py_dflow/wrapper/CMakeLists.txt @@ -29,14 +29,17 @@ set_target_properties(dflow_wrapper PREFIX "" ) +target_compile_definitions(dflow_wrapper PRIVATE + PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF +) + target_compile_options(dflow_wrapper PRIVATE -O2 - -std=c++11 -Xlinker -export-dynamic ) target_link_options(dflow_wrapper PRIVATE - -s + $<$:-s> ) project(data_wrapper) @@ -47,6 +50,10 @@ target_include_directories(data_wrapper PRIVATE ${pybind11_INCLUDE_DIR} ) +target_compile_definitions(data_wrapper PRIVATE + PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF +) + target_link_libraries(data_wrapper PRIVATE $ $ @@ -60,13 +67,11 @@ set_target_properties(data_wrapper target_compile_options(data_wrapper PRIVATE -O2 - -std=c++11 - -s -Xlinker -export-dynamic ) target_link_options(data_wrapper PRIVATE - -s + $<$:-s> ) add_subdirectory(flow_func_wrapper) diff --git a/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt b/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt index c4b3e35cd..64800eba6 100644 --- a/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt +++ 
b/inference/dataflow/py_dflow/wrapper/flow_func_wrapper/CMakeLists.txt @@ -23,13 +23,15 @@ set_target_properties(flowfunc_wrapper PREFIX "" ) +target_compile_definitions(flowfunc_wrapper PRIVATE + PYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF +) + target_compile_options(flowfunc_wrapper PRIVATE -O2 - -std=c++11 - -s -Xlinker -export-dynamic ) target_link_options(flowfunc_wrapper PRIVATE - -s + $<$:-s> ) \ No newline at end of file -- Gitee From 408eccf6434fe07c606b47646bd5db489b2fc40b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E6=99=BA=E6=85=A7?= Date: Tue, 16 Sep 2025 06:58:26 +0000 Subject: [PATCH 3/7] =?UTF-8?q?!2759=20add=20xPyD=20sample=20Merge=20pull?= =?UTF-8?q?=20request=20!2759=20from=20=E8=B5=B5=E6=99=BA=E6=85=A7/zzh=5Fa?= =?UTF-8?q?dd=5Fllm=5Fdatadist?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../10_llm_data_dist/README.md | 10 ++ .../pull_blocks_xpyd_sample.py | 150 ++++++++++++++++++ .../push_blocks_sample.py | 7 +- 3 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py diff --git a/python/level1_single_api/10_llm_data_dist/README.md b/python/level1_single_api/10_llm_data_dist/README.md index 6dc0a99d7..8b1a3f6b7 100644 --- a/python/level1_single_api/10_llm_data_dist/README.md +++ b/python/level1_single_api/10_llm_data_dist/README.md @@ -122,5 +122,15 @@ # Decoder主机: GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python switch_role_sample.py --device_id 1 --role d --local_host_ip 10.170.10.1 --remote_host_ip 10.170.10.0 ``` + - pull_blocks_xpyd_sample.py:此样例程序支持xPyD测试场景,使用单侧建链方式,每个进程申请内存并注册blocks, 每个decoder和所有的prompt发起建链, 并pull blocks到本地,local_ip_port指定本地host ip和端口, + 分别在Prompt主机与Decoder主机,执行样例程序: + ``` + # 任意个Prompt主机: + GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 0 --role p --local_ip_port 10.170.10.0:26000 + 
GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 1 --role p --local_ip_port 10.170.10.0:26001 + # 任意个Decoder主机: + GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 2 --role d --local_ip_port 10.170.10.0:26002 --remote_ip_port '10.170.10.0:26000;10.170.10.0:26001' + GLOO_SOCKET_IFNAME=enp67s0f5 HCCL_INTRA_ROCE_ENABLE=1 python pull_blocks_xpyd_sample.py --device_id 3 --role d --local_ip_port 10.170.10.0:26003 --remote_ip_port '10.170.10.0:26000;10.170.10.0:26001' + ``` **注**:**GLOO_SOCKET_IFNAME**为本地网卡名,可通过ifconfig查询;**HCCL_INTRA_ROCE_ENABLE=1**代表使用roce方式进行通信; diff --git a/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py new file mode 100644 index 000000000..aeaf76871 --- /dev/null +++ b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/pull_blocks_xpyd_sample.py @@ -0,0 +1,150 @@ +""" +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import argparse +import time +import logging +from llm_datadist import LLMDataDist, LLMRole, LLMConfig, CacheDesc, DataType, BlocksCacheKey, \ + Placement, LLMClusterInfo, LLMStatusCode +import torch +import torch_npu +import torchair +import socket +import struct + +logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO) + +NUM_TENSORS = 1 +BLOCKS_NUM = 3 +KV_SHAPE = 10 + +def ip_port_to_int(ip_port): + ip, port_str = ip_port.split(':') + port = int(port_str) + if not (0 <= port <= 65535): + raise ValueError("端口号必须在0-65535之间") + # 将IP转换为4字节二进制 + ip_bytes = socket.inet_aton(ip) + + # 将4字节IP转换为32位整数 + ip_int = struct.unpack('!I', ip_bytes)[0] + + # 组合IP整数(32位)和端口(16位)为一个48位整数 + result = (ip_int << 16) | port + return result + + +def init_llm_datadist(args) -> LLMDataDist: + datadist = LLMDataDist(role, ip_port_to_int(args.local_ip_port)) + llm_config = LLMConfig() + llm_config.device_id = args.device_id + llm_config.local_comm_res = "" + if args.role == 'p': + llm_config.listen_ip_info = args.local_ip_port + llm_options = llm_config.generate_options() + datadist.init(llm_options) + logging.info(f"init {role} success, cluster_id={ip_port_to_int(args.local_ip_port)}") + return datadist + + +def run_prompt_sample(datadist, args): + # 1. 
注册内存 + cache_manager = datadist.cache_manager + cache_desc = CacheDesc(num_tensors=NUM_TENSORS, shape=[BLOCKS_NUM, KV_SHAPE], data_type=DataType.DT_FLOAT, + placement=Placement.DEVICE) + tensor = torch.full((BLOCKS_NUM, KV_SHAPE), ip_port_to_int(args.local_ip_port), dtype=torch.float).npu() + + addr = int(tensor.data_ptr()) + cache = cache_manager.register_blocks_cache(cache_desc, [addr], + BlocksCacheKey(ip_port_to_int(args.local_ip_port), 0)) + logging.info('register_blocks_cache success') + logging.info(f'before decoder pull, tensor={tensor}') + + time.sleep(30) + cache_manager.unregister_cache(cache.cache_id) + datadist.finalize() + logging.info('[finalize] success') + + +def run_decoder_sample(datadist, args): + # 1. 注册内存 + cache_manager = datadist.cache_manager + cache_desc = CacheDesc(num_tensors=NUM_TENSORS, shape=[BLOCKS_NUM, KV_SHAPE], data_type=DataType.DT_FLOAT, + placement=Placement.DEVICE) + remote_list = args.remote_ip_port.split(';') + + tensor = torch.full((BLOCKS_NUM, KV_SHAPE), 0, dtype=torch.float).npu() + addr = int(tensor.data_ptr()) + cache = cache_manager.register_blocks_cache(cache_desc, [addr], + BlocksCacheKey(ip_port_to_int(args.local_ip_port), 0)) + logging.info('register_blocks_cache success') + + time.sleep(5) # register end + + # 2. 向所有prompt建链 + cluster_list = [] + for remote in remote_list: + cluster = LLMClusterInfo() + cluster.remote_cluster_id = ip_port_to_int(remote) + cluster.append_local_ip_info(args.local_ip_port.split(':')[0], 0) + cluster.append_remote_ip_info(remote.split(':')[0], int(remote.split(':')[1])) + cluster_list.append(cluster) + ret, _ = datadist.link_clusters(cluster_list, 5000) + if ret != LLMStatusCode.LLM_SUCCESS: + raise Exception("link failed") + + # 3. 
向prompt pull blocks + for i, remote in enumerate(remote_list): + cache_manager.pull_blocks(BlocksCacheKey(ip_port_to_int(remote), 0), + cache, src_blocks=[0, 1], dst_blocks=[0, 2]) + logging.info(f'after decoder pull from {ip_port_to_int(remote)}, tensor={tensor}') + + # 4. 断链 + ret, _ = datadist.unlink_clusters(cluster_list, 5000) + if ret != LLMStatusCode.LLM_SUCCESS: + raise Exception("unlink failed") + + cache_manager.unregister_cache(cache.cache_id) + datadist.finalize() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--device_id", type=int, default=0, help='device id') + parser.add_argument("--role", type=str, default=1, help='role type, support p/d') + parser.add_argument("--local_ip_port", type=str, help='local ip port, eg:10.10.10.1:26000') + parser.add_argument("--remote_ip_port", type=str, + help='remote host ip list, eg:10.10.10.2:26000;10.10.10.3:26000') + args = parser.parse_args() + if args.role not in ['p', 'd']: + raise RuntimeError("Not supported cluster id") + if args.device_id not in [0, 1, 2, 3, 4, 5, 6, 7]: + raise RuntimeError("Not supported device id") + if args.local_ip_port is None: + raise RuntimeError("local_ip_port is not set") + if args.role == 'd': + if args.remote_ip_port is None: + raise RuntimeError("remote_ip_port is not set") + logging.info(f'Sample start, device_id = {args.device_id}, role = {args.role}') + + torch.npu.set_device(args.device_id) + role = LLMRole.PROMPT if args.role == 'p' else LLMRole.DECODER + datadist = init_llm_datadist(args) + if role == LLMRole.PROMPT: + run_prompt_sample(datadist, args) + else: + run_decoder_sample(datadist, args) + logging.info('Sample end') diff --git a/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py index d4c971629..074c62533 100644 --- 
a/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py +++ b/python/level1_single_api/10_llm_data_dist/cache_manager_api_samples/push_blocks_sample.py @@ -73,6 +73,9 @@ def run_prompt_sample(datadist): # 2. 等decoder pull blocks dist.barrier() # decoder push blocks end + logging.info(f'after decoder push, {tensor=}') + logging.info(f'after decoder push, {tensor2=}') + # 3. 解链 cluster = LLMClusterInfo() cluster.remote_cluster_id = DECODER_CLUSTER_ID @@ -108,9 +111,7 @@ def run_decoder_sample(datadist, local_host_ip, remote_host_ip): raise Exception("link failed") # 3. 向prompt push blocks - cache_manager.pull_blocks(BlocksCacheKey(PROMPT_CLUSTER_ID, 0), cache, src_blocks=[0, 1], dst_blocks=[0, 2]) - logging.info(f'after decoder pull, {tensor=}') - logging.info(f'after decoder pull, {tensor2=}') + cache_manager.push_blocks(BlocksCacheKey(PROMPT_CLUSTER_ID, 0), cache, src_blocks=[0, 1], dst_blocks=[0, 2]) dist.barrier() # push_blocks end -- Gitee From e2cab24744795a89d969bccd709ab11405b23a18 Mon Sep 17 00:00:00 2001 From: renjie Date: Tue, 16 Sep 2025 13:28:24 +0000 Subject: [PATCH 4/7] !2761 simple add hello world samples Merge pull request !2761 from renjie/master --- .../23_simple_add/CMakeLists.txt | 15 ++ .../0_introduction/23_simple_add/README.md | 86 +++++++++ .../23_simple_add/add_custom.cpp | 180 ++++++++++++++++++ .../24_simple_hello_world/CMakeLists.txt | 15 ++ .../24_simple_hello_world/README.md | 54 ++++++ .../24_simple_hello_world/hello_world.cpp | 35 ++++ 6 files changed, 385 insertions(+) create mode 100644 operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/23_simple_add/README.md create mode 100644 operator/ascendc/0_introduction/23_simple_add/add_custom.cpp create mode 100644 operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/24_simple_hello_world/README.md create mode 100644 
operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp diff --git a/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt new file mode 100644 index 000000000..b3e88f157 --- /dev/null +++ b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.16) + +set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version") + +find_package(ASC REQUIRED) + +project(kernel_samples LANGUAGES ASC CXX) + +set_source_files_properties( + add_custom.cpp PROPERTIES LANGUAGE ASC +) + +add_executable(demo + add_custom.cpp +) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_simple_add/README.md b/operator/ascendc/0_introduction/23_simple_add/README.md new file mode 100644 index 000000000..4b3dce29f --- /dev/null +++ b/operator/ascendc/0_introduction/23_simple_add/README.md @@ -0,0 +1,86 @@ +## 简化Add算子直调样例 +本样例以Add算子为示例,展示了一种更为简单的算子编译流程,支持main函数和Kernel函数在同一个cpp文件中实现。 +> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 +## 目录结构介绍 +``` +├── 23_simple_add +│ ├── CMakeLists.txt // 编译工程文件 +│ └── add_custom.cpp // 算子实现及测试 +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输出z8 * 2048floatND
核函数名add_custom
+ +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 +- tiling实现 + TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数,本示例算子使用了2个tiling参数:totalLength、tileNum。totalLength是指需要计算的数据量大小,tileNum是指每个核上总计算数据分块个数。比如,totalLength这个参数传递到kernel侧后,可以通过除以参与计算的核数,得到每个核上的计算量,这样就完成了多核数据的切分。 + +- 调用实现 + 使用内核调用符<<<>>>调用核函数。 + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_simple_add/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 配置按安装径后,执行以下命令统一配置环境变量。 + ```bash + # 配置CANN环境变量 + source ${ASCEND_INSTALL_PATH}/bin/setenv.bash + # 添加AscendC CMake Module搜索路径至环境变量 + export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH + ``` + + - 样例执行 + ```bash + mkdir -p build && cd build; # 创建并进入build目录 + cmake ..;make -j; # 编译工程 + ./demo # 执行样例 + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/15 | 新增本readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp new file mode 100644 index 000000000..d2b5cb112 --- /dev/null +++ 
b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp @@ -0,0 +1,180 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "kernel_operator.h" + +constexpr uint32_t BUFFER_NUM = 2; // tensor num for each queue + +struct AddCustomTilingData +{ + uint32_t totalLength; + uint32_t tileNum; +}; + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = tileNum; + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * 
this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + KernelAdd op; + op.Init(x, y, z, tiling.totalLength, tiling.tileNum); + op.Process(); +} + +std::vector kernel_add(std::vector &x, std::vector &y) +{ + constexpr uint32_t blockDim = 8; + uint32_t totalLength = x.size(); + size_t totalByteSize = totalLength * sizeof(float); + int32_t deviceId = 0; + aclrtStream stream = nullptr; + AddCustomTilingData tiling = {/*totalLength:*/totalLength, /*tileNum:*/8}; + uint8_t *xHost = reinterpret_cast(x.data()); + uint8_t *yHost = reinterpret_cast(y.data()); + uint8_t *zHost = nullptr; + uint8_t *xDevice = nullptr; + uint8_t *yDevice = nullptr; + uint8_t *zDevice = nullptr; + + aclInit(nullptr); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + aclrtMallocHost((void **)(&zHost), totalByteSize); + aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); 
+ aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); + + aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); + + add_custom<<>>(xDevice, yDevice, zDevice, tiling); + aclrtSynchronizeStream(stream); + + aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST); + std::vector z((float *)zHost, (float *)(zHost + totalLength)); + + aclrtFree(xDevice); + aclrtFree(yDevice); + aclrtFree(zDevice); + aclrtFreeHost(zHost); + + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return z; +} + +uint32_t VerifyResult(std::vector &output, std::vector &golden) +{ + auto printTensor = [](std::vector &tensor, const char *name) { + constexpr size_t maxPrintSize = 20; + std::cout << name << ": "; + std::copy(tensor.begin(), tensor.begin() + std::min(tensor.size(), maxPrintSize), + std::ostream_iterator(std::cout, " ")); + if (tensor.size() > maxPrintSize) { + std::cout << "..."; + } + std::cout << std::endl; + }; + printTensor(output, "Output"); + printTensor(golden, "Golden"); + if (std::equal(output.begin(), output.end(), golden.begin())) { + std::cout << "[Success] Case accuracy is verification passed." << std::endl; + return 0; + } else { + std::cout << "[Failed] Case accuracy is verification failed!" 
<< std::endl;
+        return 1;
+    }
+    return 0;
+}
+
+int32_t main(int32_t argc, char *argv[])
+{
+    constexpr uint32_t totalLength = 8 * 2048;
+    constexpr float valueX = 1.2f;
+    constexpr float valueY = 2.3f;
+    std::vector x(totalLength, valueX);
+    std::vector y(totalLength, valueY);
+
+    std::vector output = kernel_add(x, y);
+
+    std::vector golden(totalLength, valueX + valueY);
+    return VerifyResult(output, golden);
+}
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
new file mode 100644
index 000000000..590f26516
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.16)
+
+set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version")
+
+find_package(ASC REQUIRED)
+
+project(kernel_samples LANGUAGES ASC CXX)
+
+set_source_files_properties(
+    hello_world.cpp PROPERTIES LANGUAGE ASC
+)
+
+add_executable(demo
+    hello_world.cpp
+)
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
new file mode 100644
index 000000000..53ab1d8cc
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md
@@ -0,0 +1,54 @@
+## 简化HelloWorld算子直调样例
+本样例通过使用<<<>>>内核调用符来完成算子核函数在NPU侧运行验证的基础流程,核函数内通过printf打印输出结果。
+> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。
+## 目录结构介绍
+```
+├── 24_simple_hello_world
+│   ├── CMakeLists.txt    // 编译工程文件
+│   └── hello_world.cpp   // 算子实现及测试
+```
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+
+## 运行样例算子
+  - 打开样例目录
+    以命令行方式下载样例代码,master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_hello_world/
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。
+    - 默认路径,root用户安装CANN软件包
+    ```bash
+    export
ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    ```
+    - 默认路径,非root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    ```
+    - 指定路径install_path,安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+    ```
+    配置好安装路径后,执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+  - 样例执行
+    ```bash
+    mkdir -p build && cd build;    # 创建并进入build目录
+    cmake ..;make -j;              # 编译工程
+    ./demo                         # 执行样例
+    ```
+
+## 更新说明
+| 时间 | 更新事项 |
+| ---------- | ------------ |
+| 2025/09/15 | 新增本readme |
\ No newline at end of file
diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
new file mode 100644
index 000000000..cb28dec3d
--- /dev/null
+++ b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp
@@ -0,0 +1,35 @@
+/**
+ * @file hello_world.cpp
+ *
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */ +#include "kernel_operator.h" +#include "acl/acl.h" + +__global__ __aicore__ void hello_world() +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY); + AscendC::printf("Hello World!!!\n"); +} + +int32_t main(int argc, char const *argv[]) +{ + aclInit(nullptr); + int32_t deviceId = 0; + aclrtSetDevice(deviceId); + aclrtStream stream = nullptr; + aclrtCreateStream(&stream); + + constexpr uint32_t blockDim = 1; + hello_world<<>>(); + aclrtSynchronizeStream(stream); + + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return 0; +} \ No newline at end of file -- Gitee From 7a7d686259ef4fd27dcf874e4db8da0d2cd86ee3 Mon Sep 17 00:00:00 2001 From: renjie Date: Wed, 17 Sep 2025 00:59:06 +0000 Subject: [PATCH 5/7] !2766 change case idx Merge pull request !2766 from renjie/master --- .../{23_simple_add => 25_simple_add}/CMakeLists.txt | 0 .../0_introduction/{23_simple_add => 25_simple_add}/README.md | 4 ++-- .../{23_simple_add => 25_simple_add}/add_custom.cpp | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename operator/ascendc/0_introduction/{23_simple_add => 25_simple_add}/CMakeLists.txt (100%) rename operator/ascendc/0_introduction/{23_simple_add => 25_simple_add}/README.md (99%) rename operator/ascendc/0_introduction/{23_simple_add => 25_simple_add}/add_custom.cpp (100%) diff --git a/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt similarity index 100% rename from operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt rename to operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt diff --git a/operator/ascendc/0_introduction/23_simple_add/README.md b/operator/ascendc/0_introduction/25_simple_add/README.md similarity index 99% rename from operator/ascendc/0_introduction/23_simple_add/README.md rename to operator/ascendc/0_introduction/25_simple_add/README.md index 4b3dce29f..ffa237fc2 100644 --- 
a/operator/ascendc/0_introduction/23_simple_add/README.md +++ b/operator/ascendc/0_introduction/25_simple_add/README.md @@ -3,7 +3,7 @@ > ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 ## 目录结构介绍 ``` -├── 23_simple_add +├── 25_simple_add │ ├── CMakeLists.txt // 编译工程文件 │ └── add_custom.cpp // 算子实现及测试 ``` @@ -48,7 +48,7 @@ z = x + y - 打开样例目录 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_simple_add/ + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/25_simple_add/ ``` - 配置环境变量 diff --git a/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/25_simple_add/add_custom.cpp similarity index 100% rename from operator/ascendc/0_introduction/23_simple_add/add_custom.cpp rename to operator/ascendc/0_introduction/25_simple_add/add_custom.cpp -- Gitee From a104fad229ee46a836a038ae5b354af87e3e7891 Mon Sep 17 00:00:00 2001 From: SeaElm Date: Wed, 17 Sep 2025 12:27:02 +0000 Subject: [PATCH 6/7] !2760 add AddCustomTiny sample for frameworklaunch Merge pull request !2760 from SeaElm/seaelm --- .../AddCustomTiny/CMakeLists.txt | 45 ++++++++++ .../AddCustomTiny/README.md | 86 +++++++++++++++++++ .../AddCustomTiny/add_custom_host.cpp | 56 ++++++++++++ .../AddCustomTiny/add_custom_kernel.cpp | 86 +++++++++++++++++++ .../AddCustomTiny/add_custom_tiling.h | 19 ++++ .../1_add_frameworklaunch/README.md | 2 + 6 files changed, 294 insertions(+) create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp create mode 100644 operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h diff --git 
a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt
new file mode 100644
index 000000000..38e13a85e
--- /dev/null
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 3.16.0)
+project(opp)
+
+set(ASCEND_COMPUTE_UNIT ascend910b)
+find_package(ASC REQUIRED)
+
+npu_op_package(${vendor_name}
+    TYPE RUN
+)
+
+file(GLOB host_ops_srcs ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_host.cpp)
+npu_op_code_gen(
+    SRC ${host_ops_srcs}
+    PACKAGE ${vendor_name}
+    OUT_DIR ${ASCEND_AUTOGEN_PATH}
+    OPTIONS
+        OPS_PRODUCT_NAME ${ASCEND_COMPUTE_UNIT}
+)
+
+file(GLOB autogen_aclnn_srcs ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
+set_source_files_properties(${autogen_aclnn_srcs} PROPERTIES GENERATED TRUE)
+npu_op_library(cust_opapi ACLNN
+    ${autogen_aclnn_srcs}
+)
+
+npu_op_library(cust_optiling TILING
+    ${host_ops_srcs}
+)
+
+npu_op_kernel_library(ascendc_kernels
+    SRC_BASE ${CMAKE_SOURCE_DIR}/
+    TILING_LIBRARY cust_optiling
+)
+
+npu_op_kernel_sources(ascendc_kernels
+    OP_NAME AddCustom
+    KERNEL_FILE add_custom_kernel.cpp
+)
+
+npu_op_package_add(${vendor_name}
+    LIBRARY
+        cust_opapi
+        cust_optiling
+        ascendc_kernels
+)
diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md
new file mode 100644
index 000000000..d0e4472c1
--- /dev/null
+++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md
@@ -0,0 +1,86 @@
+## 简化Add算子直调样例
+本样例以Add算子为示例,展示了简单、灵活的算子编译流程。
+**注意:本样例仅支持CANN8.3及以上版本。**
+## 目录结构介绍
+```
+├── AddCustomTiny
+│   ├── add_custom_host.cpp    // host侧编译源码文件
+│   ├── add_custom_tiling.h    // host侧编译tiling头文件
+│   ├── add_custom_kernel.cpp  // kernel侧编译源码文件
+│   ├── CMakeLists.txt         // 编译工程文件
+│   └── readme.md              // 本样例的说明文档
+```
+
+## 算子描述
+Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为:
+``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输出z8 * 2048floatND
核函数名add_custom
+
+## 代码实现介绍
+- kernel实现
+  Add算子的数学表达式为:
+  计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。
+
+  Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。
+- tiling实现
+  TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数,本示例算子使用了2个tiling参数:totalLength、tileNum。totalLength是指需要计算的数据量大小,tileNum是指每个核上总计算数据分块个数。比如,totalLength这个参数传递到kernel侧后,可以通过除以参与计算的核数,得到每个核上的计算量,这样就完成了多核数据的切分。
+
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+
+## 编译样例算子
+  - 打开样例目录
+    以命令行方式下载样例代码,master分支为例。
+    ```bash
+    cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/
+    ```
+  - 配置环境变量
+
+    请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。
+    - 默认路径,root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    ```
+    - 默认路径,非root用户安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    ```
+    - 指定路径install_path,安装CANN软件包
+    ```bash
+    export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest
+    ```
+    配置好安装路径后,执行以下命令统一配置环境变量。
+    ```bash
+    # 配置CANN环境变量
+    source ${ASCEND_INSTALL_PATH}/bin/setenv.bash
+    # 添加AscendC CMake Module搜索路径至环境变量
+    export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH
+    ```
+
+  - 样例执行
+    ```bash
+    rm -rf build && mkdir build && cd build
+    cmake ..
&& make -j binary package + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/16 | 新增readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp new file mode 100644 index 000000000..1cc87f4fb --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp @@ -0,0 +1,56 @@ +/** + * @file add_custom_host.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/tiling_api.h" + +namespace optiling { +const uint32_t BLOCK_DIM = 8; +const uint32_t TILE_NUM = 8; +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + AddCustomTilingData *tiling = context->GetTilingData(); + uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); + context->SetBlockDim(BLOCK_DIM); + tiling->totalLength = totalLength; + tiling->tileNum = TILE_NUM; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + + +namespace ops { +class AddCustom : public OpDef { +public: + explicit AddCustom(const char *name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Input("y") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Output("z") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + + this->AICore() + .SetTiling(optiling::TilingFunc) + .AddConfig("ascend910") + .AddConfig("ascend310p") + .AddConfig("ascend310b") + .AddConfig("ascend910b"); + } +}; +OP_ADD(AddCustom); +} // 
namespace ops diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp new file mode 100644 index 000000000..22a9876fe --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp @@ -0,0 +1,86 @@ +/** + * @file add_custom_kernel.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +#include "add_custom_tiling.h" +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = tileNum; + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = 
inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(AddCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + KernelAdd op; + op.Init(x, y, z, tilingData.totalLength, tilingData.tileNum); + op.Process(); +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h new file mode 100644 index 000000000..d80ecbee4 --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h @@ -0,0 +1,19 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +struct AddCustomTilingData { + uint32_t totalLength; + uint32_t tileNum; +}; + +#endif // ADD_CUSTOM_TILING_H diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md b/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md index de724d24f..5f1223f77 100644 --- a/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md @@ -9,6 +9,7 @@ │ ├── AclOfflineModel // 通过aclopExecuteV2调用的方式调用AddCustom算子 │ ├── AclOnlineModel // 通过aclopCompile调用的方式调用AddCustom算子 │ ├── AddCustom // AddCustom算子工程 +│ ├── AddCustomTiny // AddCustom自定义算子工程极简样例 │ ├── PytorchInvocation // 通过pytorch调用的方式调用AddCustom算子 │ ├── TensorflowInvocation // 通过tensorflow调用的方式调用AddCustom算子 │ ├── CppExtensionInvocation // 通过CppExtension调用的方式调用AddCustom算子 @@ -150,3 +151,4 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 | 2024/11/11 | 样例目录调整 | | 2024/11/18 | 算子工程改写为由msOpGen生成 | | 2025/01/17 | 新增CppExtensionInvocation样例 | +| 2025/9/17 | 新增AddCustomTiny极简工程样例 | \ No newline at end of file -- Gitee From d059db50fde2e76fe99336e65c1a17913e769081 Mon Sep 17 00:00:00 2001 From: Chen Ning Date: Thu, 18 Sep 2025 11:05:38 +0000 Subject: [PATCH 7/7] add namespace AscendC::tiling to sample Signed-off-by: Chen Ning --- .../MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h index 8f32f3418..4ea1394d9 100644 --- 
a/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h +++ b/operator/ascendc/0_introduction/10_matmul_frameworklaunch/MatmulCustomMultiCore/op_kernel/matmul_custom_tiling.h @@ -15,7 +15,7 @@ struct MatmulCustomTilingData { uint64_t localMemSize; - TCubeTiling cubeTilingData; + AscendC::tiling::TCubeTiling cubeTilingData; }; #endif // MATMUL_TILING_H \ No newline at end of file -- Gitee