From f79ddc2879d8975863460ecbf823deae57fc115b Mon Sep 17 00:00:00 2001 From: mengyuanli Date: Mon, 8 Sep 2025 14:38:51 +0800 Subject: [PATCH 1/3] dlopen ms_internal_kernels(compile success) use multy load not link and find ms_kernels_internal remove internal_kernels operator api fix DynamicInternalOp fix internal::DeviceAddressPtr to OutputsAddrList op_adapter createfunc return internal::InternalOpPtr go back to DynamicInternalOp, not use internal::InternalOpPtr Tiling bug fix add log for find ms lib path fix import bug add log use CreateKernel not use CreateInternalOp call cpp api; mapping cpp & c api replace c wrapper to call internal::InternalOp use c wrapper ---and --- call cpp api load so when init@graph setup@pynative fix tiling and workspace --- ccsrc/CMakeLists.txt | 18 +- ccsrc/base/CMakeLists.txt | 4 +- .../README_dynamic_loading.md | 188 ++++++++++++ .../ms_kernels_internal/dynamic_op_helper.cc | 212 +++++++++++++ .../ms_kernels_internal/dynamic_op_helper.h | 87 ++++++ .../graphmode/internal_kernel_mod.cc | 72 +++-- .../graphmode/internal_kernel_mod.h | 12 +- .../ms_kernels_internal/internal_helper.cc | 8 +- .../internal_kernels_loader.cc | 278 ++++++++++++++++++ .../internal_kernels_loader.h | 155 ++++++++++ .../pyboost/internal_pyboost_runner.cc | 86 +++--- .../pyboost/internal_pyboost_runner.h | 22 +- .../apply_rotary_pos_emb.cc | 19 +- .../ops/ms_kernels_internal/mla/mla_graph.cc | 7 +- .../ms_kernels_internal/mla/mla_pynative.cc | 7 +- .../mla_preprocess/mla_preprocess_common.h | 5 +- .../mla_preprocess/mla_preprocess_graph.cc | 2 +- .../mla_preprocess/mla_preprocess_pynative.cc | 2 +- .../moe_gating_group_topk.cc | 9 +- .../paged_cache_load_common.h | 9 +- .../paged_cache_load_graph.cc | 2 +- .../paged_cache_load_pynative.cc | 2 +- .../reshape_and_cache/reshape_and_cache.cc | 14 +- .../ms_kernels_internal/ring_mla/ring_mla.cc | 5 +- .../ms_kernels_internal/ring_mla/ring_mla.h | 2 +- .../ring_mla/ring_mla_runner.cc | 5 +- .../ring_mla/ring_mla_runner.h | 2 +- .../trans_data/trans_data.cc | 12 +- .../type_cast/type_cast.cc | 2 +- cmake/find_ms_internal_kernels_lib.cmake | 133 +++------ python/ms_custom_ops/__init__.py | 14 +- 31 files changed, 1184 insertions(+), 211 deletions(-) create mode 100644 ccsrc/base/ms_kernels_internal/README_dynamic_loading.md create mode 100644 ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc create mode 100644 ccsrc/base/ms_kernels_internal/dynamic_op_helper.h create mode 100644 ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc create mode 100644 ccsrc/base/ms_kernels_internal/internal_kernels_loader.h diff --git a/ccsrc/CMakeLists.txt b/ccsrc/CMakeLists.txt index b729fdacd..ab5be02d8 100644 --- a/ccsrc/CMakeLists.txt +++ b/ccsrc/CMakeLists.txt @@ -21,12 +21,20 @@ endif() # Include find_lib.cmake to set up MindSpore paths include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/find_ms_internal_kernels_lib.cmake) +# Ensure INTERNAL_KERNEL_LIB_PATH is available to subdirectories +if(DEFINED INTERNAL_KERNEL_LIB_PATH AND INTERNAL_KERNEL_LIB_PATH) + message(STATUS "INTERNAL_KERNEL_LIB_PATH is available for subdirectories: ${INTERNAL_KERNEL_LIB_PATH}") +else() + message(WARNING "INTERNAL_KERNEL_LIB_PATH is not available for subdirectories") +endif() + add_subdirectory(base) add_subdirectory(ops) # Set library and source variables -set(LIB_DIR ${INTERNAL_KERNEL_LIB_PATH}) -set(LIBS ${MINDSPORE_INTERNAL_KERNELS_LIB}) +# Note: We are now using dynamic loading instead of static linking +set(LIB_DIR "") +set(LIBS "") set(SRC_FILES ${BASE_SRC_FILES} ${OPS_SRC_FILES}) set(INCLUDE_DIRS ${BASE_INCLUDE_DIRS} ${INTERNAL_KERNEL_INC_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/.. "${MS_PATH}/include") @@ -53,6 +61,10 @@ foreach(INC_DIR_ITEM ${INCLUDE_DIRS}) endif() endforeach() +# Note: For dynamic loading, we don't need compile-time library path definitions +# The library will be discovered at runtime using multiple strategies +message(STATUS "Using dynamic loading - no compile-time library path required") + message(STATUS "CFLAGS_INCLUDES: ${CFLAGS_INCLUDES}") # ============================================================================= @@ -100,7 +112,7 @@ ms.ops.CustomOpBuilder( op_doc=${DOC_YAML_STRING}, backend='Ascend', cflags='${CFLAGS_INCLUDES}', - ldflags='-L${INTERNAL_KERNEL_LIB_PATH} -l${LIBS}', + ldflags='', build_dir='${BUILD_EXTENSION_DIR}', debug_mode=${ENABLE_DEBUG} ).build() diff --git a/ccsrc/base/CMakeLists.txt b/ccsrc/base/CMakeLists.txt index b2035bd40..5b800e944 100644 --- a/ccsrc/base/CMakeLists.txt +++ b/ccsrc/base/CMakeLists.txt @@ -2,6 +2,8 @@ # Base Source Files Collection # ============================================================================= +# Note: INTERNAL_KERNEL_LIB_PATH compile definition is handled in main CMakeLists.txt via CFLAGS_INCLUDES + # Collect all .cc files recursively from the base directory file(GLOB_RECURSE BASE_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.cc") @@ -31,5 +33,3 @@ set(BASE_INCLUDE_DIRS # Make include directories available to parent scope set(BASE_INCLUDE_DIRS ${BASE_INCLUDE_DIRS} PARENT_SCOPE) - -message(STATUS "BASE_INCLUDE_DIRS: ${BASE_INCLUDE_DIRS}") diff --git a/ccsrc/base/ms_kernels_internal/README_dynamic_loading.md b/ccsrc/base/ms_kernels_internal/README_dynamic_loading.md new file mode 100644 index 000000000..7846183ca --- /dev/null +++ b/ccsrc/base/ms_kernels_internal/README_dynamic_loading.md @@ -0,0 +1,188 @@ +# ms_kernels_internal 动态加载功能 + +本文档介绍了如何使用dlopen动态加载ms_kernels_internal库,分别在图模式和Pyboost模式中进行调用。 + +## 概述 + +为了实现单例延迟初始化需求,我们实现了使用dlopen动态加载ms_kernels_internal库的功能。这种实现方式有以下优点: + +1. **延迟加载**:库只在需要时才被加载,减少了启动时间和内存占用 +2. **灵活性**:可以在运行时决定是否加载库,而不是在编译时静态链接 +3. **精简设计**:直接在原有类上修改,不引入新的基类,实现更加简洁 + +## 架构设计 + +### 核心组件 + +1. **InternalKernelsLoader**:负责使用dlopen动态加载ms_kernels_internal库,并获取所需的函数指针 +2. **InternalKernelMod**:修改后的图模式内核模块,直接使用动态加载功能 +3. **InternalPyboostRunner**:修改后的Pyboost模式运行器,直接使用动态加载功能 + +### 类图 + +``` ++------------------------+ +------------------------+ +-------------------------+ +| InternalKernelsLoader | | InternalKernelMod | | InternalPyboostRunner | ++------------------------+ +------------------------+ +-------------------------+ +| - library_handle_ | | - internal_op_ | | - internal_op_ | +| - is_loaded_ | | - internal_inputs_shape_| | - op_key_ | ++------------------------+ | - internal_outputs_shape| | - tiling_key_ | +| + Initialize() | +------------------------+ | - hash_map_ | +| + LoadLibrary() | | + Init() | +-------------------------+ +| + GetFunctionPointer() | | + Resize() | | + Setup() | +| + CreateInternalOp() | | + Launch() | | + GetOrCreateKernel() | +| + Calc...Hash() | +------------------------+ | + LaunchKernel() | ++------------------------+ +-------------------------+ + ^ ^ + | | + +----------------------------------------------------------------+ +``` + +## 使用方法 + +### 1. 初始化动态加载 + +在使用动态加载功能之前,需要先初始化InternalKernelsLoader: + +```cpp +#include "internal_kernels_loader.h" + +// 获取单例 +auto &loader = InternalKernelsLoader::GetInstance(); + +// 初始化动态加载 +if (!loader.Initialize()) { + MS_LOG(ERROR) << "Failed to initialize dynamic loader"; + return -1; +} +``` + +### 2. 图模式下的使用 + +使用修改后的InternalKernelMod类: + +```cpp +#include "internal_kernel_mod.h" + +// 创建内核模块 +auto kernel_mod = std::make_shared(); + +// 正常使用内核模块 +std::vector inputs = ...; +std::vector outputs = ...; + +// 初始化 +if (!kernel_mod->Init(inputs, outputs)) { + MS_LOG(ERROR) << "Failed to initialize kernel"; + return -1; +} + +// 调整大小 +if (kernel_mod->Resize(inputs, outputs) != KRET_OK) { + MS_LOG(ERROR) << "Failed to resize kernel"; + return -1; +} + +// 启动内核 +std::vector workspace = ...; +void *stream_ptr = ...; +if (!kernel_mod->Launch(inputs, workspace, outputs, stream_ptr)) { + MS_LOG(ERROR) << "Failed to launch kernel"; + return -1; +} +``` + +### 3. Pyboost模式下的使用 + +使用修改后的InternalPyboostRunner类: + +```cpp +#include "internal_pyboost_runner.h" + +// 创建运行器 +auto runner = std::make_shared(); + +// 设置操作名称和参数 +std::string op_name = "your_op_name"; +runner->Setup(op_name, args...); + +// 准备输入输出张量 +TensorList inputs = ...; +TensorList outputs = ...; + +// 获取或创建内核 +runner->GetOrCreateKernel(inputs, outputs); + +// 启动内核 +runner->LaunchKernel(); +``` + +## 示例代码 + +完整的示例代码请参考 `dynamic_loading_example.cc` 文件。 + +## 编译和运行 + +### 编译要求 + +1. 支持C++17标准的编译器 +2. 支持dlopen的操作系统(如Linux) +3. ms_kernels_internal库的动态链接库文件(libmindspore_internal_kernels.so) + +### 编译步骤 + +1. 确保ms_kernels_internal库的动态链接库文件在系统路径中,或设置LD_LIBRARY_PATH环境变量 + +```bash +export LD_LIBRARY_PATH=/path/to/ms_kernels_internal/lib:$LD_LIBRARY_PATH +``` + +2. 编译示例代码 + +```bash +cd /home/lmy/custom_op/akg/ccsrc/base/ms_kernels_internal +g++ -std=c++17 -I./include -L./lib -o dynamic_loading_example dynamic_loading_example.cc -ldl -lmindspore_internal_kernels +``` + +3. 运行示例 + +```bash +./dynamic_loading_example +``` + +## 注意事项 + +1. **库路径**:确保ms_kernels_internal库的动态链接库文件在系统路径中,或通过LD_LIBRARY_PATH环境变量指定 +2. **线程安全**:SimpleDynamicLoader是线程安全的,可以在多线程环境中使用 +3. **错误处理**:在使用动态加载功能时,务必检查返回值 +4. **资源释放**:动态加载的库会在程序退出时自动释放 + +## 故障排除 + +### 常见错误 + +1. "Failed to load library: ..." + - 检查库文件是否存在 + - 检查LD_LIBRARY_PATH环境变量是否正确设置 + - 检查库文件的权限 + +2. "Failed to get function pointer: ..." + - 检查库文件版本是否正确 + - 检查函数名称是否正确 + +3. "Function call failed: ..." + - 检查函数参数是否正确 + - 检查库文件是否正确初始化 + +### 调试方法 + +1. 检查库加载状态 + +```cpp +auto &loader = ms_custom_ops::InternalKernelsLoader::GetInstance(); +MS_LOG(INFO) << "Library loaded: " << loader.IsInitialized(); +``` + +## 总结 + +通过使用dlopen动态加载ms_kernels_internal库,我们实现了单例延迟初始化需求,提供了更灵活的使用方式。这种实现方式直接在原有类上修改,不引入新的基类,实现更加简洁,符合精简版的设计要求。 \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc b/ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc new file mode 100644 index 000000000..bea778b82 --- /dev/null +++ b/ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc @@ -0,0 +1,212 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dynamic_op_helper.h" +#include + +namespace ms_custom_ops { + +// DynamicInternalOp implementation - direct method calls to InternalOp smart pointer + +internal::InternalStatus DynamicInternalOp::Init() { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return internal::kInternalError; + } + + // 使用mangled符号直接调用C++方法,避免编译时依赖 + auto& loader = InternalKernelsLoader::GetInstance(); + typedef internal::InternalStatus (*InitFunc)(void*); + std::string mangled_name = "_ZN9mindspore8internal10InternalOp4InitEv"; + auto init_func = loader.GetOpCreateFunction(mangled_name); + + if (!init_func) { + MS_LOG(ERROR) << "Failed to load InternalOp::Init function via mangled symbol: " << mangled_name; + return internal::kInternalError; + } + + return init_func(op_ptr_.get()); // 传递裸指针给C++方法 +} + +internal::InternalStatus DynamicInternalOp::UpdateParam(const void* param) { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return internal::kInternalError; + } + + // 使用mangled符号直接调用C++方法 + auto& loader = InternalKernelsLoader::GetInstance(); + typedef internal::InternalStatus (*UpdateParamFunc)(void*, const void*); + std::string mangled_name = "_ZN9mindspore8internal10InternalOp11UpdateParamEPKv"; + auto update_func = loader.GetOpCreateFunction(mangled_name); + + if (!update_func) { + MS_LOG(ERROR) << "Failed to load InternalOp::UpdateParam function via mangled symbol: " << mangled_name; + return internal::kInternalError; + } + + return update_func(op_ptr_.get(), param); +} + +internal::InternalStatus DynamicInternalOp::UpdateShape(const internal::ShapeInfoList &inputs_shape, + const internal::ShapeInfoList &outputs_shape) { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return internal::kInternalError; + } + + // 使用mangled符号直接调用C++方法 + auto& loader = InternalKernelsLoader::GetInstance(); + typedef internal::InternalStatus (*UpdateShapeFunc)(void*, const std::vector>&, const std::vector>&); + std::string mangled_name = "_ZN9mindspore8internal10InternalOp11UpdateShapeERKSt6vectorIS2_IlSaIlEESaIS4_EES8_"; + auto update_shape_func = loader.GetOpCreateFunction(mangled_name); + + if (!update_shape_func) { + MS_LOG(ERROR) << "Failed to load InternalOp::UpdateShape function via mangled symbol: " << mangled_name; + return internal::kInternalError; + } + + return update_shape_func(op_ptr_.get(), inputs_shape, outputs_shape); +} + +size_t DynamicInternalOp::GetTilingSize() { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return 0; + } + + auto& loader = InternalKernelsLoader::GetInstance(); + typedef size_t (*GetTilingSizeFunc)(void*); + + // Try to find GetTilingSize method via mangled symbol + std::string mangled_name = "_ZN9mindspore8internal10InternalOp13GetTilingSizeEv"; + auto get_tiling_size_func = loader.GetOpCreateFunction(mangled_name); + + if (get_tiling_size_func) { + return get_tiling_size_func(op_ptr_.get()); + } + + MS_LOG(WARNING) << "GetTilingSize: mangled symbol not found, returning default size 1024"; + return 1024; // Return a reasonable default instead of 0 +} + +internal::InternalStatus DynamicInternalOp::Tiling(internal::RawHostAddr host_ptr, + internal::HostRunInfoPtr *run_info_ptr) { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return internal::kInternalError; + } + + // 使用mangled符号直接调用C++方法 + auto& loader = InternalKernelsLoader::GetInstance(); + typedef internal::InternalStatus (*TilingFunc)(void*, void*, internal::HostRunInfoPtr*); + std::string mangled_name = "_ZN9mindspore8internal10InternalOp6TilingEPvPSt10shared_ptrINS0_11HostRunInfoEE"; + auto tiling_func = loader.GetOpCreateFunction(mangled_name); + + if (!tiling_func) { + MS_LOG(ERROR) << "Failed to load InternalOp::Tiling function via mangled symbol: " << mangled_name; + return internal::kInternalError; + } + + return tiling_func(op_ptr_.get(), host_ptr, run_info_ptr); +} + +void DynamicInternalOp::SetTilingInfo(const internal::TilingInfoPtr& tiling_info) { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return; + } + + // 使用mangled符号直接调用C++方法 + auto& loader = InternalKernelsLoader::GetInstance(); + typedef void (*SetTilingInfoFunc)(void*, const internal::TilingInfoPtr&); + std::string mangled_name = "_ZN9mindspore8internal10InternalOp13SetTilingInfoERKSt10shared_ptrINS0_10TilingInfoEE"; + auto set_tiling_func = loader.GetOpCreateFunction(mangled_name); + + if (!set_tiling_func) { + MS_LOG(ERROR) << "Failed to load InternalOp::SetTilingInfo function via mangled symbol: " << mangled_name; + return; + } + + set_tiling_func(op_ptr_.get(), tiling_info); +} + +std::vector DynamicInternalOp::GetWorkspaceSize() { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return {}; + } + + auto& loader = InternalKernelsLoader::GetInstance(); + typedef std::vector (*GetWorkspaceSizeFunc)(void*); + + // Try to find GetWorkspaceSize method via mangled symbol + std::string mangled_name = "_ZN9mindspore8internal10InternalOp16GetWorkspaceSizeEv"; + auto get_workspace_size_func = loader.GetOpCreateFunction(mangled_name); + + if (get_workspace_size_func) { + return get_workspace_size_func(op_ptr_.get()); + } + + MS_LOG(WARNING) << "GetWorkspaceSize: mangled symbol not found, returning empty vector"; + return {}; +} + +internal::InternalStatus DynamicInternalOp::Launch(const internal::InputsAddrList& inputs, + const internal::OutputsAddrList& outputs, + const internal::WsAddrList& workspace, + void* stream_ptr, const std::string& kernel_name) { + if (!op_ptr_) { + MS_LOG(ERROR) << "InternalOp pointer is null"; + return internal::kInternalError; + } + + // 使用mangled符号直接调用C++方法 + auto& loader = InternalKernelsLoader::GetInstance(); + typedef internal::InternalStatus (*LaunchFunc)(void*, const std::vector&, const std::vector&, + const std::vector&, void*, const std::string&); + std::string mangled_name = "_ZN9mindspore8internal10InternalOp6LaunchERKSt6vectorIPvSaIS3_EES7_S7_S3_RKSs"; + auto launch_func = loader.GetOpCreateFunction(mangled_name); + + if (!launch_func) { + MS_LOG(ERROR) << "Failed to load InternalOp::Launch function via mangled symbol: " << mangled_name; + return internal::kInternalError; + } + + return launch_func(op_ptr_.get(), inputs, outputs, workspace, stream_ptr, kernel_name); +} + +std::string GetMangledSymbolName(const std::string& func_name) { + // Try to find symbol by pattern matching in the loaded library + auto& loader = InternalKernelsLoader::GetInstance(); + + // First try direct name lookup (in case library has extern "C" wrappers) + void* direct_symbol = loader.GetFunctionPointer(func_name); + if (direct_symbol) { + return func_name; + } + + // If direct lookup fails, search for mangled symbols containing the function name + std::string mangled_symbol = loader.FindMangledSymbol(func_name); + if (!mangled_symbol.empty()) { + return mangled_symbol; + } + + MS_LOG(WARNING) << "No symbol found for: " << func_name; + return func_name; +} + +} // namespace ms_custom_ops \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/dynamic_op_helper.h b/ccsrc/base/ms_kernels_internal/dynamic_op_helper.h new file mode 100644 index 000000000..7fbcc0e6e --- /dev/null +++ b/ccsrc/base/ms_kernels_internal/dynamic_op_helper.h @@ -0,0 +1,87 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MS_CUSTOM_OPS_DYNAMIC_OP_HELPER_H_ +#define MS_CUSTOM_OPS_DYNAMIC_OP_HELPER_H_ + +#include "internal_kernels_loader.h" +#include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" +#include "mindspore/core/include/utils/log_adapter.h" + +namespace ms_custom_ops { + +// Forward declare for wrapper functions +class DynamicInternalOp; +using DynamicInternalOpPtr = std::shared_ptr; + +/** + * @brief Get mangled symbol name for C++ function + * @param func_name Function name (e.g., "CreateApplyRotaryPosEmbOp") + * @return Mangled symbol name + */ +std::string GetMangledSymbolName(const std::string &func_name); + +/** + * @brief Dynamic wrapper for InternalOp that uses function pointers + */ +class DynamicInternalOp { +public: + DynamicInternalOp(std::shared_ptr op_ptr) : op_ptr_(op_ptr) {} + + // Method wrappers - will be implemented using dynamic function calls + internal::InternalStatus Init(); + internal::InternalStatus UpdateParam(const void* param); + internal::InternalStatus UpdateShape(const internal::ShapeInfoList &inputs_shape, + const internal::ShapeInfoList &outputs_shape); + size_t GetTilingSize(); + internal::InternalStatus Tiling(internal::RawHostAddr host_ptr, internal::HostRunInfoPtr *run_info_ptr); + void SetTilingInfo(const internal::TilingInfoPtr& tiling_info); + std::vector GetWorkspaceSize(); + internal::InternalStatus Launch(const internal::InputsAddrList& inputs, + const internal::OutputsAddrList& outputs, + const internal::WsAddrList& workspace, + void* stream_ptr, const std::string& kernel_name); + + + private: + std::shared_ptr op_ptr_; // Smart pointer to the real InternalOp +}; + +/** + * @brief Helper macro for dynamic op creation - returns DynamicInternalOpPtr + */ +#define CALL_DYNAMIC_OP_INTERNAL(FUNC_NAME, PARAM_TYPE, inputs_param, outputs_param, ...) \ + ({ \ + auto &loader = InternalKernelsLoader::GetInstance(); \ + typedef std::shared_ptr (*FUNC_NAME##_Type)( \ + const internal::InputsImmutableInfoList &, const internal::OutputsImmutableInfoList &, \ + const internal::PARAM_TYPE &, const std::string &); \ + std::string mangled_name = GetMangledSymbolName(#FUNC_NAME); \ + auto create_func = loader.GetOpCreateFunction(mangled_name); \ + DynamicInternalOpPtr result = nullptr; \ + if (!create_func) { \ + MS_LOG(EXCEPTION) << "Failed to load " << #FUNC_NAME << " function with mangled name: " << mangled_name; \ + } else { \ + auto op_ptr = create_func(inputs_param, outputs_param, __VA_ARGS__); \ + if (op_ptr) { \ + result = std::make_shared(op_ptr); \ + } \ + } \ + result; \ + }) + +} // namespace ms_custom_ops + +#endif // MS_CUSTOM_OPS_DYNAMIC_OP_HELPER_H_ \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc index 923aed812..431332ac9 100644 --- a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc +++ b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc @@ -15,6 +15,7 @@ */ #include "internal_kernel_mod.h" +#include "internal_kernels_loader.h" #include #include #include "mindspore/core/include/utils/ms_context.h" @@ -26,6 +27,16 @@ namespace ms_custom_ops { SimpleSpinLock InternalKernelMod::lock_ = SimpleSpinLock(); bool InternalKernelMod::Init(const std::vector &inputs, const std::vector &outputs) { + // Load internal kernels library early to initialize device environment + auto& loader = InternalKernelsLoader::GetInstance(); + if (!loader.IsInitialized()) { + MS_LOG(INFO) << "Loading internal kernels library during InternalKernelMod::Init"; + if (!loader.Initialize()) { + MS_LOG(ERROR) << "Failed to load internal kernels library in Init: " << loader.GetLastError(); + return false; + } + } + auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); auto soc = ms_context->ascend_soc_version(); @@ -195,27 +206,48 @@ void InternalKernelMod::GetOrGenerateTiling(const std::vector &i void InternalKernelMod::GetInternalKernel(const std::vector &inputs, const std::vector &outputs) { - if (IsNeedRecreate(inputs, outputs)) { - internal::InputsImmutableInfoList inputs_ii; - internal::OutputsImmutableInfoList outputs_ii; - for (auto i : kernel_inputs_index_) { - auto dtype = TransInternalDataType(inputs[i]->dtype_id()); - auto format = TransInternalFormat(inputs[i]->format()); - inputs_ii.emplace_back(dtype, format); - } + // 使用动态加载的实现 + auto &loader = ms_custom_ops::InternalKernelsLoader::GetInstance(); + if (loader.Initialize()) { + if (IsNeedRecreate(inputs, outputs)) { + internal::InputsImmutableInfoList inputs_ii; + internal::OutputsImmutableInfoList outputs_ii; + std::vector input_shapes; + std::vector output_shapes; + + for (auto i : kernel_inputs_index_) { + auto dtype = TransInternalDataType(inputs[i]->dtype_id()); + auto format = TransInternalFormat(inputs[i]->format()); + inputs_ii.emplace_back(dtype, format); + auto shape = TransInternalShape(inputs[i]->GetShapeVector()); + if (inputs[i]->dtype_id() == kMetaTypeNone) { + shape = {}; + } + input_shapes.push_back(shape); + } - for (auto i : kernel_outputs_index_) { - auto dtype = TransInternalDataType(outputs[i]->dtype_id()); - auto format = TransInternalFormat(outputs[i]->format()); - outputs_ii.emplace_back(dtype, format); - } - internal_op_ = CreateKernel(inputs_ii, outputs_ii, inputs, outputs); - MS_EXCEPTION_IF_NULL(internal_op_); - auto status = internal_op_->Init(); - if (status != internal::kInternalOk) { - internal_op_ = nullptr; - MS_LOG(ERROR) << "Init InternalKernel failed, kenrel_name: " << kernel_name_; + for (auto i : kernel_outputs_index_) { + auto dtype = TransInternalDataType(outputs[i]->dtype_id()); + auto format = TransInternalFormat(outputs[i]->format()); + outputs_ii.emplace_back(dtype, format); + auto shape = TransInternalShape(outputs[i]->GetShapeVector()); + if (outputs[i]->dtype_id() == kMetaTypeNone) { + shape = {}; + } + output_shapes.push_back(shape); + } + + // 调用虚函数让子类创建具体内核 + internal_op_ = CreateKernel(inputs_ii, outputs_ii, inputs, outputs); + MS_EXCEPTION_IF_NULL(internal_op_); + auto status = internal_op_->Init(); + if (status != internal::kInternalOk) { + internal_op_ = nullptr; + MS_LOG(ERROR) << "Init InternalKernel failed, kenrel_name: " << kernel_name_; + } } + } else { + MS_LOG(ERROR) << "Failed to initialize dynamic loader: " << loader.GetLastError(); } } @@ -311,6 +343,8 @@ bool InternalKernelMod::Launch(const std::vector &inputs, const } UpdateAddr(inputs, outputs, workspace); + + // 使用动态加载的实现 internal::InternalStatus status = internal_op_->Launch(internal_inputs_addr_, internal_outputs_addr_, internal_wss_addr_, stream_ptr, fullname_); return (status == internal::InternalStatus::kInternalOk); diff --git a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h index daf69a78a..96ac2ef38 100644 --- a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h +++ b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h @@ -29,6 +29,8 @@ #include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" #include "mindspore/ccsrc/tools/profiler/profiling.h" #include "acl/acl_mdl.h" +#include "internal_kernels_loader.h" +#include "dynamic_op_helper.h" namespace ms_custom_ops { using namespace mindspore::ops; @@ -58,10 +60,10 @@ class InternalKernelMod : public KernelMod { virtual bool UpdateParam(const std::vector &inputs, const std::vector &outputs) { return true; } - virtual internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, - const std::vector &ms_inputs, - const std::vector &ms_outputs) { + virtual DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + const internal::OutputsImmutableInfoList &outputs, + const std::vector &ms_inputs, + const std::vector &ms_outputs) { return nullptr; } @@ -72,7 +74,7 @@ class InternalKernelMod : public KernelMod { std::vector kernel_inputs_index_; std::vector kernel_outputs_index_; - internal::InternalOpPtr internal_op_{nullptr}; + DynamicInternalOpPtr internal_op_{nullptr}; internal::ShapeInfoList internal_inputs_shape_; internal::ShapeInfoList internal_outputs_shape_; internal::InputsAddrList internal_inputs_addr_; diff --git a/ccsrc/base/ms_kernels_internal/internal_helper.cc b/ccsrc/base/ms_kernels_internal/internal_helper.cc index 4ffe667a4..87f33aad0 100644 --- a/ccsrc/base/ms_kernels_internal/internal_helper.cc +++ b/ccsrc/base/ms_kernels_internal/internal_helper.cc @@ -44,10 +44,7 @@ internal::DataType TransInternalDataType(TypeId ms_type) { {kNumberTypeUInt8, internal::DataType::kTypeUint8}, {kNumberTypeInt64, internal::DataType::kTypeInt64}, {kNumberTypeUInt64, internal::DataType::kTypeUint64}, - {kNumberTypeComplex64, internal::DataType::kTypeComplex64}, - {kNumberTypeComplex128, internal::DataType::kTypeComplex128}, {kNumberTypeBool, internal::DataType::kTypeBool}, - {kMetaTypeNone, internal::DataType::kTypeNone}, }; auto iter = kMSTypeToInternalType.find(ms_type); @@ -66,13 +63,10 @@ internal::TensorFormat TransInternalFormat(Format format) { {NHWC, internal::TensorFormat::kFormatNHWC}, {ND, internal::TensorFormat::kFormatND}, {NC1HWC0, internal::TensorFormat::kFormatNC1HWC0}, - {FRACTAL_Z, internal::TensorFormat::kFormatFRACTAL_Z}, + {FRACTAL_NZ, internal::TensorFormat::kFormatFRACTAL_NZ}, {NC1HWC0_C04, internal::TensorFormat::kFormatNC1HWC0_C04}, - {HWCN, internal::TensorFormat::kFormatHWCN}, {NDHWC, internal::TensorFormat::kFormatNDHWC}, - {FRACTAL_NZ, internal::TensorFormat::kFormatFRACTAL_NZ}, {NCDHW, internal::TensorFormat::kFormatNCDHW}, - {NDC1HWC0, internal::TensorFormat::kFormatNDC1HWC0}, {FRACTAL_Z_3D, internal::TensorFormat::kFormatFRACTAL_Z_3D}, }; diff --git a/ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc b/ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc new file mode 100644 index 000000000..53d65be05 --- /dev/null +++ b/ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc @@ -0,0 +1,278 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal_kernels_loader.h" +#include "dynamic_op_helper.h" +#include "mindspore/core/include/utils/log_adapter.h" +#include +#include +#include +#include +#include +#include +#include + +namespace ms_custom_ops { + +InternalKernelsLoader::~InternalKernelsLoader() { + if (library_handle_ != nullptr) { + dlclose(library_handle_); + library_handle_ = nullptr; + } +} + +bool InternalKernelsLoader::Initialize() { + if (initialized_) { + return true; + } + + if (!LoadLibrary()) { + return false; + } + + // 注意:不再加载通用的CreateInternalOp符号,因为动态库中只有具体的操作创建函数 + // 具体的函数将通过GetOpCreateFunction按需加载 + MS_LOG(INFO) << "Dynamic library loaded successfully, functions will be loaded on demand"; + + initialized_ = true; + return true; +} + +bool InternalKernelsLoader::LoadLibrary() { + // 使用多种路径策略加载库 - 加载真正包含CreateXxxOp函数的libms_kernels_internal.so + std::vector library_paths; + + // 1. 标准库路径搜索 (LD_LIBRARY_PATH) + library_paths.push_back("libms_kernels_internal.so"); + + // 2. 从环境变量获取MindSpore路径 + const char* ms_path = getenv("MINDSPORE_PATH"); + if (ms_path != nullptr) { + library_paths.push_back(std::string(ms_path) + "/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so"); + } + + // 3. 运行时Python动态检测MindSpore路径 + std::string runtime_ms_path = GetMindSporePathAtRuntime(); + if (!runtime_ms_path.empty()) { + library_paths.push_back(runtime_ms_path + "/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so"); + } + + for (const auto &path : library_paths) { + library_handle_ = dlopen(path.c_str(), RTLD_LAZY); + if (library_handle_ != nullptr) { + MS_LOG(INFO) << "Successfully loaded library from: " << path; + return true; + } + MS_LOG(DEBUG) << "Failed to load from: " << path << " - " << dlerror(); + } + + SetError("Failed to load library from all attempted paths. Last error: " + std::string(dlerror())); + return false; +} + +void *InternalKernelsLoader::GetFunctionPointer(const std::string &symbol_name) { + if (library_handle_ == nullptr) { + SetError("Library not loaded"); + return nullptr; + } + + dlerror(); // 清除之前的错误 + void *func_ptr = dlsym(library_handle_, symbol_name.c_str()); + char *error = dlerror(); + if (error != nullptr) { + SetError("Failed to get function pointer '" + symbol_name + "': " + std::string(error)); + return nullptr; + } + + return func_ptr; +} + +std::string InternalKernelsLoader::FindMangledSymbol(const std::string &func_name) { + if (library_handle_ == nullptr) { + SetError("Library not loaded"); + return ""; + } + + // 缓存符号查找结果 + static std::unordered_map symbol_cache; + auto cache_it = symbol_cache.find(func_name); + if (cache_it != symbol_cache.end()) { + return cache_it->second; + } + + // 方案1: 尝试已知的符号模式 + dlerror(); // 清除错误 + void* sym = dlsym(library_handle_, func_name.c_str()); + if (sym != nullptr) { + symbol_cache[func_name] = func_name; + return func_name; + } + + // 方案2: 调用外部 nm 命令来获取符号(运行时解析) + std::string lib_path = GetLibraryPath(); + if (!lib_path.empty()) { + std::string cmd = "nm -D " + lib_path + " 2>/dev/null | grep '" + func_name + "' | head -1 | awk '{print $3}'"; + FILE* pipe = popen(cmd.c_str(), "r"); + if (pipe) { + char buffer[1024]; + std::string result; + while (fgets(buffer, sizeof(buffer), pipe)) { + result += buffer; + } + pclose(pipe); + + // 去除换行符 + if (!result.empty() && result.back() == '\n') { + result.pop_back(); + } + + if (!result.empty()) { + // 验证找到的符号确实存在 + void* test_sym = dlsym(library_handle_, result.c_str()); + if (test_sym != nullptr) { + symbol_cache[func_name] = result; + MS_LOG(INFO) << "Found mangled symbol: " << result << " for " << func_name; + return result; + } + } + } + } + + MS_LOG(WARNING) << "Failed to find mangled symbol for: " << func_name; + return ""; // 未找到 +} + +uint64_t InternalKernelsLoader::CalcInternalOpApiHash(const std::string &op_name) { + if (!initialized_) { + SetError("Dynamic loader not initialized"); + return 0; + } + + if (calc_internal_op_api_hash_func_ == nullptr) { + SetError("CalcInternalOpApiHash function not available"); + return 0; + } + + return calc_internal_op_api_hash_func_(op_name); +} + +uint64_t InternalKernelsLoader::CalcInternalOpTilingHash( + const std::string &op_name, + const std::vector &input_shapes, + const std::vector &output_shapes) { + if (!initialized_) { + SetError("Dynamic loader not initialized"); + return 0; + } + + if (calc_internal_op_tiling_hash_func_ == nullptr) { + SetError("CalcInternalOpTilingHash function not available"); + return 0; + } + + return calc_internal_op_tiling_hash_func_(op_name, input_shapes, output_shapes); +} + +std::string InternalKernelsLoader::GetLibraryPath() { + // 使用 dladdr 获取已加载库的路径 + if (library_handle_ == nullptr) { + return ""; + } + + // 获取库中任意一个符号的地址信息 + void* symbol_addr = dlsym(library_handle_, "mindspore"); // 尝试一个通用符号 + if (!symbol_addr) { + // 如果没找到,尝试从 /proc/self/maps 解析 + FILE* maps = fopen("/proc/self/maps", "r"); + if (maps) { + char line[1024]; + while (fgets(line, sizeof(line), maps)) { + if (strstr(line, "libms_kernels_internal.so")) { + char* path = strrchr(line, ' '); + if (path) { + path++; // skip space + char* newline = strchr(path, '\n'); + if (newline) *newline = '\0'; + fclose(maps); + return std::string(path); + } + } + } + fclose(maps); + } + return ""; + } + + Dl_info info; + if (dladdr(symbol_addr, &info) && info.dli_fname) { + return std::string(info.dli_fname); + } + + return ""; +} + +std::string InternalKernelsLoader::GetMindSporePathAtRuntime() { + MS_LOG(DEBUG) << "Attempting to get MindSpore path at runtime"; + + // 执行Python命令获取MindSpore安装路径 + FILE* pipe = popen("python3 -c \"import mindspore as ms; import os; print(os.path.dirname(ms.__file__))\" 2>/dev/null", "r"); + if (pipe == nullptr) { + MS_LOG(ERROR) << "Failed to execute python3 command to get MindSpore path"; + return ""; + } + + char buffer[512]; + std::string result; + + // 读取命令输出 + if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + result = buffer; + // 移除换行符 + if (!result.empty() && result.back() == '\n') { + result.pop_back(); + } + MS_LOG(DEBUG) << "Python command returned path: " << result; + } else { + MS_LOG(ERROR) << "Failed to read output from python3 command"; + } + + pclose(pipe); + + if (result.empty()) { + MS_LOG(ERROR) << "Empty result from MindSpore path detection"; + return ""; + } + + // 验证路径是否包含mindspore且存在 + if (result.find("mindspore") != std::string::npos) { + std::string lib_path = result + "/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so"; + MS_LOG(DEBUG) << "Checking for library at: " << lib_path; + + // 检查文件是否存在 + if (access(lib_path.c_str(), F_OK) == 0) { + MS_LOG(INFO) << "Found MindSpore internal kernels library at: " << lib_path; + return result; + } else { + MS_LOG(ERROR) << "MindSpore internal kernels library not found at: " << lib_path; + } + } else { + MS_LOG(ERROR) << "Invalid MindSpore path (doesn't contain 'mindspore'): " << result; + } + + return ""; +} + +} // namespace ms_custom_ops \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/internal_kernels_loader.h b/ccsrc/base/ms_kernels_internal/internal_kernels_loader.h new file mode 100644 index 000000000..7956e297e --- /dev/null +++ b/ccsrc/base/ms_kernels_internal/internal_kernels_loader.h @@ -0,0 +1,155 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MS_CUSTOM_OPS_SIMPLE_DYNAMIC_LOADER_H +#define MS_CUSTOM_OPS_SIMPLE_DYNAMIC_LOADER_H + +#include +#include +#include +#include +#include +#include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" + +namespace ms_custom_ops { + +using namespace mindspore; + +class DynamicInternalOp; +using DynamicInternalOpPtr = std::shared_ptr; + +typedef uint64_t (*CalcInternalOpApiHashFunc)(const std::string &op_name); + +typedef uint64_t (*CalcInternalOpTilingHashFunc)(const std::string &op_name, + const std::vector &input_shapes, + const std::vector &output_shapes); + +/** + * @brief 简单的动态加载器类,用于加载ms_kernels_internal库 + */ +class InternalKernelsLoader { + public: + static InternalKernelsLoader &GetInstance() { + static InternalKernelsLoader instance; + return instance; + } + + /** + * @brief 初始化动态加载器 + * @return true 成功,false 失败 + */ + bool Initialize(); + + /** + * @brief 检查是否已初始化 + * @return true 已初始化,false 未初始化 + */ + bool IsInitialized() const { return initialized_; } + + /** + * @brief 获取错误信息 + * @return 错误信息 + */ + const std::string &GetLastError() const { return last_error_; } + + + /** + * @brief 计算内部操作API哈希值 + * @param op_name 操作名称 + * @return 哈希值 + */ + uint64_t CalcInternalOpApiHash(const std::string &op_name); + + /** + * @brief 计算内部操作Tiling哈希值 + * @param op_name 操作名称 + * @param input_shapes 输入形状列表 + * @param output_shapes 输出形状列表 + * @return 哈希值 + */ + uint64_t CalcInternalOpTilingHash(const std::string &op_name, + const std::vector &input_shapes, + const std::vector &output_shapes); + + /** + * @brief 获取特定操作创建函数指针 + * @param op_name 操作名称 (如 "CreateAsdReshapeAndCacheOp") + * @return 函数指针,失败返回nullptr + */ + template + FuncType GetOpCreateFunction(const std::string &op_name) { + if (!initialized_ && !Initialize()) { + return nullptr; + } + return reinterpret_cast(GetFunctionPointer(op_name)); + } + + /** + * @brief 获取函数指针 + * @param symbol_name 符号名称 + * @return 函数指针 + */ + void *GetFunctionPointer(const std::string &symbol_name); + + /** + * @brief 通过模式匹配查找mangled符号名 + * @param func_name 函数名 (如 "CreateApplyRotaryPosEmbOp") + * @return mangled符号名,找不到返回空字符串 + */ + std::string FindMangledSymbol(const std::string &func_name); + + private: + InternalKernelsLoader() = default; + ~InternalKernelsLoader(); + InternalKernelsLoader(const InternalKernelsLoader &) = delete; + InternalKernelsLoader &operator=(const InternalKernelsLoader &) = delete; + + /** + * @brief 加载库 + * @return true 成功,false 失败 + */ + bool LoadLibrary(); + + /** + * @brief 运行时获取MindSpore安装路径 + * @return MindSpore安装路径,失败返回空字符串 + */ + std::string GetMindSporePathAtRuntime(); + + /** + * @brief 获取当前加载库的文件路径 + * @return 库文件路径,失败返回空字符串 + */ + std::string GetLibraryPath(); + + /** + * @brief 设置错误信息 + * @param error 错误信息 + */ + void SetError(const std::string &error) { last_error_ = error; } + + void *library_handle_ = nullptr; + bool initialized_ = false; + std::string last_error_; + + // 函数指针 + CalcInternalOpApiHashFunc calc_internal_op_api_hash_func_ = nullptr; + CalcInternalOpTilingHashFunc calc_internal_op_tiling_hash_func_ = nullptr; +}; + +} // namespace ms_custom_ops + +#endif // MS_CUSTOM_OPS_SIMPLE_DYNAMIC_LOADER_H \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc index f7337764a..09eebef2f 100644 --- a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc +++ b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc @@ -15,50 +15,60 @@ */ #include "internal_pyboost_runner.h" +#include "internal_kernels_loader.h" namespace ms_custom_ops { void InternalPyboostRunner::GetOrCreateKernel(const TensorList &inputs, const TensorList &outputs) { - auto key = GetOrGenerateOpKey(op_key_); - auto it = hash_map_.find(key); - if (it != hash_map_.end()) { - internal_op_ = it->second; - MS_LOG(DEBUG) << "Internal Op [" << this->op_name() << "] hit cache"; - } else { - MS_LOG(DEBUG) << "Internal Op [" << this->op_name() << "] miss cache"; - TransDataType(inputs, outputs); - UpdateArgImmutableInfo(&inputs_ii_, inputs, true); - UpdateArgImmutableInfo(&outputs_ii_, outputs); - internal_op_ = CreateKernel(inputs_ii_, outputs_ii_); - MS_EXCEPTION_IF_NULL(internal_op_); - auto status = internal_op_->Init(); - if (status != mindspore::internal::kInternalOk) { - internal_op_ = nullptr; - MS_LOG(EXCEPTION) << "Init internal kernel failed, kernel_name: " - << this->op_name(); - return; + // 使用动态加载的实现 + auto &loader = ms_custom_ops::InternalKernelsLoader::GetInstance(); + if (loader.Initialize()) { + auto key = GetOrGenerateOpKey(op_key_); + auto it = hash_map_.find(key); + if (it != hash_map_.end()) { + internal_op_ = it->second; + MS_LOG(DEBUG) << "Internal Op [" << this->op_name() << "] hit cache"; + } else { + MS_LOG(DEBUG) << "Internal Op [" << this->op_name() << "] miss cache"; + TransDataType(inputs, outputs); + UpdateArgImmutableInfo(&inputs_ii_, inputs, true); + UpdateArgImmutableInfo(&outputs_ii_, outputs); + + // 准备形状信息 + internal_inputs_shape_.clear(); + internal_outputs_shape_.clear(); + internal_inputs_shape_.resize(inputs.size()); + internal_outputs_shape_.resize(outputs.size()); + TransInternalShapes(&internal_inputs_shape_, inputs, true); + TransInternalShapes(&internal_outputs_shape_, outputs, false); + + // 调用虚函数让子类创建具体内核 + internal_op_ = CreateKernel(inputs_ii_, outputs_ii_); + MS_EXCEPTION_IF_NULL(internal_op_); + auto status = internal_op_->Init(); + if (status != mindspore::internal::kInternalOk) { + internal_op_ = nullptr; + MS_LOG(EXCEPTION) << "Init internal kernel failed, kernel_name: " + << this->op_name(); + return; + } + hash_map_[key] = internal_op_; } - hash_map_[key] = internal_op_; - } - internal_inputs_shape_.clear(); - internal_outputs_shape_.clear(); - internal_inputs_shape_.resize(inputs.size()); - internal_outputs_shape_.resize(outputs.size()); - TransInternalShapes(&internal_inputs_shape_, inputs, true); - TransInternalShapes(&internal_outputs_shape_, outputs, false); + if (!UpdateParam()) { + MS_LOG(EXCEPTION) << "UpdateParam failed, kernel_name: " << this->op_name(); + } + auto internal_ret = internal_op_->UpdateShape(internal_inputs_shape_, + internal_outputs_shape_); + if (internal_ret != mindspore::internal::kInternalOk) { + MS_LOG(EXCEPTION) << "InternalKernel UpdateShape failed, kernel_name: " + << this->op_name(); + } - if (!UpdateParam()) { - MS_LOG(EXCEPTION) << "UpdateParam failed, kernel_name: " << this->op_name(); - } - auto internal_ret = internal_op_->UpdateShape(internal_inputs_shape_, - internal_outputs_shape_); - if (internal_ret != mindspore::internal::kInternalOk) { - MS_LOG(EXCEPTION) << "InternalKernel UpdateShape failed, kernel_name: " - << this->op_name(); + tiling_cache_item_ = GetOrGenerateTiling(); + } else { + MS_LOG(ERROR) << "Failed to initialize dynamic loader: " << loader.GetLastError(); } - - tiling_cache_item_ = GetOrGenerateTiling(); } size_t InternalPyboostRunner::CalcWorkspace() { @@ -203,7 +213,7 @@ void InternalPyboostRunner::UpdateArgImmutableInfo( } void InternalPyboostRunner::GetWorkspace( - const internal::InternalOpPtr &internal_op, + const DynamicInternalOpPtr &internal_op, internal::WsAddrList *internal_wss_addr) { auto workspace_ptr = this->workspace_ptr(); if (workspace_ptr == nullptr) { @@ -224,6 +234,8 @@ void InternalPyboostRunner::GetWorkspace( void InternalPyboostRunner::LaunchKernel() { MS_EXCEPTION_IF_NULL(tiling_cache_item_); MS_EXCEPTION_IF_NULL(internal_op_); + + // 使用动态加载的实现 internal::InputsAddrList inputs_addr; internal::OutputsAddrList outputs_addr; InternalPyboostRunner::UpdateAddr(&inputs_addr, this->inputs()); diff --git a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h index c5ff5b145..4baf356f1 100644 --- a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h +++ b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h @@ -25,10 +25,12 @@ #include "internal_pyboost_utils.h" #include "internal_spinlock.h" #include "internal_tiling_cache.h" +#include "internal_kernels_loader.h" #include "module.h" #include "mindspore/ccsrc/ms_extension/api.h" #include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" #include "ccsrc/base/ms_kernels_internal/internal_helper.h" +#include "dynamic_op_helper.h" namespace ms_custom_ops { using namespace mindspore; @@ -43,6 +45,16 @@ class InternalPyboostRunner : public ms::pynative::PyboostRunner { // calculating hash keys template void Setup(const std::string &op_name, const Args &...args) { + // Load internal kernels library early to initialize device environment + auto& loader = InternalKernelsLoader::GetInstance(); + if (!loader.IsInitialized()) { + MS_LOG(INFO) << "Loading internal kernels library during InternalPyboostRunner::Setup"; + if (!loader.Initialize()) { + MS_LOG(ERROR) << "Failed to load internal kernels library in Setup: " << loader.GetLastError(); + return; + } + } + // Calculate hash keys this->op_key_ = CalcInternalOpApiHash(op_name, args...); this->tiling_key_ = CalcInternalOpTilingHash(op_name, args...); @@ -63,8 +75,8 @@ class InternalPyboostRunner : public ms::pynative::PyboostRunner { void TransDataType(const TensorList &ms_inputs, const TensorList &ms_outputs); TilingCacheItemPtr GetOrGenerateTiling(); - virtual internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) = 0; + virtual DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + const internal::OutputsImmutableInfoList &outputs) = 0; void TransInternalShapes(internal::ShapeInfoList *shapelist, const TensorList &tensorlist, bool is_input = false); static void UpdateAddr(std::vector *addrlist, const TensorList &tensorlist) { @@ -78,14 +90,14 @@ class InternalPyboostRunner : public ms::pynative::PyboostRunner { } } - void GetWorkspace(const internal::InternalOpPtr &internal_op, internal::WsAddrList *internal_wss_addr); + void GetWorkspace(const DynamicInternalOpPtr &internal_op, internal::WsAddrList *internal_wss_addr); void LaunchKernel() override; uint64_t op_key_{0}; uint64_t tiling_key_{0}; - internal::InternalOpPtr internal_op_{nullptr}; - inline static std::unordered_map hash_map_; + DynamicInternalOpPtr internal_op_{nullptr}; + inline static std::unordered_map hash_map_; internal::DtypeInfoList internal_inputs_dtype_; internal::DtypeInfoList internal_outputs_dtype_; internal::ShapeInfoList internal_inputs_shape_; diff --git a/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc b/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc index 89203e1a4..4028821fa 100644 --- a/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc +++ b/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc @@ -24,6 +24,7 @@ #include #include "ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h" +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" #include "ccsrc/utils/utils.h" #include "mindspore/core/include/mindapi/ir/tensor.h" #include "mindspore/ops/kernel/ascend/acl_ir/acl_convert.h" @@ -81,10 +82,10 @@ class CustomApplyRotaryPosEmb : public InternalKernelMod { } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, - const std::vector &ms_inputs, - const std::vector &ms_outputs) override { + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + const internal::OutputsImmutableInfoList &outputs, + const std::vector &ms_inputs, + const std::vector &ms_outputs) override { internal::ApplyRotaryPosEmbParam param; auto cos_format = ms_inputs.at(static_cast(ApplyRotaryPosEmbQueryInputIndex::kApplyRotaryPosEmbCosFormatIndex)); @@ -94,7 +95,8 @@ class CustomApplyRotaryPosEmb : public InternalKernelMod { MS_LOG(EXCEPTION) << "ApplyRotaryPosEmb [cos_format]'s dtype wrong, expect int64, but got: " << cos_format->dtype_id(); } - return internal::CreateApplyRotaryPosEmbOp(inputs, outputs, param, internal::kInternalApplyRotaryPosEmbOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateApplyRotaryPosEmbOp, ApplyRotaryPosEmbParam, inputs, outputs, param, + internal::kInternalApplyRotaryPosEmbOpName); } }; } // namespace ms_custom_ops @@ -117,11 +119,12 @@ class ApplyRotaryPosEmbRunner : public InternalPyboostRunner { void SetCosFormat(const int32_t &cos_format) { this->cos_format_ = cos_format; } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + const internal::OutputsImmutableInfoList &outputs) override { internal::ApplyRotaryPosEmbParam param; param.cos_format = this->cos_format_; - return internal::CreateApplyRotaryPosEmbOp(inputs, outputs, param, internal::kInternalApplyRotaryPosEmbOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateApplyRotaryPosEmbOp, ApplyRotaryPosEmbParam, inputs, outputs, param, + internal::kInternalApplyRotaryPosEmbOpName); } private: diff --git a/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc b/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc index a7b26464d..2b64b3b32 100644 --- a/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc +++ b/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc @@ -22,6 +22,7 @@ #include "ccsrc/ops/ms_kernels_internal/mla/mla_common.h" #include "ccsrc/ops/ms_kernels_internal/utils/attention_utils.h" #include "ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h" +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" #include "mindspore/core/include/ir/tensor.h" #include "mindspore/ops/kernel/ascend/acl_ir/acl_convert.h" #include "mindspore/ops/ops_utils/op_utils.h" @@ -197,7 +198,7 @@ class Mla : public InternalKernelMod { ~Mla() = default; protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { @@ -218,10 +219,10 @@ class Mla : public InternalKernelMod { auto inputs_new = inputs; inputs_new[kMlaInputKvCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); inputs_new[kMlaInputKropeIndex].SetFormat(internal::kFormatFRACTAL_NZ); - return internal::CreateMLAOp(inputs_new, outputs, param_, internal::kInternalMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs_new, outputs, param_, internal::kInternalMLAOpName); } - return internal::CreateMLAOp(inputs, outputs, param_, internal::kInternalMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs, outputs, param_, internal::kInternalMLAOpName); } bool UpdateParam(const std::vector &inputs, const std::vector &outputs) override { diff --git a/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc b/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc index d6c51be49..c9c4928d2 100644 --- a/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc +++ b/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc @@ -20,6 +20,7 @@ #include #include "ccsrc/ops/ms_kernels_internal/mla/mla_common.h" #include "ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h" +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" #include "ccsrc/utils/utils.h" #include "mindspore/ccsrc/ms_extension/api.h" #include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" @@ -64,16 +65,16 @@ class MlaRunner : public InternalPyboostRunner { } } - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override { created_flag_ = true; if (input_format_ == kKVFormatNZ) { auto inputs_new = inputs; inputs_new[kMlaInputKvCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); inputs_new[kMlaInputKropeIndex].SetFormat(internal::kFormatFRACTAL_NZ); - return internal::CreateMLAOp(inputs_new, outputs, param_, internal::kInternalMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs_new, outputs, param_, internal::kInternalMLAOpName); } - return mindspore::internal::CreateMLAOp(inputs, outputs, param_, internal::kInternalMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs, outputs, param_, internal::kInternalMLAOpName); } private: diff --git a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h index e6b0967c1..e53175e41 100644 --- a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h +++ b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h @@ -18,6 +18,7 @@ #define __MS_CUSTOM_OPS_CCSRC_OPS_MS_KERNELS_INTERNAL_MLA_PREPROCESS_H__ #include +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" namespace ms_custom_ops { enum MlaPreprocessInputIndex : size_t { @@ -63,7 +64,7 @@ constexpr int64_t kMlaPreCacheModeQK = 0; constexpr int64_t kMlaPreCacheModeQKSplitQuant = 2; constexpr int64_t kMlaPreCacheModeQKSplitNz = 3; -inline internal::InternalOpPtr CreateMlaPreprocessOpWithFormat(const internal::InputsImmutableInfoList &inputs, +inline DynamicInternalOpPtr CreateMlaPreprocessOpWithFormat(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const internal::MlaPreprocessParam ¶m) { auto inputs_clone = inputs; @@ -73,7 +74,7 @@ inline internal::InternalOpPtr CreateMlaPreprocessOpWithFormat(const internal::I inputs_clone[kMlaPreprocessKeyCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); inputs_clone[kMlaPreprocessKropeCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); } - return internal::CreateMlaPreprocessOp(inputs_clone, outputs, param, internal::kInternalMlaPreprocessOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMlaPreprocessOp, MlaPreprocessParam, inputs_clone, outputs, param, internal::kInternalMlaPreprocessOpName); }; diff --git a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc index 7ea246fd9..5013ff218 100644 --- a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc +++ b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc @@ -69,7 +69,7 @@ public: kMlaPreprocessOutputQropeIndex, kMlaPreprocessOutputKropeIndex}; } protected: - internal::InternalOpPtr + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, diff --git a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc index fc37c4e75..e85a7daf1 100644 --- a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc +++ b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc @@ -29,7 +29,7 @@ public: void SetParamCacheMode(const int32_t &cache_mode) { this->cache_mode_ = cache_mode; } internal::MlaPreprocessParam param_; protected: - internal::InternalOpPtr + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override { return CreateMlaPreprocessOpWithFormat(inputs, outputs, param_); diff --git a/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc b/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc index 9874751de..956515632 100644 --- a/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc +++ b/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc @@ -24,6 +24,7 @@ #include #include "ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h" +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" #include "ccsrc/utils/utils.h" #include "mindspore/core/include/mindapi/ir/tensor.h" #include "mindspore/ops/kernel/ascend/acl_ir/acl_convert.h" @@ -101,7 +102,7 @@ class CustomMoeGatingGroupTopK : public InternalKernelMod { } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { @@ -143,7 +144,7 @@ class CustomMoeGatingGroupTopK : public InternalKernelMod { << TypeIdToString(out_flag->dtype_id()) << ", " << TypeIdToString(routed_scaling_factor->dtype_id()) << ", " << TypeIdToString(eps->dtype_id()) << "]"; } - return internal::CreateMoeGatingGroupTopKOp(inputs, outputs, param, internal::kInternalMoeGatingGroupTopKOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMoeGatingGroupTopKOp, MoeGatingGroupTopKParam, inputs, outputs, param, internal::kInternalMoeGatingGroupTopKOpName); } }; } // namespace ms_custom_ops @@ -178,9 +179,9 @@ class MoeGatingGroupTopKRunner : public InternalPyboostRunner { } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override { - return internal::CreateMoeGatingGroupTopKOp(inputs, outputs, param_, internal::kInternalMoeGatingGroupTopKOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMoeGatingGroupTopKOp, MoeGatingGroupTopKParam, inputs, outputs, param_, internal::kInternalMoeGatingGroupTopKOpName); } private: diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h index fb1dfc5eb..a46e71f7c 100644 --- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h +++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h @@ -18,6 +18,7 @@ #define __MS_CUSTOM_OPS_CCSRC_OPS_MS_KERNELS_INTERNAL_PAGED_CACHE_LOAD_H__ #include +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" namespace ms_custom_ops { enum PagedCacheLoadInputIndex : size_t { @@ -40,16 +41,18 @@ enum PagedCacheLoadOutputIndex : size_t { kPCLOutputsNum }; -inline internal::InternalOpPtr CreatePagedCacheLoadOpWithFormat(const internal::InputsImmutableInfoList &inputs, +inline DynamicInternalOpPtr CreatePagedCacheLoadOpWithFormat(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const internal::PagedCacheLoadParam ¶m) { if (param.kv_cache_cfg_type == 1) { auto inputs_clone = inputs; inputs_clone[kPCLInputKeyCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); inputs_clone[kPCLInputValueCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); - return internal::CreatePagedCacheLoadOp(inputs_clone, outputs, param, internal::kInternalPagedCacheLoadOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreatePagedCacheLoadOp, PagedCacheLoadParam, inputs_clone, outputs, param, + internal::kInternalPagedCacheLoadOpName); } - return internal::CreatePagedCacheLoadOp(inputs, outputs, param, internal::kInternalPagedCacheLoadOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreatePagedCacheLoadOp, PagedCacheLoadParam, inputs, outputs, param, + internal::kInternalPagedCacheLoadOpName); }; } // namespace ms_custom_ops #endif diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc index ff915949f..4eda56fff 100644 --- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc +++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc @@ -80,7 +80,7 @@ public: } protected: - internal::InternalOpPtr + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc index d35081470..c989c8ded 100644 --- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc +++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc @@ -33,7 +33,7 @@ public: void SetHasSeqStarts(const bool &has_seq_starts) { this->has_seq_starts_ = has_seq_starts; } internal::PagedCacheLoadParam param_; protected: - internal::InternalOpPtr + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override { return CreatePagedCacheLoadOpWithFormat(inputs, outputs, param_); diff --git a/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc b/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc index 7898dc00a..cb0c2a56e 100644 --- a/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc +++ b/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc @@ -20,6 +20,7 @@ #include #include "ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h" +#include "ccsrc/base/ms_kernels_internal/internal_kernels_loader.h" #include "ccsrc/utils/utils.h" #include "mindspore/core/include/mindapi/ir/tensor.h" #include "mindspore/ops/kernel/ascend/acl_ir/acl_convert.h" @@ -51,7 +52,7 @@ enum class InputIndex : size_t { enum class OutputIndex : size_t { kOutputIndex = 0 }; -inline internal::InternalOpPtr CreateReshapeAndCacheOpWithFormat(const internal::InputsImmutableInfoList &inputs, +inline DynamicInternalOpPtr CreateReshapeAndCacheOpWithFormat(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const internal::ReshapeAndCacheParam ¶m, int32_t cache_mode) { @@ -59,10 +60,11 @@ inline internal::InternalOpPtr CreateReshapeAndCacheOpWithFormat(const internal: auto inputs_clone = inputs; inputs_clone[static_cast(InputIndex::kInputKeyCacheIndex)].SetFormat(internal::kFormatFRACTAL_NZ); inputs_clone[static_cast(InputIndex::kInputValueCacheIndex)].SetFormat(internal::kFormatFRACTAL_NZ); - return internal::CreateAsdReshapeAndCacheOp(inputs_clone, outputs, param, - internal::kInternalAsdReshapeAndCacheOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateAsdReshapeAndCacheOp, ReshapeAndCacheParam, + inputs_clone, outputs, param, internal::kInternalAsdReshapeAndCacheOpName); } - return internal::CreateAsdReshapeAndCacheOp(inputs, outputs, param, internal::kInternalAsdReshapeAndCacheOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateAsdReshapeAndCacheOp, ReshapeAndCacheParam, + inputs, outputs, param, internal::kInternalAsdReshapeAndCacheOpName); } // ============================================================================= @@ -126,7 +128,7 @@ class CustomReshapeAndCache : public InternalKernelMod { } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { @@ -172,7 +174,7 @@ class ReshapeAndCacheRunner : public InternalPyboostRunner { void SetCacheMode(const int32_t &cache_mode) { this->cache_mode_ = cache_mode; } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override { internal::ReshapeAndCacheParam param; param.head_num = this->head_num_; diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc index d74dfb000..dc4b00805 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc @@ -15,6 +15,7 @@ */ #include "ring_mla.h" +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" namespace ms_custom_ops { @@ -219,7 +220,7 @@ static bool GetSeqLenFromInputAndCheckUpdate(const std::string &kernel_name, con return true; } -internal::InternalOpPtr CustomRingMLA::CreateKernel(const internal::InputsImmutableInfoList &inputs_ii, +DynamicInternalOpPtr CustomRingMLA::CreateKernel(const internal::InputsImmutableInfoList &inputs_ii, const internal::OutputsImmutableInfoList &outputs_ii, const std::vector &ms_inputs, const std::vector &ms_outputs) { @@ -242,7 +243,7 @@ internal::InternalOpPtr CustomRingMLA::CreateKernel(const internal::InputsImmuta "parameters, kernel_name: ", kernel_name_)); created_flag_ = true; - return internal::CreateRingMLAOp(inputs_ii, outputs_ii, param_, internal::kInternalRingMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateRingMLAOp, RingMLAParam, inputs_ii, outputs_ii, param_, internal::kInternalRingMLAOpName); } bool CustomRingMLA::UpdateParam(const std::vector &inputs, diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h index 2d8ef4a7e..c399489c7 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h @@ -107,7 +107,7 @@ class CustomRingMLA : public InternalKernelMod { void InitKernelInputsOutputsIndex() override; protected: - internal::InternalOpPtr CreateKernel( + DynamicInternalOpPtr CreateKernel( const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc index 03c4d2f8d..9c06fade7 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc @@ -15,6 +15,7 @@ */ #include "ring_mla_runner.h" +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" #include "ccsrc/utils/utils.h" using namespace ms_custom_ops; @@ -77,10 +78,10 @@ bool RingMLARunner::UpdateParam() { return true; } -internal::InternalOpPtr RingMLARunner::CreateKernel(const internal::InputsImmutableInfoList &inputs, +DynamicInternalOpPtr RingMLARunner::CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) { created_flag_ = true; - return internal::CreateRingMLAOp(inputs, outputs, param_, internal::kInternalRingMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateRingMLAOp, RingMLAParam, inputs, outputs, param_, internal::kInternalRingMLAOpName); } namespace { diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h index 8dac1bcb0..89d96d9ed 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h @@ -43,7 +43,7 @@ class RingMLARunner : public InternalPyboostRunner { protected: bool UpdateParam() override; - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override; private: diff --git a/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc b/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc index a5d6f6fbe..942256176 100644 --- a/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc +++ b/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc @@ -20,6 +20,7 @@ #include #include "ccsrc/utils/utils.h" #include "ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h" +#include "ccsrc/base/ms_kernels_internal/dynamic_op_helper.h" #include "mindspore/core/include/mindapi/ir/tensor.h" #include "mindspore/ops/kernel/ascend/acl_ir/acl_convert.h" #include "mindspore/ops/ops_utils/op_utils.h" @@ -46,7 +47,7 @@ enum class InputIndex : size_t { enum class OutputIndex : size_t { kOutputIndex = 0 }; -inline internal::InternalOpPtr CreateTransDataOpWithParam(const internal::InputsImmutableInfoList &inputs, +inline DynamicInternalOpPtr CreateTransDataOpWithParam(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, int32_t transdata_type) { internal::TransDataParam param; @@ -73,8 +74,9 @@ inline internal::InternalOpPtr CreateTransDataOpWithParam(const internal::Inputs // Note: outCrops are handled internally by the ms_kernels_internal layer // Users do not need to specify outCrops - they are auto-calculated param.specialTransdata = internal::TransDataParam::NORMAL; - - return internal::CreateTransDataOp(inputs_clone, outputs_clone, param, internal::kInternalTransDataOpName); + + return CALL_DYNAMIC_OP_INTERNAL(CreateTransDataOp, TransDataParam, inputs_clone, outputs_clone, param, + internal::kInternalTransDataOpName); } // ============================================================================= @@ -135,7 +137,7 @@ class CustomTransData : public InternalKernelMod { } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { @@ -172,7 +174,7 @@ class TransDataRunner : public InternalPyboostRunner { void SetTransdataType(const int32_t &transdata_type) { this->transdata_type_ = transdata_type; } protected: - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override { return CreateTransDataOpWithParam(inputs, outputs, this->transdata_type_); } diff --git a/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc b/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc index f7b05922b..2dd552436 100644 --- a/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc +++ b/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc @@ -130,7 +130,7 @@ public: protected: size_t CalcWorkspace() override { return 0; } - internal::InternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, + DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, const internal::OutputsImmutableInfoList &outputs) override { return nullptr; } diff --git a/cmake/find_ms_internal_kernels_lib.cmake b/cmake/find_ms_internal_kernels_lib.cmake index 76a479e18..05f8e9216 100644 --- a/cmake/find_ms_internal_kernels_lib.cmake +++ b/cmake/find_ms_internal_kernels_lib.cmake @@ -1,106 +1,71 @@ # ============================================================================= -# Find MindSpore Internal Kernels Library +# Find MindSpore Internal Kernels Library (Dynamic Loading Configuration) # ============================================================================= -# Find Python to get MindSpore installation path +# Find Python for potential runtime detection find_package(Python3 COMPONENTS Interpreter REQUIRED) -# Allow user to override MindSpore path +# ============================================================================= +# Dynamic Loading Configuration +# ============================================================================= + +# For dynamic loading, we don't need to find MindSpore at compile time +# The library will be discovered and loaded at runtime using dlopen + +# Set up MindSpore paths - try multiple sources +set(MS_PATH "") +set(INTERNAL_KERNEL_INC_PATH "") + +# Try user-provided MINDSPORE_PATH first if(DEFINED ENV{MINDSPORE_PATH}) set(MS_PATH $ENV{MINDSPORE_PATH}) - message(STATUS "Using MINDSPORE_PATH environment variable: ${MS_PATH}") + message(STATUS "Using user-provided MINDSPORE_PATH: ${MS_PATH}") else() - # Get MindSpore installation path using Python - get the last line of output + # Auto-detect MindSpore installation path via Python execute_process( - COMMAND ${Python3_EXECUTABLE} -c "import mindspore as ms; print(ms.__file__)" - OUTPUT_VARIABLE MS_MODULE_PATH_RAW + COMMAND ${Python3_EXECUTABLE} -c "import mindspore; print(mindspore.__path__[0])" + OUTPUT_VARIABLE PYTHON_MINDSPORE_PATH OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE PYTHON_RESULT - ERROR_VARIABLE PYTHON_ERROR + ERROR_QUIET ) + if(PYTHON_MINDSPORE_PATH) + set(MS_PATH "${PYTHON_MINDSPORE_PATH}") + message(STATUS "Auto-detected MindSpore path via Python: ${MS_PATH}") + endif() +endif() + +# Set up include paths if MindSpore path is available +if(MS_PATH AND NOT MS_PATH STREQUAL "") + set(INTERNAL_KERNEL_ROOT_PATH "${MS_PATH}/lib/plugin/ascend/ms_kernels_internal/internal_kernel") + set(INTERNAL_KERNEL_LIB_PATH "${MS_PATH}/lib/plugin/ascend") + set(INTERNAL_KERNEL_INC_PATH "${INTERNAL_KERNEL_ROOT_PATH}" "${INTERNAL_KERNEL_ROOT_PATH}/include") - # Extract the last non-empty line which should be the MindSpore path - string(REPLACE "\n" ";" OUTPUT_LINES "${MS_MODULE_PATH_RAW}") - - # Find the last non-empty line - set(MS_MODULE_PATH "") - foreach(LINE ${OUTPUT_LINES}) - string(STRIP "${LINE}" STRIPPED_LINE) - if(NOT STRIPPED_LINE STREQUAL "") - set(MS_MODULE_PATH "${STRIPPED_LINE}") + # Check if paths exist (non-fatal for dynamic loading) + foreach(INCLUDE_PATH ${INTERNAL_KERNEL_INC_PATH}) + if(NOT EXISTS ${INCLUDE_PATH}) + message(WARNING "Include path does not exist: ${INCLUDE_PATH}") endif() endforeach() - # Debug: Show the raw output and extracted path - string(LENGTH "${MS_MODULE_PATH_RAW}" RAW_LENGTH) - message(STATUS "Raw Python output length: ${RAW_LENGTH}") - list(LENGTH OUTPUT_LINES NUM_LINES) - message(STATUS "Number of output lines: ${NUM_LINES}") - message(STATUS "Extracted MindSpore path: ${MS_MODULE_PATH}") - - # Validate the result - if(NOT PYTHON_RESULT EQUAL 0) - message(FATAL_ERROR "Failed to find MindSpore installation: ${PYTHON_ERROR}") - endif() - - if(NOT MS_MODULE_PATH MATCHES ".*mindspore.*") - message(FATAL_ERROR "Invalid MindSpore path detected: ${MS_MODULE_PATH}") - endif() - - if(NOT PYTHON_RESULT EQUAL 0) - message(FATAL_ERROR "Failed to find MindSpore installation. Please ensure MindSpore is installed or set MINDSPORE_PATH environment variable.") - endif() - - # Extract directory from MindSpore module path - get_filename_component(MS_PATH ${MS_MODULE_PATH} DIRECTORY) -endif() - -# ============================================================================= -# MindSpore Path Detection -# ============================================================================= - -if(NOT DEFINED MS_PATH) - message(FATAL_ERROR "MS_PATH is not defined. Make sure find_lib.cmake is included in the parent CMakeLists.txt") + message(STATUS "INTERNAL_KERNEL_INC_PATH: ${INTERNAL_KERNEL_INC_PATH}") +else() + # No compile-time MindSpore dependency - pure dynamic loading + message(STATUS "Using pure dynamic loading mode - no compile-time MindSpore dependency") endif() # ============================================================================= -# MindSpore Internal Kernels Path Detection -# ============================================================================= - -set(INTERNAL_KERNEL_ROOT_PATH "${MS_PATH}/lib/plugin/ascend/ms_kernels_internal/internal_kernel") -set(INTERNAL_KERNEL_INC_PATH "${INTERNAL_KERNEL_ROOT_PATH}" "${INTERNAL_KERNEL_ROOT_PATH}/include") - -# Check if paths exist -foreach(INCLUDE_PATH ${INTERNAL_KERNEL_INC_PATH}) - if(NOT EXISTS ${INCLUDE_PATH}) - message(WARNING "Include path does not exist: ${INCLUDE_PATH}") - message(WARNING "This may cause compilation errors if headers are needed") - endif() -endforeach() - -message(STATUS "INTERNAL_KERNEL_INC_PATH: ${INTERNAL_KERNEL_INC_PATH}") - -# ============================================================================= -# Library Detection +# Runtime Loading Configuration # ============================================================================= -set(INTERNAL_KERNEL_LIB_PATH "${MS_PATH}/lib/plugin/ascend") -message(STATUS "INTERNAL_KERNEL_LIB_PATH: ${INTERNAL_KERNEL_LIB_PATH}") +# Note: Library will be discovered at runtime using multiple strategies: +# 1. Standard system library paths (LD_LIBRARY_PATH) +# 2. Runtime MindSpore path detection via Python +# 3. Environment variables (MINDSPORE_PATH) +# 4. Common installation locations -# Check for mindspore_internal_kernels library -find_library(MINDSPORE_INTERNAL_KERNELS_LIB - NAMES mindspore_internal_kernels - PATHS ${INTERNAL_KERNEL_LIB_PATH} - NO_DEFAULT_PATH -) +# Make variables available to parent scope +set(INTERNAL_KERNEL_LIB_PATH "" PARENT_SCOPE) # Empty - runtime detection +set(MINDSPORE_INTERNAL_KERNELS_LIB "" PARENT_SCOPE) +set(INTERNAL_KERNEL_INC_PATH "${INTERNAL_KERNEL_INC_PATH}" PARENT_SCOPE) -if(NOT EXISTS ${MINDSPORE_INTERNAL_KERNELS_LIB}) - message(FATAL_ERROR "Internal kernel library path does not exist: ${MINDSPORE_INTERNAL_KERNELS_LIB}") -endif() - -set(MINDSPORE_INTERNAL_KERNELS_LIB mindspore_internal_kernels) - -if(MINDSPORE_INTERNAL_KERNELS_LIB) - message(STATUS "Found mindspore_internal_kernels library: ${MINDSPORE_INTERNAL_KERNELS_LIB}") - set(MINDSPORE_INTERNAL_KERNELS_LIB "mindspore_internal_kernels" PARENT_SCOPE) -endif() +message(STATUS "Dynamic loading configured - library will be found at runtime") diff --git a/python/ms_custom_ops/__init__.py b/python/ms_custom_ops/__init__.py index b4094d88d..088104c8c 100644 --- a/python/ms_custom_ops/__init__.py +++ b/python/ms_custom_ops/__init__.py @@ -17,9 +17,15 @@ def _init_env(): if os.getenv("ASDOPS_LOG_TO_STDOUT") is None: os.environ["ASDOPS_LOG_TO_STDOUT"] = "1" - ms_path = os.path.dirname(os.path.abspath(mindspore.__file__)) - internal_lib_path = os.path.join(ms_path, "lib", "plugin", "ascend", "libmindspore_internal_kernels.so") - ctypes.CDLL(internal_lib_path) + # 移除直接加载内部库,改为延迟加载 + # C++层的动态加载器会在首次使用时自动加载库 + +def _get_load_status(): + """获取动态加载状态,用于测试延迟初始化是否工作""" + return { + "loaded": False, # Python层不再主动加载,由C++层在首次使用时加载 + "message": "Using dynamic loading - library will be loaded on first use" + } _init_env() @@ -36,7 +42,7 @@ except ImportError: pass # Generated files may not exist during development # Expose generated interfaces -__all__ = [] +__all__ = ["_get_load_status"] # Add ops from gen_ops_def if available try: -- Gitee From 4b494dcad7ecb423c1329e0e50f64055508dfc6c Mon Sep 17 00:00:00 2001 From: mengyuanli Date: Tue, 9 Sep 2025 21:51:16 +0800 Subject: [PATCH 2/3] use factory fit --- .commit_id | 44 +++ ccsrc/CMakeLists.txt | 25 +- ccsrc/adapter/CMakeLists.txt | 54 ++++ ccsrc/adapter/internal_adapter_factory.cc | 273 +++++++++++++++++ ccsrc/adapter/internal_adapter_factory.h | 181 +++++++++++ .../README_dynamic_loading.md | 188 ------------ .../ms_kernels_internal/adapter_loader.cc | 286 ++++++++++++++++++ .../base/ms_kernels_internal/adapter_loader.h | 142 +++++++++ .../ms_kernels_internal/dynamic_op_helper.cc | 212 ------------- .../ms_kernels_internal/dynamic_op_helper.h | 67 +--- .../graphmode/internal_kernel_mod.cc | 40 +-- .../graphmode/internal_kernel_mod.h | 16 +- .../ms_kernels_internal/internal_helper.cc | 58 ++-- .../ms_kernels_internal/internal_helper.h | 8 +- .../internal_kernels_loader.cc | 278 ----------------- .../internal_kernels_loader.h | 155 ---------- .../internal_tiling_cache.h | 4 +- .../pyboost/internal_pyboost_runner.cc | 44 ++- .../pyboost/internal_pyboost_runner.h | 38 +-- .../apply_rotary_pos_emb.cc | 16 +- .../ops/ms_kernels_internal/mla/mla_graph.cc | 20 +- .../ms_kernels_internal/mla/mla_pynative.cc | 14 +- .../mla_preprocess/mla_preprocess_common.h | 16 +- .../mla_preprocess/mla_preprocess_graph.cc | 6 +- .../mla_preprocess/mla_preprocess_pynative.cc | 6 +- .../moe_gating_group_topk.cc | 16 +- .../paged_cache_load_common.h | 14 +- .../paged_cache_load_graph.cc | 6 +- .../paged_cache_load_pynative.cc | 6 +- .../reshape_and_cache/reshape_and_cache.cc | 40 +-- .../ms_kernels_internal/ring_mla/ring_mla.cc | 30 +- .../ms_kernels_internal/ring_mla/ring_mla.h | 8 +- .../ring_mla/ring_mla_runner.cc | 14 +- .../ring_mla/ring_mla_runner.h | 6 +- .../trans_data/trans_data.cc | 30 +- .../type_cast/type_cast.cc | 4 +- cmake/find_ms_internal_kernels_lib.cmake | 29 +- 37 files changed, 1264 insertions(+), 1130 deletions(-) create mode 100644 ccsrc/adapter/CMakeLists.txt create mode 100644 ccsrc/adapter/internal_adapter_factory.cc create mode 100644 ccsrc/adapter/internal_adapter_factory.h delete mode 100644 ccsrc/base/ms_kernels_internal/README_dynamic_loading.md create mode 100644 ccsrc/base/ms_kernels_internal/adapter_loader.cc create mode 100644 ccsrc/base/ms_kernels_internal/adapter_loader.h delete mode 100644 ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc delete mode 100644 ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc delete mode 100644 ccsrc/base/ms_kernels_internal/internal_kernels_loader.h diff --git a/.commit_id b/.commit_id index e69de29bb..81ea18bf2 100644 --- a/.commit_id +++ b/.commit_id @@ -0,0 +1,44 @@ +rt_soc +commit f79ddc28 +Author: mengyuanli +Date: Mon Sep 8 14:38:51 2025 +0800 + + dlopen ms_internal_kernels(compile success) + + use multy load + + not link and find ms_kernels_internal + + remove internal_kernels + + operator api fix + + DynamicInternalOp + + fix internal::DeviceAddressPtr to OutputsAddrList + + op_adapter + + createfunc return internal::InternalOpPtr + + go back to DynamicInternalOp, not use internal::InternalOpPtr + + Tiling bug fix + + add log for find ms lib path + + fix import bug + + add log + + use CreateKernel not use CreateInternalOp + + call cpp api; mapping cpp & c api + + replace c wrapper to call internal::InternalOp + + use c wrapper ---and --- call cpp api + + load so when init@graph setup@pynative + + fix tiling and workspace diff --git a/ccsrc/CMakeLists.txt b/ccsrc/CMakeLists.txt index ab5be02d8..0fe5eac94 100644 --- a/ccsrc/CMakeLists.txt +++ b/ccsrc/CMakeLists.txt @@ -28,15 +28,28 @@ else() message(WARNING "INTERNAL_KERNEL_LIB_PATH is not available for subdirectories") endif() +# Debug: Show include paths +if(DEFINED INTERNAL_KERNEL_INC_PATH) + message(STATUS "INTERNAL_KERNEL_INC_PATH is available for subdirectories: ${INTERNAL_KERNEL_INC_PATH}") +else() + message(WARNING "INTERNAL_KERNEL_INC_PATH is not defined") +endif() + +# Set library and source variables for main library +# Main library uses adapter pattern (no direct internal kernel linking) +set(LIB_DIR "") +set(LIBS "dl") # Only link dynamic loading library +set(MS_INC "${MS_PATH}/include") +message(STATUS "MS_INC set to: ${MS_INC}") + add_subdirectory(base) add_subdirectory(ops) +add_subdirectory(adapter) -# Set library and source variables -# Note: We are now using dynamic loading instead of static linking -set(LIB_DIR "") -set(LIBS "") set(SRC_FILES ${BASE_SRC_FILES} ${OPS_SRC_FILES}) -set(INCLUDE_DIRS ${BASE_INCLUDE_DIRS} ${INTERNAL_KERNEL_INC_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/.. "${MS_PATH}/include") +set(INCLUDE_DIRS ${BASE_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/.. ${MS_INC} ${INTERNAL_KERNEL_INC_PATH}) + +# Note: INTERNAL_KERNEL_INC_PATH is excluded from main library to avoid compile-time dependencies # ============================================================================= # Debug Output and Validation @@ -112,7 +125,7 @@ ms.ops.CustomOpBuilder( op_doc=${DOC_YAML_STRING}, backend='Ascend', cflags='${CFLAGS_INCLUDES}', - ldflags='', + ldflags='-ldl', build_dir='${BUILD_EXTENSION_DIR}', debug_mode=${ENABLE_DEBUG} ).build() diff --git a/ccsrc/adapter/CMakeLists.txt b/ccsrc/adapter/CMakeLists.txt new file mode 100644 index 000000000..82b5a6d91 --- /dev/null +++ b/ccsrc/adapter/CMakeLists.txt @@ -0,0 +1,54 @@ +# Adapter layer CMakeLists.txt - bridges to internal kernels +cmake_minimum_required(VERSION 3.16) + +# This adapter layer is built as a separate shared library +# that compile-time links to internal kernels + +if(DEFINED INTERNAL_KERNEL_LIB_PATH AND INTERNAL_KERNEL_LIB_PATH) + message(STATUS "Building adapter layer with internal kernel path: ${INTERNAL_KERNEL_LIB_PATH}") + message("-------------") + message("MS_INC: ${MS_INC}") + message("INTERNAL_KERNEL_INC_PATH: ${INTERNAL_KERNEL_INC_PATH}") + message("-------------") + + # Set up include directories + set(ADAPTER_INCLUDE_DIRS + ${CMAKE_CURRENT_SOURCE_DIR} + ${MS_INC} + ${MS_INC}/.. + ${INTERNAL_KERNEL_INC_PATH} + ${CMAKE_CURRENT_SOURCE_DIR}/.. + ) + + # Set up library directories + set(ADAPTER_LIB_DIRS ${INTERNAL_KERNEL_LIB_PATH}) + + # Adapter source files + file(GLOB ADAPTER_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/*.cc" + ) + + # Create the adapter shared library + add_library(ms_custom_ops_adapter SHARED ${ADAPTER_SOURCES}) + + # Set include directories + target_include_directories(ms_custom_ops_adapter PRIVATE ${ADAPTER_INCLUDE_DIRS}) + + # Link to internal kernels library (compile-time linking) + target_link_directories(ms_custom_ops_adapter PRIVATE ${ADAPTER_LIB_DIRS}) + target_link_libraries(ms_custom_ops_adapter PRIVATE ms_kernels_internal) + + # Set RPATH to ensure runtime library discovery + set_target_properties(ms_custom_ops_adapter PROPERTIES + INSTALL_RPATH "${INTERNAL_KERNEL_LIB_PATH}" + BUILD_WITH_INSTALL_RPATH TRUE + ) + + message(STATUS "Adapter layer configured successfully") + message(STATUS "ADAPTER_SOURCES: ${ADAPTER_SOURCES}") + message(STATUS "ADAPTER_INCLUDE_DIRS: ${ADAPTER_INCLUDE_DIRS}") + message(STATUS "ADAPTER_LIB_DIRS: ${ADAPTER_LIB_DIRS}") + +else() + message(FATAL_ERROR "INTERNAL_KERNEL_LIB_PATH not defined - adapter layer will not be built") +endif() \ No newline at end of file diff --git a/ccsrc/adapter/internal_adapter_factory.cc b/ccsrc/adapter/internal_adapter_factory.cc new file mode 100644 index 000000000..a9035f54f --- /dev/null +++ b/ccsrc/adapter/internal_adapter_factory.cc @@ -0,0 +1,273 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "internal_adapter_factory.h" +#include +#include + +namespace ms_custom_ops { + +// InternalOpFactory Implementation +InternalOpFactory& InternalOpFactory::GetInstance() { + static InternalOpFactory instance; + return instance; +} + +void InternalOpFactory::RegisterOpCreator(const std::string& op_name, OpCreator creator) { + std::lock_guard lock(mutex_); + op_creators_[op_name] = creator; + std::cout << "[Adapter] Registered operation: " << op_name << std::endl; +} + +std::shared_ptr InternalOpFactory::CreateOp( + const std::string &op_name, const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const void *param, const std::string &kernel_name) { + std::lock_guard lock(mutex_); + auto it = op_creators_.find(op_name); + if (it == op_creators_.end()) { + std::cerr << "[Adapter] Operation not registered: " << op_name << std::endl; + return nullptr; + } + + try { + return it->second(inputs, outputs, param, kernel_name); + } catch (const std::exception &e) { + std::cerr << "[Adapter] Failed to create operation " << op_name << ": " << e.what() << std::endl; + return nullptr; + } +} + +bool InternalOpFactory::IsRegistered(const std::string& op_name) const { + std::lock_guard lock(mutex_); + return op_creators_.find(op_name) != op_creators_.end(); +} + +void InternalOpFactory::Initialize() { + std::lock_guard lock(mutex_); + if (initialized_) { + return; + } + + RegisterAllOps(); + initialized_ = true; + std::cout << "[Adapter] Factory initialized with " << op_creators_.size() << " operations" << std::endl; +} + +void InternalOpFactory::RegisterAllOps() { + // Register ApplyRotaryPosEmb operation + RegisterOpCreator( + "CreateApplyRotaryPosEmbOp", + [](const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const void *param, + const std::string &kernel_name) -> std::shared_ptr { + const auto *typed_param = static_cast(param); + return mindspore::internal::CreateApplyRotaryPosEmbOp(inputs, outputs, *typed_param, kernel_name); + }); + + // Register MlaPreprocess operation + RegisterOpCreator("CreateMlaPreprocessOp", + [](const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const void *param, + const std::string &kernel_name) -> std::shared_ptr { + const auto *typed_param = static_cast(param); + return mindspore::internal::CreateMlaPreprocessOp(inputs, outputs, *typed_param, kernel_name); + }); + + // TODO: Add more operations as needed +} + +// OpHandleManager Implementation +OpHandleManager& OpHandleManager::GetInstance() { + static OpHandleManager instance; + return instance; +} + +void *OpHandleManager::RegisterOp(std::shared_ptr op) { + std::lock_guard lock(mutex_); + void *handle = reinterpret_cast(next_handle_id_++); + op_registry_[handle] = op; + return handle; +} + +std::shared_ptr OpHandleManager::GetOp(void *handle) { + std::lock_guard lock(mutex_); + auto it = op_registry_.find(handle); + return (it != op_registry_.end()) ? it->second : nullptr; +} + +void OpHandleManager::UnregisterOp(void* handle) { + std::lock_guard lock(mutex_); + op_registry_.erase(handle); +} + +} // namespace ms_custom_ops + +// C Interface Implementation +extern "C" { + +int InitInternalAdapter() { + try { + ms_custom_ops::InternalOpFactory::GetInstance().Initialize(); + std::cout << "[Adapter] Internal adapter initialized successfully" << std::endl; + return 0; + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to initialize adapter: " << e.what() << std::endl; + return -1; + } +} + +void* CreateInternalOp( + const char* op_name, + const void* inputs_data, int inputs_size, + const void* outputs_data, int outputs_size, + const void* param_data, int param_size, + const char* kernel_name) { + + if (!op_name || !kernel_name) { + std::cerr << "[Adapter] Invalid parameters: op_name or kernel_name is null" << std::endl; + return nullptr; + } + + try { + // For now, create empty input/output lists + // In a real implementation, you would deserialize inputs_data and outputs_data + mindspore::internal::InputsImmutableInfoList inputs; + mindspore::internal::OutputsImmutableInfoList outputs; + + auto op = ms_custom_ops::InternalOpFactory::GetInstance().CreateOp( + op_name, inputs, outputs, param_data, kernel_name); + + if (!op) { + return nullptr; + } + + return ms_custom_ops::OpHandleManager::GetInstance().RegisterOp(op); + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to create operation " << op_name << ": " << e.what() << std::endl; + return nullptr; + } +} + +int InitOp(void* handle) { + if (!handle) { + return -1; + } + + try { + auto op = ms_custom_ops::OpHandleManager::GetInstance().GetOp(handle); + if (!op) { + return -1; + } + + auto status = op->Init(); + return (status == mindspore::internal::kInternalOk) ? 0 : -1; + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to initialize operation: " << e.what() << std::endl; + return -1; + } +} + +int UpdateOpParam(void* handle, const void* param) { + if (!handle || !param) { + return -1; + } + + try { + auto op = ms_custom_ops::OpHandleManager::GetInstance().GetOp(handle); + if (!op) { + return -1; + } + + auto status = op->UpdateParam(param); + return (status == mindspore::internal::kInternalOk) ? 0 : -1; + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to update parameters: " << e.what() << std::endl; + return -1; + } +} + +int LaunchOp(void* handle, + const void** inputs, int input_count, + const void** outputs, int output_count, + const void** workspace, int workspace_count, + void* stream_ptr, const char* kernel_name) { + + if (!handle || !inputs || !outputs || !kernel_name) { + return -1; + } + + try { + auto op = ms_custom_ops::OpHandleManager::GetInstance().GetOp(handle); + if (!op) { + return -1; + } + + // Convert C arrays to vectors + mindspore::internal::InputsAddrList input_addrs; + for (int i = 0; i < input_count; ++i) { + input_addrs.push_back(const_cast(inputs[i])); + } + + mindspore::internal::OutputsAddrList output_addrs; + for (int i = 0; i < output_count; ++i) { + output_addrs.push_back(const_cast(outputs[i])); + } + + mindspore::internal::WsAddrList workspace_addrs; + for (int i = 0; i < workspace_count; ++i) { + workspace_addrs.push_back(const_cast(workspace[i])); + } + + auto status = op->Launch(input_addrs, output_addrs, workspace_addrs, stream_ptr, kernel_name); + return (status == mindspore::internal::kInternalOk) ? 0 : -1; + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to launch operation: " << e.what() << std::endl; + return -1; + } +} + +int GetWorkspaceSizes(void* handle, size_t* sizes, int max_count) { + if (!handle || !sizes || max_count <= 0) { + return -1; + } + + try { + auto op = ms_custom_ops::OpHandleManager::GetInstance().GetOp(handle); + if (!op) { + return -1; + } + + auto workspace_sizes = op->GetWorkspaceSize(); + int count = std::min(static_cast(workspace_sizes.size()), max_count); + + for (int i = 0; i < count; ++i) { + sizes[i] = workspace_sizes[i]; + } + + return count; + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to get workspace sizes: " << e.what() << std::endl; + return -1; + } +} + +void ReleaseOp(void* handle) { + if (handle) { + ms_custom_ops::OpHandleManager::GetInstance().UnregisterOp(handle); + } +} + +} // extern "C" \ No newline at end of file diff --git a/ccsrc/adapter/internal_adapter_factory.h b/ccsrc/adapter/internal_adapter_factory.h new file mode 100644 index 000000000..3b4a60755 --- /dev/null +++ b/ccsrc/adapter/internal_adapter_factory.h @@ -0,0 +1,181 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MS_CUSTOM_OPS_INTERNAL_ADAPTER_FACTORY_H_ +#define MS_CUSTOM_OPS_INTERNAL_ADAPTER_FACTORY_H_ + +#include +#include +#include +#include +#include + +// Safe to include internal headers in adapter layer +#include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" +#include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal_op.h" + +namespace ms_custom_ops { + +/** + * @brief Factory for creating internal operations using MindSpore's factory pattern + */ +class InternalOpFactory { +public: + using OpCreator = std::function( + const mindspore::internal::InputsImmutableInfoList &, const mindspore::internal::OutputsImmutableInfoList &, + const void *, const std::string &)>; + + static InternalOpFactory &GetInstance(); + + // Register operation creator + void RegisterOpCreator(const std::string &op_name, OpCreator creator); + + // Create operation instance + std::shared_ptr CreateOp(const std::string &op_name, + const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, + const void *param, const std::string &kernel_name); + + // Check if operation is registered + bool IsRegistered(const std::string &op_name) const; + + // Initialize all internal operations + void Initialize(); + +private: + InternalOpFactory() = default; + ~InternalOpFactory() = default; + + // Non-copyable + InternalOpFactory(const InternalOpFactory&) = delete; + InternalOpFactory& operator=(const InternalOpFactory&) = delete; + + void RegisterAllOps(); + + mutable std::mutex mutex_; + bool initialized_ = false; + std::unordered_map op_creators_; +}; + +/** + * @brief Operation handle manager for C interface + */ +class OpHandleManager { +public: + static OpHandleManager& GetInstance(); + + // Register operation instance and return handle + void *RegisterOp(std::shared_ptr op); + + // Get operation instance from handle + std::shared_ptr GetOp(void *handle); + + // Unregister operation handle + void UnregisterOp(void* handle); + +private: + OpHandleManager() = default; + ~OpHandleManager() = default; + + OpHandleManager(const OpHandleManager&) = delete; + OpHandleManager& operator=(const OpHandleManager&) = delete; + + std::mutex mutex_; + std::unordered_map> op_registry_; + uint64_t next_handle_id_ = 1; +}; + +} // namespace ms_custom_ops + +// C interface for dynamic loading +extern "C" { + +/** + * @brief Initialize the adapter factory + * @return 0 on success, -1 on failure + */ +int InitInternalAdapter(); + +/** + * @brief Create internal operation + * @param op_name Operation name + * @param inputs_data Serialized input info + * @param inputs_size Size of input data + * @param outputs_data Serialized output info + * @param outputs_size Size of output data + * @param param_data Parameter data + * @param param_size Parameter size + * @param kernel_name Kernel name + * @return Operation handle or nullptr on failure + */ +void* CreateInternalOp( + const char* op_name, + const void* inputs_data, int inputs_size, + const void* outputs_data, int outputs_size, + const void* param_data, int param_size, + const char* kernel_name); + +/** + * @brief Initialize operation + * @param handle Operation handle + * @return 0 on success, -1 on failure + */ +int InitOp(void* handle); + +/** + * @brief Update operation parameters + * @param handle Operation handle + * @param param Parameter data + * @return 0 on success, -1 on failure + */ +int UpdateOpParam(void* handle, const void* param); + +/** + * @brief Launch operation + * @param handle Operation handle + * @param inputs Input addresses + * @param input_count Number of inputs + * @param outputs Output addresses + * @param output_count Number of outputs + * @param workspace Workspace addresses + * @param workspace_count Number of workspaces + * @param stream_ptr Stream pointer + * @param kernel_name Kernel name + * @return 0 on success, -1 on failure + */ +int LaunchOp(void* handle, + const void** inputs, int input_count, + const void** outputs, int output_count, + const void** workspace, int workspace_count, + void* stream_ptr, const char* kernel_name); + +/** + * @brief Get workspace sizes + * @param handle Operation handle + * @param sizes Output array for sizes + * @param max_count Maximum number of sizes + * @return Number of sizes returned + */ +int GetWorkspaceSizes(void* handle, size_t* sizes, int max_count); + +/** + * @brief Release operation + * @param handle Operation handle + */ +void ReleaseOp(void* handle); + +} // extern "C" + +#endif // MS_CUSTOM_OPS_INTERNAL_ADAPTER_FACTORY_H_ \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/README_dynamic_loading.md b/ccsrc/base/ms_kernels_internal/README_dynamic_loading.md deleted file mode 100644 index 7846183ca..000000000 --- a/ccsrc/base/ms_kernels_internal/README_dynamic_loading.md +++ /dev/null @@ -1,188 +0,0 @@ -# ms_kernels_internal 动态加载功能 - -本文档介绍了如何使用dlopen动态加载ms_kernels_internal库,分别在图模式和Pyboost模式中进行调用。 - -## 概述 - -为了实现单例延迟初始化需求,我们实现了使用dlopen动态加载ms_kernels_internal库的功能。这种实现方式有以下优点: - -1. **延迟加载**:库只在需要时才被加载,减少了启动时间和内存占用 -2. **灵活性**:可以在运行时决定是否加载库,而不是在编译时静态链接 -3. **精简设计**:直接在原有类上修改,不引入新的基类,实现更加简洁 - -## 架构设计 - -### 核心组件 - -1. **InternalKernelsLoader**:负责使用dlopen动态加载ms_kernels_internal库,并获取所需的函数指针 -2. **InternalKernelMod**:修改后的图模式内核模块,直接使用动态加载功能 -3. **InternalPyboostRunner**:修改后的Pyboost模式运行器,直接使用动态加载功能 - -### 类图 - -``` -+------------------------+ +------------------------+ +-------------------------+ -| InternalKernelsLoader | | InternalKernelMod | | InternalPyboostRunner | -+------------------------+ +------------------------+ +-------------------------+ -| - library_handle_ | | - internal_op_ | | - internal_op_ | -| - is_loaded_ | | - internal_inputs_shape_| | - op_key_ | -+------------------------+ | - internal_outputs_shape| | - tiling_key_ | -| + Initialize() | +------------------------+ | - hash_map_ | -| + LoadLibrary() | | + Init() | +-------------------------+ -| + GetFunctionPointer() | | + Resize() | | + Setup() | -| + CreateInternalOp() | | + Launch() | | + GetOrCreateKernel() | -| + Calc...Hash() | +------------------------+ | + LaunchKernel() | -+------------------------+ +-------------------------+ - ^ ^ - | | - +----------------------------------------------------------------+ -``` - -## 使用方法 - -### 1. 初始化动态加载 - -在使用动态加载功能之前,需要先初始化InternalKernelsLoader: - -```cpp -#include "internal_kernels_loader.h" - -// 获取单例 -auto &loader = InternalKernelsLoader::GetInstance(); - -// 初始化动态加载 -if (!loader.Initialize()) { - MS_LOG(ERROR) << "Failed to initialize dynamic loader"; - return -1; -} -``` - -### 2. 图模式下的使用 - -使用修改后的InternalKernelMod类: - -```cpp -#include "internal_kernel_mod.h" - -// 创建内核模块 -auto kernel_mod = std::make_shared(); - -// 正常使用内核模块 -std::vector inputs = ...; -std::vector outputs = ...; - -// 初始化 -if (!kernel_mod->Init(inputs, outputs)) { - MS_LOG(ERROR) << "Failed to initialize kernel"; - return -1; -} - -// 调整大小 -if (kernel_mod->Resize(inputs, outputs) != KRET_OK) { - MS_LOG(ERROR) << "Failed to resize kernel"; - return -1; -} - -// 启动内核 -std::vector workspace = ...; -void *stream_ptr = ...; -if (!kernel_mod->Launch(inputs, workspace, outputs, stream_ptr)) { - MS_LOG(ERROR) << "Failed to launch kernel"; - return -1; -} -``` - -### 3. Pyboost模式下的使用 - -使用修改后的InternalPyboostRunner类: - -```cpp -#include "internal_pyboost_runner.h" - -// 创建运行器 -auto runner = std::make_shared(); - -// 设置操作名称和参数 -std::string op_name = "your_op_name"; -runner->Setup(op_name, args...); - -// 准备输入输出张量 -TensorList inputs = ...; -TensorList outputs = ...; - -// 获取或创建内核 -runner->GetOrCreateKernel(inputs, outputs); - -// 启动内核 -runner->LaunchKernel(); -``` - -## 示例代码 - -完整的示例代码请参考 `dynamic_loading_example.cc` 文件。 - -## 编译和运行 - -### 编译要求 - -1. 支持C++17标准的编译器 -2. 支持dlopen的操作系统(如Linux) -3. ms_kernels_internal库的动态链接库文件(libmindspore_internal_kernels.so) - -### 编译步骤 - -1. 确保ms_kernels_internal库的动态链接库文件在系统路径中,或设置LD_LIBRARY_PATH环境变量 - -```bash -export LD_LIBRARY_PATH=/path/to/ms_kernels_internal/lib:$LD_LIBRARY_PATH -``` - -2. 编译示例代码 - -```bash -cd /home/lmy/custom_op/akg/ccsrc/base/ms_kernels_internal -g++ -std=c++17 -I./include -L./lib -o dynamic_loading_example dynamic_loading_example.cc -ldl -lmindspore_internal_kernels -``` - -3. 运行示例 - -```bash -./dynamic_loading_example -``` - -## 注意事项 - -1. **库路径**:确保ms_kernels_internal库的动态链接库文件在系统路径中,或通过LD_LIBRARY_PATH环境变量指定 -2. **线程安全**:SimpleDynamicLoader是线程安全的,可以在多线程环境中使用 -3. **错误处理**:在使用动态加载功能时,务必检查返回值 -4. **资源释放**:动态加载的库会在程序退出时自动释放 - -## 故障排除 - -### 常见错误 - -1. "Failed to load library: ..." - - 检查库文件是否存在 - - 检查LD_LIBRARY_PATH环境变量是否正确设置 - - 检查库文件的权限 - -2. "Failed to get function pointer: ..." - - 检查库文件版本是否正确 - - 检查函数名称是否正确 - -3. "Function call failed: ..." - - 检查函数参数是否正确 - - 检查库文件是否正确初始化 - -### 调试方法 - -1. 检查库加载状态 - -```cpp -auto &loader = ms_custom_ops::InternalKernelsLoader::GetInstance(); -MS_LOG(INFO) << "Library loaded: " << loader.IsInitialized(); -``` - -## 总结 - -通过使用dlopen动态加载ms_kernels_internal库,我们实现了单例延迟初始化需求,提供了更灵活的使用方式。这种实现方式直接在原有类上修改,不引入新的基类,实现更加简洁,符合精简版的设计要求。 \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/adapter_loader.cc b/ccsrc/base/ms_kernels_internal/adapter_loader.cc new file mode 100644 index 000000000..56491bd26 --- /dev/null +++ b/ccsrc/base/ms_kernels_internal/adapter_loader.cc @@ -0,0 +1,286 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "adapter_loader.h" +#include +#include +#include +#include "mindspore/core/include/utils/log_adapter.h" + +namespace ms_custom_ops { + +AdapterLoader& AdapterLoader::GetInstance() { + static AdapterLoader instance; + return instance; +} + +AdapterLoader::~AdapterLoader() { + Cleanup(); +} + +bool AdapterLoader::Initialize() { + std::lock_guard lock(mutex_); + + if (loaded_) { + return true; + } + + MS_LOG(INFO) << "Initializing adapter loader with factory pattern"; + + if (!LoadAdapter()) { + MS_LOG(ERROR) << "Failed to load adapter library"; + return false; + } + + // Initialize the adapter factory + if (init_adapter_func_ && init_adapter_func_() != 0) { + MS_LOG(ERROR) << "Failed to initialize adapter factory"; + return false; + } + + loaded_ = true; + MS_LOG(INFO) << "Adapter loader initialized successfully"; + return true; +} + +bool AdapterLoader::LoadAdapter() { + // Find adapter library + std::vector search_paths = { + "./ms_custom_ops_adapter.so", + "../build/ms_custom_ops_adapter.so", + std::string(std::getenv("HOME") ? std::getenv("HOME") : ".") + "/.local/lib/ms_custom_ops_adapter.so" + }; + + std::string adapter_path; + for (const auto& path : search_paths) { + struct stat buffer; + if (stat(path.c_str(), &buffer) == 0) { + adapter_path = path; + break; + } + } + + if (adapter_path.empty()) { + MS_LOG(ERROR) << "Cannot find ms_custom_ops_adapter.so"; + return false; + } + + MS_LOG(INFO) << "Loading adapter from: " << adapter_path; + + // Load adapter library + adapter_handle_ = dlopen(adapter_path.c_str(), RTLD_LAZY); + if (!adapter_handle_) { + MS_LOG(ERROR) << "Failed to load adapter library: " << dlerror(); + return false; + } + + // Load function symbols + init_adapter_func_ = reinterpret_cast( + dlsym(adapter_handle_, "InitInternalAdapter")); + create_op_func_ = reinterpret_cast( + dlsym(adapter_handle_, "CreateInternalOp")); + init_op_func_ = reinterpret_cast( + dlsym(adapter_handle_, "InitOp")); + update_param_func_ = reinterpret_cast( + dlsym(adapter_handle_, "UpdateOpParam")); + launch_op_func_ = reinterpret_cast( + dlsym(adapter_handle_, "LaunchOp")); + get_workspace_sizes_func_ = reinterpret_cast( + dlsym(adapter_handle_, "GetWorkspaceSizes")); + get_tiling_size_func_ = reinterpret_cast( + dlsym(adapter_handle_, "GetTilingSize")); + tiling_func_ = reinterpret_cast( + dlsym(adapter_handle_, "Tiling")); + set_tiling_info_func_ = reinterpret_cast( + dlsym(adapter_handle_, "SetTilingInfo")); + release_op_func_ = reinterpret_cast( + dlsym(adapter_handle_, "ReleaseOp")); + + // Check if all functions are loaded + if (!init_adapter_func_ || !create_op_func_ || !init_op_func_ || + !update_param_func_ || !launch_op_func_ || !get_workspace_sizes_func_ || + !get_tiling_size_func_ || !tiling_func_ || !set_tiling_info_func_ || !release_op_func_) { + MS_LOG(ERROR) << "Failed to load all adapter functions"; + return false; + } + + MS_LOG(INFO) << "Adapter library loaded successfully"; + return true; +} + +AdapterInternalOpPtr AdapterLoader::CreateOp( + const std::string& op_name, + const mindspore::internal::InputsImmutableInfoList& inputs, + const mindspore::internal::OutputsImmutableInfoList& outputs, + const void* param, + const std::string& kernel_name) { + + if (!Initialize()) { + MS_LOG(ERROR) << "Adapter not initialized"; + return nullptr; + } + + // Serialize inputs and outputs (simplified for now) + // In a real implementation, you would properly serialize the data structures + const void* inputs_data = nullptr; + int inputs_size = 0; + const void* outputs_data = nullptr; + int outputs_size = 0; + int param_size = 0; // This should be calculated based on the actual parameter type + + void* op_handle = create_op_func_( + op_name.c_str(), + inputs_data, inputs_size, + outputs_data, outputs_size, + param, param_size, + kernel_name.c_str()); + + if (!op_handle) { + MS_LOG(ERROR) << "Failed to create operation: " << op_name; + return nullptr; + } + + MS_LOG(INFO) << "Created operation: " << op_name; + return std::make_shared(op_handle, this); +} + +void AdapterLoader::Cleanup() { + std::lock_guard lock(mutex_); + + if (adapter_handle_) { + dlclose(adapter_handle_); + adapter_handle_ = nullptr; + } + + loaded_ = false; + + // Clear function pointers + init_adapter_func_ = nullptr; + create_op_func_ = nullptr; + init_op_func_ = nullptr; + update_param_func_ = nullptr; + launch_op_func_ = nullptr; + get_workspace_sizes_func_ = nullptr; + get_tiling_size_func_ = nullptr; + tiling_func_ = nullptr; + set_tiling_info_func_ = nullptr; + release_op_func_ = nullptr; +} + +// AdapterInternalOp Implementation +AdapterInternalOp::~AdapterInternalOp() { + if (op_handle_ && loader_ && loader_->release_op_func_) { + loader_->release_op_func_(op_handle_); + } +} + +mindspore::internal::InternalStatus AdapterInternalOp::Init() { + if (!loader_ || !loader_->init_op_func_) { + return static_cast(-1); + } + + int result = loader_->init_op_func_(op_handle_); + return (result == 0) ? mindspore::internal::kInternalOk : static_cast(-1); +} + +mindspore::internal::InternalStatus AdapterInternalOp::UpdateParam(const void* param) { + if (!loader_ || !loader_->update_param_func_) { + return static_cast(-1); + } + + int result = loader_->update_param_func_(op_handle_, param); + return (result == 0) ? mindspore::internal::kInternalOk : static_cast(-1); +} + +mindspore::internal::InternalStatus AdapterInternalOp::UpdateShape( + const mindspore::internal::ShapeInfoList& inputs_shape, + const mindspore::internal::ShapeInfoList& outputs_shape) { + // For now, return success as shape updates may not be required by all operations + // The underlying operation may handle dynamic shapes automatically + return mindspore::internal::kInternalOk; +} + +mindspore::internal::InternalStatus AdapterInternalOp::Launch( + const mindspore::internal::InputsAddrList& inputs, + const mindspore::internal::OutputsAddrList& outputs, + const mindspore::internal::WsAddrList& workspace, + void* stream_ptr, + const std::string& kernel_name) { + + if (!loader_ || !loader_->launch_op_func_) { + return static_cast(-1); + } + + // Convert vectors to C arrays + std::vector input_ptrs(inputs.begin(), inputs.end()); + std::vector output_ptrs(outputs.begin(), outputs.end()); + std::vector workspace_ptrs(workspace.begin(), workspace.end()); + + int result = loader_->launch_op_func_( + op_handle_, + input_ptrs.data(), static_cast(input_ptrs.size()), + output_ptrs.data(), static_cast(output_ptrs.size()), + workspace_ptrs.data(), static_cast(workspace_ptrs.size()), + stream_ptr, + kernel_name.c_str()); + + return (result == 0) ? mindspore::internal::kInternalOk : static_cast(-1); +} + +std::vector AdapterInternalOp::GetWorkspaceSize() { + if (!loader_ || !loader_->get_workspace_sizes_func_) { + return {}; + } + + constexpr int max_workspaces = 16; + size_t sizes[max_workspaces]; + + int count = loader_->get_workspace_sizes_func_(op_handle_, sizes, max_workspaces); + if (count <= 0) { + return {}; + } + + return std::vector(sizes, sizes + count); +} + +size_t AdapterInternalOp::GetTilingSize() { + if (!loader_ || !loader_->get_tiling_size_func_) { + return 0; + } + + return loader_->get_tiling_size_func_(op_handle_); +} + +mindspore::internal::InternalStatus AdapterInternalOp::Tiling(void *host_addr, mindspore::internal::HostRunInfoPtr *host_run_info_ptr) { + if (!loader_ || !loader_->tiling_func_) { + return static_cast(-1); + } + + int result = loader_->tiling_func_(op_handle_, host_addr, reinterpret_cast(host_run_info_ptr)); + return (result == 0) ? mindspore::internal::kInternalOk : static_cast(-1); +} + +void AdapterInternalOp::SetTilingInfo(const std::shared_ptr &tiling_info) { + if (!loader_ || !loader_->set_tiling_info_func_) { + MS_LOG(WARNING) << "SetTilingInfo function not available in adapter"; + return; + } + + // Convert TilingInfo to void pointer for the C interface + loader_->set_tiling_info_func_(op_handle_, static_cast(tiling_info.get())); +} + +} // namespace ms_custom_ops \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/adapter_loader.h b/ccsrc/base/ms_kernels_internal/adapter_loader.h new file mode 100644 index 000000000..9a880e228 --- /dev/null +++ b/ccsrc/base/ms_kernels_internal/adapter_loader.h @@ -0,0 +1,142 @@ +/** + * Copyright 2025 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MS_CUSTOM_OPS_ADAPTER_LOADER_H_ +#define MS_CUSTOM_OPS_ADAPTER_LOADER_H_ + +#include +#include +#include +#include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" +#include "mindspore/core/include/utils/log_adapter.h" + +namespace ms_custom_ops { + +// Forward declarations +class AdapterInternalOp; +using AdapterInternalOpPtr = std::shared_ptr; + +/** + * @brief Adapter loader using factory pattern instead of symbol mapping + * + * This class replaces the complex dynamic symbol resolution approach + * with MindSpore's proven factory pattern approach. + */ +class AdapterLoader { +public: + static AdapterLoader& GetInstance(); + + // Initialize adapter (lazy loading) + bool Initialize(); + + // Create operation using factory interface + AdapterInternalOpPtr CreateOp(const std::string &op_name, + const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const void *param, + const std::string &kernel_name); + + // Check if adapter is loaded + bool IsLoaded() const { return loaded_; } + + // Cleanup + void Cleanup(); + +private: + AdapterLoader() = default; + ~AdapterLoader(); + + // Non-copyable + AdapterLoader(const AdapterLoader&) = delete; + AdapterLoader& operator=(const AdapterLoader&) = delete; + + // Load adapter library + bool LoadAdapter(); + + // Function pointers for adapter interface + typedef int (*InitAdapterFunc)(); + typedef void* (*CreateOpFunc)(const char*, const void*, int, const void*, int, + const void*, int, const char*); + typedef int (*InitOpFunc)(void*); + typedef int (*UpdateParamFunc)(void*, const void*); + typedef int (*LaunchOpFunc)(void*, const void**, int, const void**, int, + const void**, int, void*, const char*); + typedef int (*GetWorkspaceSizesFunc)(void*, size_t*, int); + typedef size_t (*GetTilingSizeFunc)(void*); + typedef int (*TilingFunc)(void*, void*, void**); + typedef int (*SetTilingInfoFunc)(void*, void*); + typedef void (*ReleaseOpFunc)(void*); + + std::mutex mutex_; + bool loaded_ = false; + void* adapter_handle_ = nullptr; + + // Adapter function pointers + InitAdapterFunc init_adapter_func_ = nullptr; + CreateOpFunc create_op_func_ = nullptr; + InitOpFunc init_op_func_ = nullptr; + UpdateParamFunc update_param_func_ = nullptr; + LaunchOpFunc launch_op_func_ = nullptr; + GetWorkspaceSizesFunc get_workspace_sizes_func_ = nullptr; + GetTilingSizeFunc get_tiling_size_func_ = nullptr; + TilingFunc tiling_func_ = nullptr; + SetTilingInfoFunc set_tiling_info_func_ = nullptr; + ReleaseOpFunc release_op_func_ = nullptr; + + friend class AdapterInternalOp; +}; + +/** + * @brief Wrapper for internal operations using adapter interface + */ +class AdapterInternalOp { +public: + AdapterInternalOp(void* op_handle, AdapterLoader* loader) + : op_handle_(op_handle), loader_(loader) {} + + ~AdapterInternalOp(); + + // Operation interface + mindspore::internal::InternalStatus Init(); + mindspore::internal::InternalStatus UpdateParam(const void *param); + mindspore::internal::InternalStatus UpdateShape(const mindspore::internal::ShapeInfoList &inputs_shape, + const mindspore::internal::ShapeInfoList &outputs_shape); + mindspore::internal::InternalStatus Launch(const mindspore::internal::InputsAddrList &inputs, + const mindspore::internal::OutputsAddrList &outputs, + const mindspore::internal::WsAddrList &workspace, void *stream_ptr, + const std::string &kernel_name); + std::vector GetWorkspaceSize(); + size_t GetTilingSize(); + mindspore::internal::InternalStatus Tiling(void *host_addr, mindspore::internal::HostRunInfoPtr *host_run_info_ptr); + void SetTilingInfo(const std::shared_ptr &tiling_info); + +private: + void* op_handle_; + AdapterLoader* loader_; +}; + +} // namespace ms_custom_ops + +// Convenience macros for operation creation +#define CALL_ADAPTER_OP(FUNC_NAME, PARAM_TYPE, inputs_param, outputs_param, param_obj, op_name) \ + ({ \ + auto& loader = ms_custom_ops::AdapterLoader::GetInstance(); \ + if (!loader.Initialize()) { \ + MS_LOG(EXCEPTION) << "Failed to initialize adapter loader"; \ + } \ + loader.CreateOp(#FUNC_NAME, inputs_param, outputs_param, \ + static_cast(¶m_obj), op_name); \ + }) + +#endif // MS_CUSTOM_OPS_ADAPTER_LOADER_H_ \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc b/ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc deleted file mode 100644 index bea778b82..000000000 --- a/ccsrc/base/ms_kernels_internal/dynamic_op_helper.cc +++ /dev/null @@ -1,212 +0,0 @@ -/** - * Copyright 2025 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "dynamic_op_helper.h" -#include - -namespace ms_custom_ops { - -// DynamicInternalOp implementation - direct method calls to InternalOp smart pointer - -internal::InternalStatus DynamicInternalOp::Init() { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return internal::kInternalError; - } - - // 使用mangled符号直接调用C++方法,避免编译时依赖 - auto& loader = InternalKernelsLoader::GetInstance(); - typedef internal::InternalStatus (*InitFunc)(void*); - std::string mangled_name = "_ZN9mindspore8internal10InternalOp4InitEv"; - auto init_func = loader.GetOpCreateFunction(mangled_name); - - if (!init_func) { - MS_LOG(ERROR) << "Failed to load InternalOp::Init function via mangled symbol: " << mangled_name; - return internal::kInternalError; - } - - return init_func(op_ptr_.get()); // 传递裸指针给C++方法 -} - -internal::InternalStatus DynamicInternalOp::UpdateParam(const void* param) { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return internal::kInternalError; - } - - // 使用mangled符号直接调用C++方法 - auto& loader = InternalKernelsLoader::GetInstance(); - typedef internal::InternalStatus (*UpdateParamFunc)(void*, const void*); - std::string mangled_name = "_ZN9mindspore8internal10InternalOp11UpdateParamEPKv"; - auto update_func = loader.GetOpCreateFunction(mangled_name); - - if (!update_func) { - MS_LOG(ERROR) << "Failed to load InternalOp::UpdateParam function via mangled symbol: " << mangled_name; - return internal::kInternalError; - } - - return update_func(op_ptr_.get(), param); -} - -internal::InternalStatus DynamicInternalOp::UpdateShape(const internal::ShapeInfoList &inputs_shape, - const internal::ShapeInfoList &outputs_shape) { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return internal::kInternalError; - } - - // 使用mangled符号直接调用C++方法 - auto& loader = InternalKernelsLoader::GetInstance(); - typedef internal::InternalStatus (*UpdateShapeFunc)(void*, const std::vector>&, const std::vector>&); - std::string mangled_name = "_ZN9mindspore8internal10InternalOp11UpdateShapeERKSt6vectorIS2_IlSaIlEESaIS4_EES8_"; - auto update_shape_func = loader.GetOpCreateFunction(mangled_name); - - if (!update_shape_func) { - MS_LOG(ERROR) << "Failed to load InternalOp::UpdateShape function via mangled symbol: " << mangled_name; - return internal::kInternalError; - } - - return update_shape_func(op_ptr_.get(), inputs_shape, outputs_shape); -} - -size_t DynamicInternalOp::GetTilingSize() { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return 0; - } - - auto& loader = InternalKernelsLoader::GetInstance(); - typedef size_t (*GetTilingSizeFunc)(void*); - - // Try to find GetTilingSize method via mangled symbol - std::string mangled_name = "_ZN9mindspore8internal10InternalOp13GetTilingSizeEv"; - auto get_tiling_size_func = loader.GetOpCreateFunction(mangled_name); - - if (get_tiling_size_func) { - return get_tiling_size_func(op_ptr_.get()); - } - - MS_LOG(WARNING) << "GetTilingSize: mangled symbol not found, returning default size 1024"; - return 1024; // Return a reasonable default instead of 0 -} - -internal::InternalStatus DynamicInternalOp::Tiling(internal::RawHostAddr host_ptr, - internal::HostRunInfoPtr *run_info_ptr) { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return internal::kInternalError; - } - - // 使用mangled符号直接调用C++方法 - auto& loader = InternalKernelsLoader::GetInstance(); - typedef internal::InternalStatus (*TilingFunc)(void*, void*, internal::HostRunInfoPtr*); - std::string mangled_name = "_ZN9mindspore8internal10InternalOp6TilingEPvPSt10shared_ptrINS0_11HostRunInfoEE"; - auto tiling_func = loader.GetOpCreateFunction(mangled_name); - - if (!tiling_func) { - MS_LOG(ERROR) << "Failed to load InternalOp::Tiling function via mangled symbol: " << mangled_name; - return internal::kInternalError; - } - - return tiling_func(op_ptr_.get(), host_ptr, run_info_ptr); -} - -void DynamicInternalOp::SetTilingInfo(const internal::TilingInfoPtr& tiling_info) { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return; - } - - // 使用mangled符号直接调用C++方法 - auto& loader = InternalKernelsLoader::GetInstance(); - typedef void (*SetTilingInfoFunc)(void*, const internal::TilingInfoPtr&); - std::string mangled_name = "_ZN9mindspore8internal10InternalOp13SetTilingInfoERKSt10shared_ptrINS0_10TilingInfoEE"; - auto set_tiling_func = loader.GetOpCreateFunction(mangled_name); - - if (!set_tiling_func) { - MS_LOG(ERROR) << "Failed to load InternalOp::SetTilingInfo function via mangled symbol: " << mangled_name; - return; - } - - set_tiling_func(op_ptr_.get(), tiling_info); -} - -std::vector DynamicInternalOp::GetWorkspaceSize() { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return {}; - } - - auto& loader = InternalKernelsLoader::GetInstance(); - typedef std::vector (*GetWorkspaceSizeFunc)(void*); - - // Try to find GetWorkspaceSize method via mangled symbol - std::string mangled_name = "_ZN9mindspore8internal10InternalOp16GetWorkspaceSizeEv"; - auto get_workspace_size_func = loader.GetOpCreateFunction(mangled_name); - - if (get_workspace_size_func) { - return get_workspace_size_func(op_ptr_.get()); - } - - MS_LOG(WARNING) << "GetWorkspaceSize: mangled symbol not found, returning empty vector"; - return {}; -} - -internal::InternalStatus DynamicInternalOp::Launch(const internal::InputsAddrList& inputs, - const internal::OutputsAddrList& outputs, - const internal::WsAddrList& workspace, - void* stream_ptr, const std::string& kernel_name) { - if (!op_ptr_) { - MS_LOG(ERROR) << "InternalOp pointer is null"; - return internal::kInternalError; - } - - // 使用mangled符号直接调用C++方法 - auto& loader = InternalKernelsLoader::GetInstance(); - typedef internal::InternalStatus (*LaunchFunc)(void*, const std::vector&, const std::vector&, - const std::vector&, void*, const std::string&); - std::string mangled_name = "_ZN9mindspore8internal10InternalOp6LaunchERKSt6vectorIPvSaIS3_EES7_S7_S3_RKSs"; - auto launch_func = loader.GetOpCreateFunction(mangled_name); - - if (!launch_func) { - MS_LOG(ERROR) << "Failed to load InternalOp::Launch function via mangled symbol: " << mangled_name; - return internal::kInternalError; - } - - return launch_func(op_ptr_.get(), inputs, outputs, workspace, stream_ptr, kernel_name); -} - -std::string GetMangledSymbolName(const std::string& func_name) { - // Try to find symbol by pattern matching in the loaded library - auto& loader = InternalKernelsLoader::GetInstance(); - - // First try direct name lookup (in case library has extern "C" wrappers) - void* direct_symbol = loader.GetFunctionPointer(func_name); - if (direct_symbol) { - return func_name; - } - - // If direct lookup fails, search for mangled symbols containing the function name - std::string mangled_symbol = loader.FindMangledSymbol(func_name); - if (!mangled_symbol.empty()) { - return mangled_symbol; - } - - MS_LOG(WARNING) << "No symbol found for: " << func_name; - return func_name; -} - -} // namespace ms_custom_ops \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/dynamic_op_helper.h b/ccsrc/base/ms_kernels_internal/dynamic_op_helper.h index 7fbcc0e6e..f39b7a3ba 100644 --- a/ccsrc/base/ms_kernels_internal/dynamic_op_helper.h +++ b/ccsrc/base/ms_kernels_internal/dynamic_op_helper.h @@ -16,70 +16,27 @@ #ifndef MS_CUSTOM_OPS_DYNAMIC_OP_HELPER_H_ #define MS_CUSTOM_OPS_DYNAMIC_OP_HELPER_H_ -#include "internal_kernels_loader.h" -#include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" +#include "adapter_loader.h" #include "mindspore/core/include/utils/log_adapter.h" namespace ms_custom_ops { -// Forward declare for wrapper functions -class DynamicInternalOp; -using DynamicInternalOpPtr = std::shared_ptr; +// Use AdapterInternalOp from adapter_loader.h +using DynamicInternalOp = AdapterInternalOp; +using DynamicInternalOpPtr = AdapterInternalOpPtr; /** - * @brief Get mangled symbol name for C++ function - * @param func_name Function name (e.g., "CreateApplyRotaryPosEmbOp") - * @return Mangled symbol name + * @brief Helper macro for dynamic op creation using factory pattern + * This replaces the complex symbol resolution with simple factory calls */ -std::string GetMangledSymbolName(const std::string &func_name); - -/** - * @brief Dynamic wrapper for InternalOp that uses function pointers - */ -class DynamicInternalOp { -public: - DynamicInternalOp(std::shared_ptr op_ptr) : op_ptr_(op_ptr) {} - - // Method wrappers - will be implemented using dynamic function calls - internal::InternalStatus Init(); - internal::InternalStatus UpdateParam(const void* param); - internal::InternalStatus UpdateShape(const internal::ShapeInfoList &inputs_shape, - const internal::ShapeInfoList &outputs_shape); - size_t GetTilingSize(); - internal::InternalStatus Tiling(internal::RawHostAddr host_ptr, internal::HostRunInfoPtr *run_info_ptr); - void SetTilingInfo(const internal::TilingInfoPtr& tiling_info); - std::vector GetWorkspaceSize(); - internal::InternalStatus Launch(const internal::InputsAddrList& inputs, - const internal::OutputsAddrList& outputs, - const internal::WsAddrList& workspace, - void* stream_ptr, const std::string& kernel_name); - - - private: - std::shared_ptr op_ptr_; // Smart pointer to the real InternalOp -}; - -/** - * @brief Helper macro for dynamic op creation - returns DynamicInternalOpPtr - */ -#define CALL_DYNAMIC_OP_INTERNAL(FUNC_NAME, PARAM_TYPE, inputs_param, outputs_param, ...) \ +#define CALL_DYNAMIC_OP_INTERNAL(FUNC_NAME, PARAM_TYPE, inputs_param, outputs_param, param_obj, op_name) \ ({ \ - auto &loader = InternalKernelsLoader::GetInstance(); \ - typedef std::shared_ptr (*FUNC_NAME##_Type)( \ - const internal::InputsImmutableInfoList &, const internal::OutputsImmutableInfoList &, \ - const internal::PARAM_TYPE &, const std::string &); \ - std::string mangled_name = GetMangledSymbolName(#FUNC_NAME); \ - auto create_func = loader.GetOpCreateFunction(mangled_name); \ - DynamicInternalOpPtr result = nullptr; \ - if (!create_func) { \ - MS_LOG(EXCEPTION) << "Failed to load " << #FUNC_NAME << " function with mangled name: " << mangled_name; \ - } else { \ - auto op_ptr = create_func(inputs_param, outputs_param, __VA_ARGS__); \ - if (op_ptr) { \ - result = std::make_shared(op_ptr); \ - } \ + auto &loader = AdapterLoader::GetInstance(); \ + if (!loader.Initialize()) { \ + MS_LOG(EXCEPTION) << "Failed to initialize adapter loader"; \ } \ - result; \ + loader.CreateOp(#FUNC_NAME, inputs_param, outputs_param, \ + static_cast(¶m_obj), op_name); \ }) } // namespace ms_custom_ops diff --git a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc index 431332ac9..89a8bd32b 100644 --- a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc +++ b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.cc @@ -15,7 +15,7 @@ */ #include "internal_kernel_mod.h" -#include "internal_kernels_loader.h" +#include "../adapter_loader.h" #include #include #include "mindspore/core/include/utils/ms_context.h" @@ -28,11 +28,11 @@ SimpleSpinLock InternalKernelMod::lock_ = SimpleSpinLock(); bool InternalKernelMod::Init(const std::vector &inputs, const std::vector &outputs) { // Load internal kernels library early to initialize device environment - auto& loader = InternalKernelsLoader::GetInstance(); - if (!loader.IsInitialized()) { + auto &loader = AdapterLoader::GetInstance(); + if (!loader.IsLoaded()) { MS_LOG(INFO) << "Loading internal kernels library during InternalKernelMod::Init"; if (!loader.Initialize()) { - MS_LOG(ERROR) << "Failed to load internal kernels library in Init: " << loader.GetLastError(); + MS_LOG(ERROR) << "Failed to load internal kernels library in Init: "; return false; } } @@ -48,12 +48,12 @@ bool InternalKernelMod::Init(const std::vector &inputs, const st for (size_t i = 0; i < kernel_inputs_index_.size(); i++) { internal_inputs_addr_.emplace_back(nullptr); - internal_inputs_shape_.emplace_back(internal::ShapeInfo{0}); + internal_inputs_shape_.emplace_back(mindspore::internal::ShapeInfo{0}); } for (size_t i = 0; i < kernel_outputs_index_.size(); i++) { internal_outputs_addr_.emplace_back(nullptr); - internal_outputs_shape_.emplace_back(internal::ShapeInfo{0}); + internal_outputs_shape_.emplace_back(mindspore::internal::ShapeInfo{0}); } for (size_t i = 0; i < inputs.size(); i++) { @@ -166,16 +166,16 @@ void InternalKernelMod::GetOrGenerateTiling(const std::vector &i if (tiling_cache_item == nullptr) { auto tiling_size = internal_op_->GetTilingSize(); auto host_addr = TilingMemMgr::GetInstance().pool_host_.Malloc(tiling_size); - internal::HostRunInfoPtr host_run_info_ptr = nullptr; + mindspore::internal::HostRunInfoPtr host_run_info_ptr = nullptr; auto status = internal_op_->Tiling(host_addr, &host_run_info_ptr); - if (status != internal::kInternalOk || host_run_info_ptr == nullptr) { + if (status != mindspore::internal::kInternalOk || host_run_info_ptr == nullptr) { MS_LOG(EXCEPTION) << "Tiling error for " << kernel_name_ << ", status: " << status << ", host_run_info_ptr: " << host_run_info_ptr; } auto device_addr = TilingMemMgr::GetInstance().pool_device_.Malloc(tiling_size); TilingMemMgr::GetInstance().CopyAsync(host_addr, device_addr, tiling_size); - auto tiling_info = std::make_shared(device_addr, nullptr); + auto tiling_info = std::make_shared(device_addr, nullptr); internal_op_->SetTilingInfo(tiling_info); tiling_info->host_run_info_ = host_run_info_ptr; workspace_size_list_ = internal_op_->GetWorkspaceSize(); @@ -207,14 +207,14 @@ void InternalKernelMod::GetOrGenerateTiling(const std::vector &i void InternalKernelMod::GetInternalKernel(const std::vector &inputs, const std::vector &outputs) { // 使用动态加载的实现 - auto &loader = ms_custom_ops::InternalKernelsLoader::GetInstance(); + auto &loader = ms_custom_ops::AdapterLoader::GetInstance(); if (loader.Initialize()) { if (IsNeedRecreate(inputs, outputs)) { - internal::InputsImmutableInfoList inputs_ii; - internal::OutputsImmutableInfoList outputs_ii; - std::vector input_shapes; - std::vector output_shapes; - + mindspore::internal::InputsImmutableInfoList inputs_ii; + mindspore::internal::OutputsImmutableInfoList outputs_ii; + std::vector input_shapes; + std::vector output_shapes; + for (auto i : kernel_inputs_index_) { auto dtype = TransInternalDataType(inputs[i]->dtype_id()); auto format = TransInternalFormat(inputs[i]->format()); @@ -241,13 +241,13 @@ void InternalKernelMod::GetInternalKernel(const std::vector &inp internal_op_ = CreateKernel(inputs_ii, outputs_ii, inputs, outputs); MS_EXCEPTION_IF_NULL(internal_op_); auto status = internal_op_->Init(); - if (status != internal::kInternalOk) { + if (status != mindspore::internal::kInternalOk) { internal_op_ = nullptr; MS_LOG(ERROR) << "Init InternalKernel failed, kenrel_name: " << kernel_name_; } } } else { - MS_LOG(ERROR) << "Failed to initialize dynamic loader: " << loader.GetLastError(); + MS_LOG(ERROR) << "Failed to initialize dynamic loader: "; } } @@ -297,7 +297,7 @@ int InternalKernelMod::Resize(const std::vector &inputs, const s return KRET_RESIZE_FAILED; } auto internal_ret = internal_op_->UpdateShape(internal_inputs_shape_, internal_outputs_shape_); - if (internal_ret != internal::kInternalOk) { + if (internal_ret != mindspore::internal::kInternalOk) { MS_LOG(ERROR) << "InternalKernel UpdateShape failed, kernel_name: " << kernel_name_; return KRET_RESIZE_FAILED; } @@ -345,8 +345,8 @@ bool InternalKernelMod::Launch(const std::vector &inputs, const UpdateAddr(inputs, outputs, workspace); // 使用动态加载的实现 - internal::InternalStatus status = + mindspore::internal::InternalStatus status = internal_op_->Launch(internal_inputs_addr_, internal_outputs_addr_, internal_wss_addr_, stream_ptr, fullname_); - return (status == internal::InternalStatus::kInternalOk); + return (status == mindspore::internal::InternalStatus::kInternalOk); } } // namespace ms_custom_ops diff --git a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h index 96ac2ef38..b51028798 100644 --- a/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h +++ b/ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h @@ -29,7 +29,7 @@ #include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" #include "mindspore/ccsrc/tools/profiler/profiling.h" #include "acl/acl_mdl.h" -#include "internal_kernels_loader.h" +#include "../adapter_loader.h" #include "dynamic_op_helper.h" namespace ms_custom_ops { @@ -60,8 +60,8 @@ class InternalKernelMod : public KernelMod { virtual bool UpdateParam(const std::vector &inputs, const std::vector &outputs) { return true; } - virtual DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + virtual DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) { return nullptr; @@ -75,11 +75,11 @@ class InternalKernelMod : public KernelMod { std::vector kernel_inputs_index_; std::vector kernel_outputs_index_; DynamicInternalOpPtr internal_op_{nullptr}; - internal::ShapeInfoList internal_inputs_shape_; - internal::ShapeInfoList internal_outputs_shape_; - internal::InputsAddrList internal_inputs_addr_; - internal::OutputsAddrList internal_outputs_addr_; - internal::WsAddrList internal_wss_addr_; + mindspore::internal::ShapeInfoList internal_inputs_shape_; + mindspore::internal::ShapeInfoList internal_outputs_shape_; + mindspore::internal::InputsAddrList internal_inputs_addr_; + mindspore::internal::OutputsAddrList internal_outputs_addr_; + mindspore::internal::WsAddrList internal_wss_addr_; private: std::shared_ptr ascend_profiler_{nullptr}; diff --git a/ccsrc/base/ms_kernels_internal/internal_helper.cc b/ccsrc/base/ms_kernels_internal/internal_helper.cc index 87f33aad0..19229171d 100644 --- a/ccsrc/base/ms_kernels_internal/internal_helper.cc +++ b/ccsrc/base/ms_kernels_internal/internal_helper.cc @@ -30,44 +30,44 @@ #include "mindspore/core/include/utils/log_adapter.h" namespace ms_custom_ops { -internal::DataType TransInternalDataType(TypeId ms_type) { - static const std::unordered_map kMSTypeToInternalType = { - {kNumberTypeFloat16, internal::DataType::kTypeFloat16}, - {kNumberTypeBFloat16, internal::DataType::kTypeBF16}, - {kNumberTypeFloat32, internal::DataType::kTypeFloat32}, - {kNumberTypeDouble, internal::DataType::kTypeFloat64}, - {kNumberTypeInt32, internal::DataType::kTypeInt32}, - {kNumberTypeUInt32, internal::DataType::kTypeUint32}, - {kNumberTypeInt16, internal::DataType::kTypeInt16}, - {kNumberTypeUInt16, internal::DataType::kTypeUint16}, - {kNumberTypeInt8, internal::DataType::kTypeInt8}, - {kNumberTypeUInt8, internal::DataType::kTypeUint8}, - {kNumberTypeInt64, internal::DataType::kTypeInt64}, - {kNumberTypeUInt64, internal::DataType::kTypeUint64}, - {kNumberTypeBool, internal::DataType::kTypeBool}, +mindspore::internal::DataType TransInternalDataType(TypeId ms_type) { + static const std::unordered_map kMSTypeToInternalType = { + {kNumberTypeFloat16, mindspore::internal::DataType::kTypeFloat16}, + {kNumberTypeBFloat16, mindspore::internal::DataType::kTypeBF16}, + {kNumberTypeFloat32, mindspore::internal::DataType::kTypeFloat32}, + {kNumberTypeDouble, mindspore::internal::DataType::kTypeFloat64}, + {kNumberTypeInt32, mindspore::internal::DataType::kTypeInt32}, + {kNumberTypeUInt32, mindspore::internal::DataType::kTypeUint32}, + {kNumberTypeInt16, mindspore::internal::DataType::kTypeInt16}, + {kNumberTypeUInt16, mindspore::internal::DataType::kTypeUint16}, + {kNumberTypeInt8, mindspore::internal::DataType::kTypeInt8}, + {kNumberTypeUInt8, mindspore::internal::DataType::kTypeUint8}, + {kNumberTypeInt64, mindspore::internal::DataType::kTypeInt64}, + {kNumberTypeUInt64, mindspore::internal::DataType::kTypeUint64}, + {kNumberTypeBool, mindspore::internal::DataType::kTypeBool}, }; auto iter = kMSTypeToInternalType.find(ms_type); if (iter == kMSTypeToInternalType.end()) { MS_LOG(INFO) << "Type " << ms_type << " is not supported in Internal"; - return internal::DataType::kTypeUnknown; + return mindspore::internal::DataType::kTypeUnknown; } return iter->second; } -internal::TensorFormat TransInternalFormat(Format format) { - static const std::unordered_map kMSFormatToInternalFormat = { - {DEFAULT_FORMAT, internal::TensorFormat::kFormatND}, - {NCHW, internal::TensorFormat::kFormatNCHW}, - {NHWC, internal::TensorFormat::kFormatNHWC}, - {ND, internal::TensorFormat::kFormatND}, - {NC1HWC0, internal::TensorFormat::kFormatNC1HWC0}, - {FRACTAL_NZ, internal::TensorFormat::kFormatFRACTAL_NZ}, - {NC1HWC0_C04, internal::TensorFormat::kFormatNC1HWC0_C04}, - {NDHWC, internal::TensorFormat::kFormatNDHWC}, - {NCDHW, internal::TensorFormat::kFormatNCDHW}, - {FRACTAL_Z_3D, internal::TensorFormat::kFormatFRACTAL_Z_3D}, +mindspore::internal::TensorFormat TransInternalFormat(Format format) { + static const std::unordered_map kMSFormatToInternalFormat = { + {DEFAULT_FORMAT, mindspore::internal::TensorFormat::kFormatND}, + {NCHW, mindspore::internal::TensorFormat::kFormatNCHW}, + {NHWC, mindspore::internal::TensorFormat::kFormatNHWC}, + {ND, mindspore::internal::TensorFormat::kFormatND}, + {NC1HWC0, mindspore::internal::TensorFormat::kFormatNC1HWC0}, + {FRACTAL_NZ, mindspore::internal::TensorFormat::kFormatFRACTAL_NZ}, + {NC1HWC0_C04, mindspore::internal::TensorFormat::kFormatNC1HWC0_C04}, + {NDHWC, mindspore::internal::TensorFormat::kFormatNDHWC}, + {NCDHW, mindspore::internal::TensorFormat::kFormatNCDHW}, + {FRACTAL_Z_3D, mindspore::internal::TensorFormat::kFormatFRACTAL_Z_3D}, }; auto iter = kMSFormatToInternalFormat.find(format); @@ -81,7 +81,7 @@ internal::TensorFormat TransInternalFormat(Format format) { case NDHWC: case NCDHW: // some op not support NCHW, NHWC, ... format, current return ND format - return internal::TensorFormat::kFormatND; + return mindspore::internal::TensorFormat::kFormatND; default: return iter->second; } diff --git a/ccsrc/base/ms_kernels_internal/internal_helper.h b/ccsrc/base/ms_kernels_internal/internal_helper.h index c9ee2d166..4e4247754 100644 --- a/ccsrc/base/ms_kernels_internal/internal_helper.h +++ b/ccsrc/base/ms_kernels_internal/internal_helper.h @@ -28,18 +28,18 @@ using namespace mindspore; namespace ms_custom_ops { -inline internal::ShapeInfo TransInternalShape(const ShapeVector &shape) { +inline mindspore::internal::ShapeInfo TransInternalShape(const ShapeVector &shape) { if (shape.size() != 0) { return shape; } - internal::ShapeInfo internal_shape{1}; + mindspore::internal::ShapeInfo internal_shape{1}; return internal_shape; } bool CheckDefaultSupportFormat(const std::string &format); -internal::DataType TransInternalDataType(TypeId ms_type); +mindspore::internal::DataType TransInternalDataType(TypeId ms_type); -internal::TensorFormat TransInternalFormat(Format format); +mindspore::internal::TensorFormat TransInternalFormat(Format format); } // namespace ms_custom_ops #endif // MS_CUSTOM_OPS_INTERNAL_HELPER_H_ diff --git a/ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc b/ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc deleted file mode 100644 index 53d65be05..000000000 --- a/ccsrc/base/ms_kernels_internal/internal_kernels_loader.cc +++ /dev/null @@ -1,278 +0,0 @@ -/** - * Copyright 2025 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "internal_kernels_loader.h" -#include "dynamic_op_helper.h" -#include "mindspore/core/include/utils/log_adapter.h" -#include -#include -#include -#include -#include -#include -#include - -namespace ms_custom_ops { - -InternalKernelsLoader::~InternalKernelsLoader() { - if (library_handle_ != nullptr) { - dlclose(library_handle_); - library_handle_ = nullptr; - } -} - -bool InternalKernelsLoader::Initialize() { - if (initialized_) { - return true; - } - - if (!LoadLibrary()) { - return false; - } - - // 注意:不再加载通用的CreateInternalOp符号,因为动态库中只有具体的操作创建函数 - // 具体的函数将通过GetOpCreateFunction按需加载 - MS_LOG(INFO) << "Dynamic library loaded successfully, functions will be loaded on demand"; - - initialized_ = true; - return true; -} - -bool InternalKernelsLoader::LoadLibrary() { - // 使用多种路径策略加载库 - 加载真正包含CreateXxxOp函数的libms_kernels_internal.so - std::vector library_paths; - - // 1. 标准库路径搜索 (LD_LIBRARY_PATH) - library_paths.push_back("libms_kernels_internal.so"); - - // 2. 从环境变量获取MindSpore路径 - const char* ms_path = getenv("MINDSPORE_PATH"); - if (ms_path != nullptr) { - library_paths.push_back(std::string(ms_path) + "/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so"); - } - - // 3. 运行时Python动态检测MindSpore路径 - std::string runtime_ms_path = GetMindSporePathAtRuntime(); - if (!runtime_ms_path.empty()) { - library_paths.push_back(runtime_ms_path + "/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so"); - } - - for (const auto &path : library_paths) { - library_handle_ = dlopen(path.c_str(), RTLD_LAZY); - if (library_handle_ != nullptr) { - MS_LOG(INFO) << "Successfully loaded library from: " << path; - return true; - } - MS_LOG(DEBUG) << "Failed to load from: " << path << " - " << dlerror(); - } - - SetError("Failed to load library from all attempted paths. Last error: " + std::string(dlerror())); - return false; -} - -void *InternalKernelsLoader::GetFunctionPointer(const std::string &symbol_name) { - if (library_handle_ == nullptr) { - SetError("Library not loaded"); - return nullptr; - } - - dlerror(); // 清除之前的错误 - void *func_ptr = dlsym(library_handle_, symbol_name.c_str()); - char *error = dlerror(); - if (error != nullptr) { - SetError("Failed to get function pointer '" + symbol_name + "': " + std::string(error)); - return nullptr; - } - - return func_ptr; -} - -std::string InternalKernelsLoader::FindMangledSymbol(const std::string &func_name) { - if (library_handle_ == nullptr) { - SetError("Library not loaded"); - return ""; - } - - // 缓存符号查找结果 - static std::unordered_map symbol_cache; - auto cache_it = symbol_cache.find(func_name); - if (cache_it != symbol_cache.end()) { - return cache_it->second; - } - - // 方案1: 尝试已知的符号模式 - dlerror(); // 清除错误 - void* sym = dlsym(library_handle_, func_name.c_str()); - if (sym != nullptr) { - symbol_cache[func_name] = func_name; - return func_name; - } - - // 方案2: 调用外部 nm 命令来获取符号(运行时解析) - std::string lib_path = GetLibraryPath(); - if (!lib_path.empty()) { - std::string cmd = "nm -D " + lib_path + " 2>/dev/null | grep '" + func_name + "' | head -1 | awk '{print $3}'"; - FILE* pipe = popen(cmd.c_str(), "r"); - if (pipe) { - char buffer[1024]; - std::string result; - while (fgets(buffer, sizeof(buffer), pipe)) { - result += buffer; - } - pclose(pipe); - - // 去除换行符 - if (!result.empty() && result.back() == '\n') { - result.pop_back(); - } - - if (!result.empty()) { - // 验证找到的符号确实存在 - void* test_sym = dlsym(library_handle_, result.c_str()); - if (test_sym != nullptr) { - symbol_cache[func_name] = result; - MS_LOG(INFO) << "Found mangled symbol: " << result << " for " << func_name; - return result; - } - } - } - } - - MS_LOG(WARNING) << "Failed to find mangled symbol for: " << func_name; - return ""; // 未找到 -} - -uint64_t InternalKernelsLoader::CalcInternalOpApiHash(const std::string &op_name) { - if (!initialized_) { - SetError("Dynamic loader not initialized"); - return 0; - } - - if (calc_internal_op_api_hash_func_ == nullptr) { - SetError("CalcInternalOpApiHash function not available"); - return 0; - } - - return calc_internal_op_api_hash_func_(op_name); -} - -uint64_t InternalKernelsLoader::CalcInternalOpTilingHash( - const std::string &op_name, - const std::vector &input_shapes, - const std::vector &output_shapes) { - if (!initialized_) { - SetError("Dynamic loader not initialized"); - return 0; - } - - if (calc_internal_op_tiling_hash_func_ == nullptr) { - SetError("CalcInternalOpTilingHash function not available"); - return 0; - } - - return calc_internal_op_tiling_hash_func_(op_name, input_shapes, output_shapes); -} - -std::string InternalKernelsLoader::GetLibraryPath() { - // 使用 dladdr 获取已加载库的路径 - if (library_handle_ == nullptr) { - return ""; - } - - // 获取库中任意一个符号的地址信息 - void* symbol_addr = dlsym(library_handle_, "mindspore"); // 尝试一个通用符号 - if (!symbol_addr) { - // 如果没找到,尝试从 /proc/self/maps 解析 - FILE* maps = fopen("/proc/self/maps", "r"); - if (maps) { - char line[1024]; - while (fgets(line, sizeof(line), maps)) { - if (strstr(line, "libms_kernels_internal.so")) { - char* path = strrchr(line, ' '); - if (path) { - path++; // skip space - char* newline = strchr(path, '\n'); - if (newline) *newline = '\0'; - fclose(maps); - return std::string(path); - } - } - } - fclose(maps); - } - return ""; - } - - Dl_info info; - if (dladdr(symbol_addr, &info) && info.dli_fname) { - return std::string(info.dli_fname); - } - - return ""; -} - -std::string InternalKernelsLoader::GetMindSporePathAtRuntime() { - MS_LOG(DEBUG) << "Attempting to get MindSpore path at runtime"; - - // 执行Python命令获取MindSpore安装路径 - FILE* pipe = popen("python3 -c \"import mindspore as ms; import os; print(os.path.dirname(ms.__file__))\" 2>/dev/null", "r"); - if (pipe == nullptr) { - MS_LOG(ERROR) << "Failed to execute python3 command to get MindSpore path"; - return ""; - } - - char buffer[512]; - std::string result; - - // 读取命令输出 - if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - result = buffer; - // 移除换行符 - if (!result.empty() && result.back() == '\n') { - result.pop_back(); - } - MS_LOG(DEBUG) << "Python command returned path: " << result; - } else { - MS_LOG(ERROR) << "Failed to read output from python3 command"; - } - - pclose(pipe); - - if (result.empty()) { - MS_LOG(ERROR) << "Empty result from MindSpore path detection"; - return ""; - } - - // 验证路径是否包含mindspore且存在 - if (result.find("mindspore") != std::string::npos) { - std::string lib_path = result + "/lib/plugin/ascend/ms_kernels_internal/internal_kernel/lib/libms_kernels_internal.so"; - MS_LOG(DEBUG) << "Checking for library at: " << lib_path; - - // 检查文件是否存在 - if (access(lib_path.c_str(), F_OK) == 0) { - MS_LOG(INFO) << "Found MindSpore internal kernels library at: " << lib_path; - return result; - } else { - MS_LOG(ERROR) << "MindSpore internal kernels library not found at: " << lib_path; - } - } else { - MS_LOG(ERROR) << "Invalid MindSpore path (doesn't contain 'mindspore'): " << result; - } - - return ""; -} - -} // namespace ms_custom_ops \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/internal_kernels_loader.h b/ccsrc/base/ms_kernels_internal/internal_kernels_loader.h deleted file mode 100644 index 7956e297e..000000000 --- a/ccsrc/base/ms_kernels_internal/internal_kernels_loader.h +++ /dev/null @@ -1,155 +0,0 @@ -/** - * Copyright 2025 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MS_CUSTOM_OPS_SIMPLE_DYNAMIC_LOADER_H -#define MS_CUSTOM_OPS_SIMPLE_DYNAMIC_LOADER_H - -#include -#include -#include -#include -#include -#include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" - -namespace ms_custom_ops { - -using namespace mindspore; - -class DynamicInternalOp; -using DynamicInternalOpPtr = std::shared_ptr; - -typedef uint64_t (*CalcInternalOpApiHashFunc)(const std::string &op_name); - -typedef uint64_t (*CalcInternalOpTilingHashFunc)(const std::string &op_name, - const std::vector &input_shapes, - const std::vector &output_shapes); - -/** - * @brief 简单的动态加载器类,用于加载ms_kernels_internal库 - */ -class InternalKernelsLoader { - public: - static InternalKernelsLoader &GetInstance() { - static InternalKernelsLoader instance; - return instance; - } - - /** - * @brief 初始化动态加载器 - * @return true 成功,false 失败 - */ - bool Initialize(); - - /** - * @brief 检查是否已初始化 - * @return true 已初始化,false 未初始化 - */ - bool IsInitialized() const { return initialized_; } - - /** - * @brief 获取错误信息 - * @return 错误信息 - */ - const std::string &GetLastError() const { return last_error_; } - - - /** - * @brief 计算内部操作API哈希值 - * @param op_name 操作名称 - * @return 哈希值 - */ - uint64_t CalcInternalOpApiHash(const std::string &op_name); - - /** - * @brief 计算内部操作Tiling哈希值 - * @param op_name 操作名称 - * @param input_shapes 输入形状列表 - * @param output_shapes 输出形状列表 - * @return 哈希值 - */ - uint64_t CalcInternalOpTilingHash(const std::string &op_name, - const std::vector &input_shapes, - const std::vector &output_shapes); - - /** - * @brief 获取特定操作创建函数指针 - * @param op_name 操作名称 (如 "CreateAsdReshapeAndCacheOp") - * @return 函数指针,失败返回nullptr - */ - template - FuncType GetOpCreateFunction(const std::string &op_name) { - if (!initialized_ && !Initialize()) { - return nullptr; - } - return reinterpret_cast(GetFunctionPointer(op_name)); - } - - /** - * @brief 获取函数指针 - * @param symbol_name 符号名称 - * @return 函数指针 - */ - void *GetFunctionPointer(const std::string &symbol_name); - - /** - * @brief 通过模式匹配查找mangled符号名 - * @param func_name 函数名 (如 "CreateApplyRotaryPosEmbOp") - * @return mangled符号名,找不到返回空字符串 - */ - std::string FindMangledSymbol(const std::string &func_name); - - private: - InternalKernelsLoader() = default; - ~InternalKernelsLoader(); - InternalKernelsLoader(const InternalKernelsLoader &) = delete; - InternalKernelsLoader &operator=(const InternalKernelsLoader &) = delete; - - /** - * @brief 加载库 - * @return true 成功,false 失败 - */ - bool LoadLibrary(); - - /** - * @brief 运行时获取MindSpore安装路径 - * @return MindSpore安装路径,失败返回空字符串 - */ - std::string GetMindSporePathAtRuntime(); - - /** - * @brief 获取当前加载库的文件路径 - * @return 库文件路径,失败返回空字符串 - */ - std::string GetLibraryPath(); - - /** - * @brief 设置错误信息 - * @param error 错误信息 - */ - void SetError(const std::string &error) { last_error_ = error; } - - void *library_handle_ = nullptr; - bool initialized_ = false; - std::string last_error_; - - // 函数指针 - CalcInternalOpApiHashFunc calc_internal_op_api_hash_func_ = nullptr; - CalcInternalOpTilingHashFunc calc_internal_op_tiling_hash_func_ = nullptr; -}; - -} // namespace ms_custom_ops - -#endif // MS_CUSTOM_OPS_SIMPLE_DYNAMIC_LOADER_H \ No newline at end of file diff --git a/ccsrc/base/ms_kernels_internal/internal_tiling_cache.h b/ccsrc/base/ms_kernels_internal/internal_tiling_cache.h index d4540dc91..0059c61e5 100644 --- a/ccsrc/base/ms_kernels_internal/internal_tiling_cache.h +++ b/ccsrc/base/ms_kernels_internal/internal_tiling_cache.h @@ -111,11 +111,11 @@ void GatherHash(const T &arg, const Args &...args) { struct TilingCacheItem { std::atomic ref_count_{0}; - internal::TilingInfoPtr tiling_info_; + mindspore::internal::TilingInfoPtr tiling_info_; void *host_addr_; size_t size_; - TilingCacheItem(const internal::TilingInfoPtr &tiling_info, void *host_addr, size_t size) + TilingCacheItem(const mindspore::internal::TilingInfoPtr &tiling_info, void *host_addr, size_t size) : ref_count_(1), tiling_info_(tiling_info), host_addr_(host_addr), size_(size) {} }; using TilingCacheItemPtr = std::shared_ptr; diff --git a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc index 09eebef2f..056115e39 100644 --- a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc +++ b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.cc @@ -15,13 +15,13 @@ */ #include "internal_pyboost_runner.h" -#include "internal_kernels_loader.h" +#include "../adapter_loader.h" namespace ms_custom_ops { void InternalPyboostRunner::GetOrCreateKernel(const TensorList &inputs, const TensorList &outputs) { // 使用动态加载的实现 - auto &loader = ms_custom_ops::InternalKernelsLoader::GetInstance(); + auto &loader = ms_custom_ops::AdapterLoader::GetInstance(); if (loader.Initialize()) { auto key = GetOrGenerateOpKey(op_key_); auto it = hash_map_.find(key); @@ -67,7 +67,7 @@ void InternalPyboostRunner::GetOrCreateKernel(const TensorList &inputs, tiling_cache_item_ = GetOrGenerateTiling(); } else { - MS_LOG(ERROR) << "Failed to initialize dynamic loader: " << loader.GetLastError(); + MS_LOG(ERROR) << "Failed to initialize dynamic loader"; } } @@ -112,8 +112,7 @@ TilingCacheItemPtr InternalPyboostRunner::GetOrGenerateTiling() { auto host_addr = TilingMemMgr::GetInstance().pool_host_.Malloc(tiling_size); mindspore::internal::HostRunInfoPtr host_run_info_ptr = nullptr; auto status = internal_op_->Tiling(host_addr, &host_run_info_ptr); - if (status != mindspore::internal::kInternalOk || - host_run_info_ptr == nullptr) { + if (status != mindspore::internal::kInternalOk || host_run_info_ptr == nullptr) { MS_LOG(EXCEPTION) << "Tiling error for " << this->op_name() << ", status: " << status << ", host_run_info_ptr: " << host_run_info_ptr; @@ -121,8 +120,7 @@ TilingCacheItemPtr InternalPyboostRunner::GetOrGenerateTiling() { auto device_addr = TilingMemMgr::GetInstance().pool_device_.Malloc(tiling_size); TilingMemMgr::GetInstance().CopyAsync(host_addr, device_addr, tiling_size); - auto tiling_info = - std::make_shared(device_addr, nullptr); + auto tiling_info = std::make_shared(device_addr, nullptr); tiling_info->host_run_info_ = host_run_info_ptr; auto workspace_size_list = internal_op_->GetWorkspaceSize(); tiling_info->host_run_info_->SetWorkSpaceSize(workspace_size_list); @@ -153,9 +151,8 @@ TilingCacheItemPtr InternalPyboostRunner::GetOrGenerateTiling() { return tiling_info_ptr; } -void InternalPyboostRunner::TransInternalShapes( - mindspore::internal::ShapeInfoList *shapelist, const TensorList &tensorlist, - bool is_input) { +void InternalPyboostRunner::TransInternalShapes(mindspore::internal::ShapeInfoList *shapelist, + const TensorList &tensorlist, bool is_input) { for (size_t i = 0; i < tensorlist.size(); i++) { if (!tensorlist[i].is_defined()) { shapelist->at(i) = mindspore::internal::ShapeInfo{}; @@ -178,19 +175,18 @@ void InternalPyboostRunner::TransInternalShapes( } } - auto shape = tensorlist[i].data_type() != kMetaTypeNone - ? TransInternalShape(tensorlist[i].shape()) - : mindspore::internal::ShapeInfo{0}; + auto shape = tensorlist[i].data_type() != kMetaTypeNone ? TransInternalShape(tensorlist[i].shape()) + : mindspore::internal::ShapeInfo{0}; shapelist->at(i) = std::move(shape); } } void InternalPyboostRunner::UpdateArgImmutableInfo( - internal::ArgImmutableInfo *arginfo, const ms::Tensor &tensor, - internal::DataType dtype) { + mindspore::internal::ArgImmutableInfo *arginfo, const ms::Tensor &tensor, + mindspore::internal::DataType dtype) { arginfo->SetDtype(dtype); if (!tensor.is_defined()) { - arginfo->SetFormat(internal::TensorFormat::kFormatND); + arginfo->SetFormat(mindspore::internal::TensorFormat::kFormatND); return; } arginfo->SetFormat( @@ -198,7 +194,7 @@ void InternalPyboostRunner::UpdateArgImmutableInfo( } void InternalPyboostRunner::UpdateArgImmutableInfo( - std::vector *arginfos, + std::vector *arginfos, const TensorList &tensorlist, bool is_input) { arginfos->resize(tensorlist.size()); for (size_t i = 0; i < tensorlist.size(); ++i) { @@ -214,7 +210,7 @@ void InternalPyboostRunner::UpdateArgImmutableInfo( void InternalPyboostRunner::GetWorkspace( const DynamicInternalOpPtr &internal_op, - internal::WsAddrList *internal_wss_addr) { + mindspore::internal::WsAddrList *internal_wss_addr) { auto workspace_ptr = this->workspace_ptr(); if (workspace_ptr == nullptr) { return; @@ -236,22 +232,22 @@ void InternalPyboostRunner::LaunchKernel() { MS_EXCEPTION_IF_NULL(internal_op_); // 使用动态加载的实现 - internal::InputsAddrList inputs_addr; - internal::OutputsAddrList outputs_addr; + mindspore::internal::InputsAddrList inputs_addr; + mindspore::internal::OutputsAddrList outputs_addr; InternalPyboostRunner::UpdateAddr(&inputs_addr, this->inputs()); InternalPyboostRunner::UpdateAddr(&outputs_addr, this->outputs()); - internal::WsAddrList _internal_wss_addr; + mindspore::internal::WsAddrList _internal_wss_addr; InternalPyboostRunner::GetWorkspace(internal_op_, &_internal_wss_addr); auto op_name = this->op_name(); MS_LOG(DEBUG) << "Launch InternalKernel " << op_name << " start"; internal_op_->SetTilingInfo(tiling_cache_item_->tiling_info_); auto &internal_wss_addr = - const_cast(_internal_wss_addr); - internal::InternalStatus status = internal_op_->Launch( + const_cast(_internal_wss_addr); + mindspore::internal::InternalStatus status = internal_op_->Launch( inputs_addr, outputs_addr, internal_wss_addr, this->stream(), op_name); InternalTilingCache::GetInstance().Unbind(tiling_cache_item_); - if (status != internal::InternalStatus::kInternalOk) { + if (status != mindspore::internal::InternalStatus::kInternalOk) { MS_LOG(EXCEPTION) << "Launch InternalKernel failed, kernel_name: " << op_name; } diff --git a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h index 4baf356f1..89055296e 100644 --- a/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h +++ b/ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h @@ -25,7 +25,7 @@ #include "internal_pyboost_utils.h" #include "internal_spinlock.h" #include "internal_tiling_cache.h" -#include "internal_kernels_loader.h" +#include "../adapter_loader.h" #include "module.h" #include "mindspore/ccsrc/ms_extension/api.h" #include "lib/plugin/ascend/ms_kernels_internal/internal_kernel/include/internal.h" @@ -46,11 +46,11 @@ class InternalPyboostRunner : public ms::pynative::PyboostRunner { template void Setup(const std::string &op_name, const Args &...args) { // Load internal kernels library early to initialize device environment - auto& loader = InternalKernelsLoader::GetInstance(); - if (!loader.IsInitialized()) { + auto &loader = AdapterLoader::GetInstance(); + if (!loader.IsLoaded()) { MS_LOG(INFO) << "Loading internal kernels library during InternalPyboostRunner::Setup"; if (!loader.Initialize()) { - MS_LOG(ERROR) << "Failed to load internal kernels library in Setup: " << loader.GetLastError(); + MS_LOG(ERROR) << "Failed to load internal kernels library in Setup"; return; } } @@ -75,11 +75,12 @@ class InternalPyboostRunner : public ms::pynative::PyboostRunner { void TransDataType(const TensorList &ms_inputs, const TensorList &ms_outputs); TilingCacheItemPtr GetOrGenerateTiling(); - virtual DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) = 0; - void TransInternalShapes(internal::ShapeInfoList *shapelist, const TensorList &tensorlist, bool is_input = false); + virtual DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) = 0; + void TransInternalShapes(mindspore::internal::ShapeInfoList *shapelist, const TensorList &tensorlist, + bool is_input = false); - static void UpdateAddr(std::vector *addrlist, const TensorList &tensorlist) { + static void UpdateAddr(std::vector *addrlist, const TensorList &tensorlist) { addrlist->resize(tensorlist.size()); for (size_t i = 0; i < tensorlist.size(); i++) { if (!tensorlist[i].is_defined()) { @@ -90,7 +91,7 @@ class InternalPyboostRunner : public ms::pynative::PyboostRunner { } } - void GetWorkspace(const DynamicInternalOpPtr &internal_op, internal::WsAddrList *internal_wss_addr); + void GetWorkspace(const DynamicInternalOpPtr &internal_op, mindspore::internal::WsAddrList *internal_wss_addr); void LaunchKernel() override; @@ -98,18 +99,19 @@ class InternalPyboostRunner : public ms::pynative::PyboostRunner { uint64_t tiling_key_{0}; DynamicInternalOpPtr internal_op_{nullptr}; inline static std::unordered_map hash_map_; - internal::DtypeInfoList internal_inputs_dtype_; - internal::DtypeInfoList internal_outputs_dtype_; - internal::ShapeInfoList internal_inputs_shape_; - internal::ShapeInfoList internal_outputs_shape_; - internal::InputsImmutableInfoList inputs_ii_; - internal::OutputsImmutableInfoList outputs_ii_; + mindspore::internal::DtypeInfoList internal_inputs_dtype_; + mindspore::internal::DtypeInfoList internal_outputs_dtype_; + mindspore::internal::ShapeInfoList internal_inputs_shape_; + mindspore::internal::ShapeInfoList internal_outputs_shape_; + mindspore::internal::InputsImmutableInfoList inputs_ii_; + mindspore::internal::OutputsImmutableInfoList outputs_ii_; TilingCacheItemPtr tiling_cache_item_{nullptr}; private: - void UpdateArgImmutableInfo(internal::ArgImmutableInfo *arginfo, const ms::Tensor &tensor, internal::DataType dtype); - void UpdateArgImmutableInfo(std::vector *arginfos, const TensorList &tensorlist, - bool is_input = false); + void UpdateArgImmutableInfo(mindspore::internal::ArgImmutableInfo *arginfo, const ms::Tensor &tensor, + mindspore::internal::DataType dtype); + void UpdateArgImmutableInfo(std::vector *arginfos, + const TensorList &tensorlist, bool is_input = false); SimpleSpinLock lock_; }; diff --git a/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc b/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc index 4028821fa..bf91a9333 100644 --- a/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc +++ b/ccsrc/ops/ms_kernels_internal/apply_rotary_pos_emb/apply_rotary_pos_emb.cc @@ -82,11 +82,11 @@ class CustomApplyRotaryPosEmb : public InternalKernelMod { } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { - internal::ApplyRotaryPosEmbParam param; + mindspore::internal::ApplyRotaryPosEmbParam param; auto cos_format = ms_inputs.at(static_cast(ApplyRotaryPosEmbQueryInputIndex::kApplyRotaryPosEmbCosFormatIndex)); if (cos_format->dtype_id() == TypeId::kNumberTypeInt64) { @@ -96,7 +96,7 @@ class CustomApplyRotaryPosEmb : public InternalKernelMod { << cos_format->dtype_id(); } return CALL_DYNAMIC_OP_INTERNAL(CreateApplyRotaryPosEmbOp, ApplyRotaryPosEmbParam, inputs, outputs, param, - internal::kInternalApplyRotaryPosEmbOpName); + mindspore::internal::kInternalApplyRotaryPosEmbOpName); } }; } // namespace ms_custom_ops @@ -119,12 +119,12 @@ class ApplyRotaryPosEmbRunner : public InternalPyboostRunner { void SetCosFormat(const int32_t &cos_format) { this->cos_format_ = cos_format; } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { - internal::ApplyRotaryPosEmbParam param; + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { + mindspore::internal::ApplyRotaryPosEmbParam param; param.cos_format = this->cos_format_; return CALL_DYNAMIC_OP_INTERNAL(CreateApplyRotaryPosEmbOp, ApplyRotaryPosEmbParam, inputs, outputs, param, - internal::kInternalApplyRotaryPosEmbOpName); + mindspore::internal::kInternalApplyRotaryPosEmbOpName); } private: diff --git a/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc b/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc index 2b64b3b32..09973af08 100644 --- a/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc +++ b/ccsrc/ops/ms_kernels_internal/mla/mla_graph.cc @@ -198,16 +198,16 @@ class Mla : public InternalKernelMod { ~Mla() = default; protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { - param_.type = internal::MLAParam::kSplitCache; + param_.type = mindspore::internal::MLAParam::kSplitCache; param_.head_size = static_cast(ms_inputs[kMlaInputNumHeadIndex]->GetValueWithCheck()); param_.tor = ms_inputs[kMlaInputScaleValueIndex]->GetValueWithCheck(); param_.kv_head = static_cast(ms_inputs[kMlaInputNumKVHeadIndex]->GetValueWithCheck()); param_.mask_type = - static_cast(ms_inputs[kMlaInputMaskTypeIndex]->GetValueWithCheck()); + static_cast(ms_inputs[kMlaInputMaskTypeIndex]->GetValueWithCheck()); param_.is_ring = static_cast(ms_inputs[kMlaInputIsRingIndex]->GetValueWithCheck()); param_.q_seq_len = ms_inputs[kMlaInputQueryLensIndex]->GetValueWithCheck>(); @@ -217,12 +217,12 @@ class Mla : public InternalKernelMod { created_flag_ = true; if (input_format == kKVFormatNZ) { auto inputs_new = inputs; - inputs_new[kMlaInputKvCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); - inputs_new[kMlaInputKropeIndex].SetFormat(internal::kFormatFRACTAL_NZ); - return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs_new, outputs, param_, internal::kInternalMLAOpName); + inputs_new[kMlaInputKvCacheIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + inputs_new[kMlaInputKropeIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs_new, outputs, param_, mindspore::internal::kInternalMLAOpName); } - return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs, outputs, param_, internal::kInternalMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs, outputs, param_, mindspore::internal::kInternalMLAOpName); } bool UpdateParam(const std::vector &inputs, const std::vector &outputs) override { @@ -237,7 +237,7 @@ class Mla : public InternalKernelMod { auto kv_need_recreate = GetSeqLenAndCheckUpdate(inputs[kMlaInputContextLensIndex], ¶m_.kv_seq_len); if (q_need_recreate || kv_need_recreate) { auto ret = internal_op_->UpdateParam(¶m_); - if (ret != internal::kInternalOk) { + if (ret != mindspore::internal::kInternalOk) { MS_LOG(ERROR) << "InternalMla UpdateParam failed, kernel_name: " << kernel_name_; return false; } @@ -262,7 +262,7 @@ class Mla : public InternalKernelMod { private: bool created_flag_{false}; - internal::MLAParam param_; + mindspore::internal::MLAParam param_; }; } // namespace ms_custom_ops diff --git a/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc b/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc index c9c4928d2..3438c386f 100644 --- a/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc +++ b/ccsrc/ops/ms_kernels_internal/mla/mla_pynative.cc @@ -57,7 +57,7 @@ class MlaRunner : public InternalPyboostRunner { if (need_update_param_) { auto ret = internal_op_->UpdateParam(¶m_); - if (ret != internal::kInternalOk) { + if (ret != mindspore::internal::kInternalOk) { MS_LOG(ERROR) << "InternalMla UpdateParam failed in MlaRunner."; return false; } @@ -65,16 +65,16 @@ class MlaRunner : public InternalPyboostRunner { } } - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { created_flag_ = true; if (input_format_ == kKVFormatNZ) { auto inputs_new = inputs; - inputs_new[kMlaInputKvCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); - inputs_new[kMlaInputKropeIndex].SetFormat(internal::kFormatFRACTAL_NZ); - return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs_new, outputs, param_, internal::kInternalMLAOpName); + inputs_new[kMlaInputKvCacheIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + inputs_new[kMlaInputKropeIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs_new, outputs, param_, mindspore::internal::kInternalMLAOpName); } - return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs, outputs, param_, internal::kInternalMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMLAOp, MLAParam, inputs, outputs, param_, mindspore::internal::kInternalMLAOpName); } private: diff --git a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h index e53175e41..85ba41415 100644 --- a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h +++ b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_common.h @@ -64,17 +64,17 @@ constexpr int64_t kMlaPreCacheModeQK = 0; constexpr int64_t kMlaPreCacheModeQKSplitQuant = 2; constexpr int64_t kMlaPreCacheModeQKSplitNz = 3; -inline DynamicInternalOpPtr CreateMlaPreprocessOpWithFormat(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, - const internal::MlaPreprocessParam ¶m) { +inline DynamicInternalOpPtr CreateMlaPreprocessOpWithFormat(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, + const mindspore::internal::MlaPreprocessParam ¶m) { auto inputs_clone = inputs; - inputs_clone[kMlaPreprocessWdqkvIndex].SetFormat(internal::kFormatFRACTAL_NZ); - inputs_clone[kMlaPreprocessWuqIndex].SetFormat(internal::kFormatFRACTAL_NZ); + inputs_clone[kMlaPreprocessWdqkvIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + inputs_clone[kMlaPreprocessWuqIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); if (param.cache_mode == kMlaPreCacheModeQKSplitQuant || param.cache_mode == kMlaPreCacheModeQKSplitNz) { - inputs_clone[kMlaPreprocessKeyCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); - inputs_clone[kMlaPreprocessKropeCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); + inputs_clone[kMlaPreprocessKeyCacheIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + inputs_clone[kMlaPreprocessKropeCacheIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); } - return CALL_DYNAMIC_OP_INTERNAL(CreateMlaPreprocessOp, MlaPreprocessParam, inputs_clone, outputs, param, internal::kInternalMlaPreprocessOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMlaPreprocessOp, MlaPreprocessParam, inputs_clone, outputs, param, mindspore::internal::kInternalMlaPreprocessOpName); }; diff --git a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc index 5013ff218..2eaaaeeb7 100644 --- a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc +++ b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_graph.cc @@ -70,11 +70,11 @@ public: } protected: DynamicInternalOpPtr - CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { - internal::MlaPreprocessParam param; + mindspore::internal::MlaPreprocessParam param; auto cache_mode = ms_inputs.at(kMlaPreprocessParamCacheModeIndex); if (cache_mode->dtype_id() == TypeId::kNumberTypeInt64) { param.n = 0; diff --git a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc index e85a7daf1..ed5f2d236 100644 --- a/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc +++ b/ccsrc/ops/ms_kernels_internal/mla_preprocess/mla_preprocess_pynative.cc @@ -27,11 +27,11 @@ class MlaPreprocessLoadRunner : public InternalPyboostRunner { public: using InternalPyboostRunner::InternalPyboostRunner; void SetParamCacheMode(const int32_t &cache_mode) { this->cache_mode_ = cache_mode; } - internal::MlaPreprocessParam param_; + mindspore::internal::MlaPreprocessParam param_; protected: DynamicInternalOpPtr - CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { + CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { return CreateMlaPreprocessOpWithFormat(inputs, outputs, param_); } diff --git a/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc b/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc index 956515632..c91b71431 100644 --- a/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc +++ b/ccsrc/ops/ms_kernels_internal/moe_gating_group_topk/moe_gating_group_topk.cc @@ -102,11 +102,11 @@ class CustomMoeGatingGroupTopK : public InternalKernelMod { } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { - internal::MoeGatingGroupTopKParam param; + mindspore::internal::MoeGatingGroupTopKParam param; auto k = ms_inputs.at(kIndex2); auto k_group = ms_inputs.at(kIndex3); auto group_count = ms_inputs.at(kIndex4); @@ -144,7 +144,7 @@ class CustomMoeGatingGroupTopK : public InternalKernelMod { << TypeIdToString(out_flag->dtype_id()) << ", " << TypeIdToString(routed_scaling_factor->dtype_id()) << ", " << TypeIdToString(eps->dtype_id()) << "]"; } - return CALL_DYNAMIC_OP_INTERNAL(CreateMoeGatingGroupTopKOp, MoeGatingGroupTopKParam, inputs, outputs, param, internal::kInternalMoeGatingGroupTopKOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateMoeGatingGroupTopKOp, MoeGatingGroupTopKParam, inputs, outputs, param, mindspore::internal::kInternalMoeGatingGroupTopKOpName); } }; } // namespace ms_custom_ops @@ -179,13 +179,13 @@ class MoeGatingGroupTopKRunner : public InternalPyboostRunner { } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { - return CALL_DYNAMIC_OP_INTERNAL(CreateMoeGatingGroupTopKOp, MoeGatingGroupTopKParam, inputs, outputs, param_, internal::kInternalMoeGatingGroupTopKOpName); + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { + return CALL_DYNAMIC_OP_INTERNAL(CreateMoeGatingGroupTopKOp, MoeGatingGroupTopKParam, inputs, outputs, param_, mindspore::internal::kInternalMoeGatingGroupTopKOpName); } private: - internal::MoeGatingGroupTopKParam param_; + mindspore::internal::MoeGatingGroupTopKParam param_; }; } // namespace ms::pynative diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h index a46e71f7c..2a358abc1 100644 --- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h +++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h @@ -41,18 +41,18 @@ enum PagedCacheLoadOutputIndex : size_t { kPCLOutputsNum }; -inline DynamicInternalOpPtr CreatePagedCacheLoadOpWithFormat(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, - const internal::PagedCacheLoadParam ¶m) { +inline DynamicInternalOpPtr CreatePagedCacheLoadOpWithFormat(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, + const mindspore::internal::PagedCacheLoadParam ¶m) { if (param.kv_cache_cfg_type == 1) { auto inputs_clone = inputs; - inputs_clone[kPCLInputKeyCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); - inputs_clone[kPCLInputValueCacheIndex].SetFormat(internal::kFormatFRACTAL_NZ); + inputs_clone[kPCLInputKeyCacheIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + inputs_clone[kPCLInputValueCacheIndex].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); return CALL_DYNAMIC_OP_INTERNAL(CreatePagedCacheLoadOp, PagedCacheLoadParam, inputs_clone, outputs, param, - internal::kInternalPagedCacheLoadOpName); + mindspore::internal::kInternalPagedCacheLoadOpName); } return CALL_DYNAMIC_OP_INTERNAL(CreatePagedCacheLoadOp, PagedCacheLoadParam, inputs, outputs, param, - internal::kInternalPagedCacheLoadOpName); + mindspore::internal::kInternalPagedCacheLoadOpName); }; } // namespace ms_custom_ops #endif diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc index 4eda56fff..9bc4cd72a 100644 --- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc +++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc @@ -81,11 +81,11 @@ public: protected: DynamicInternalOpPtr - CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { - internal::PagedCacheLoadParam param; + mindspore::internal::PagedCacheLoadParam param; auto kv_cache_cfg_type = ms_inputs.at(kPCLInputParamKvCacheCfgIndex); auto is_seq_lens_cumsum_type = ms_inputs.at(kPCLInputParamIsSeqLensCumsumTypeIndex); auto has_seq_starts = ms_inputs.at(kPCLInputParamHasSeqStartsIndex); diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc index c989c8ded..a1ca3a122 100644 --- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc +++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc @@ -31,11 +31,11 @@ public: this->is_seq_lens_cumsum_type_ = is_seq_lens_cumsum_type; } void SetHasSeqStarts(const bool &has_seq_starts) { this->has_seq_starts_ = has_seq_starts; } - internal::PagedCacheLoadParam param_; + mindspore::internal::PagedCacheLoadParam param_; protected: DynamicInternalOpPtr - CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { + CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { return CreatePagedCacheLoadOpWithFormat(inputs, outputs, param_); } diff --git a/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc b/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc index cb0c2a56e..e52e6bce3 100644 --- a/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc +++ b/ccsrc/ops/ms_kernels_internal/reshape_and_cache/reshape_and_cache.cc @@ -20,7 +20,7 @@ #include #include "ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h" -#include "ccsrc/base/ms_kernels_internal/internal_kernels_loader.h" +#include "ccsrc/base/ms_kernels_internal/adapter_loader.h" #include "ccsrc/utils/utils.h" #include "mindspore/core/include/mindapi/ir/tensor.h" #include "mindspore/ops/kernel/ascend/acl_ir/acl_convert.h" @@ -52,19 +52,21 @@ enum class InputIndex : size_t { enum class OutputIndex : size_t { kOutputIndex = 0 }; -inline DynamicInternalOpPtr CreateReshapeAndCacheOpWithFormat(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, - const internal::ReshapeAndCacheParam ¶m, - int32_t cache_mode) { +inline DynamicInternalOpPtr CreateReshapeAndCacheOpWithFormat( + const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const mindspore::internal::ReshapeAndCacheParam ¶m, + int32_t cache_mode) { if (cache_mode == static_cast(CacheMode::NZ)) { auto inputs_clone = inputs; - inputs_clone[static_cast(InputIndex::kInputKeyCacheIndex)].SetFormat(internal::kFormatFRACTAL_NZ); - inputs_clone[static_cast(InputIndex::kInputValueCacheIndex)].SetFormat(internal::kFormatFRACTAL_NZ); - return CALL_DYNAMIC_OP_INTERNAL(CreateAsdReshapeAndCacheOp, ReshapeAndCacheParam, - inputs_clone, outputs, param, internal::kInternalAsdReshapeAndCacheOpName); + inputs_clone[static_cast(InputIndex::kInputKeyCacheIndex)].SetFormat( + mindspore::internal::kFormatFRACTAL_NZ); + inputs_clone[static_cast(InputIndex::kInputValueCacheIndex)].SetFormat( + mindspore::internal::kFormatFRACTAL_NZ); + return CALL_DYNAMIC_OP_INTERNAL(CreateAsdReshapeAndCacheOp, ReshapeAndCacheParam, inputs_clone, outputs, param, + mindspore::internal::kInternalAsdReshapeAndCacheOpName); } - return CALL_DYNAMIC_OP_INTERNAL(CreateAsdReshapeAndCacheOp, ReshapeAndCacheParam, - inputs, outputs, param, internal::kInternalAsdReshapeAndCacheOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateAsdReshapeAndCacheOp, ReshapeAndCacheParam, inputs, outputs, param, + mindspore::internal::kInternalAsdReshapeAndCacheOpName); } // ============================================================================= @@ -128,11 +130,11 @@ class CustomReshapeAndCache : public InternalKernelMod { } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, - const std::vector &ms_inputs, - const std::vector &ms_outputs) override { - internal::ReshapeAndCacheParam param; + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, + const std::vector &ms_inputs, + const std::vector &ms_outputs) override { + mindspore::internal::ReshapeAndCacheParam param; auto head_num = ms_inputs.at(static_cast(InputIndex::kInputHeadNumIndex)); if (head_num->dtype_id() == TypeId::kNumberTypeInt64) { param.head_num = static_cast(head_num->GetValue().value()); @@ -174,9 +176,9 @@ class ReshapeAndCacheRunner : public InternalPyboostRunner { void SetCacheMode(const int32_t &cache_mode) { this->cache_mode_ = cache_mode; } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { - internal::ReshapeAndCacheParam param; + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { + mindspore::internal::ReshapeAndCacheParam param; param.head_num = this->head_num_; return CreateReshapeAndCacheOpWithFormat(inputs, outputs, param, this->cache_mode_); diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc index dc4b00805..38bc2e227 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.cc @@ -85,9 +85,9 @@ void CustomRingMLAOpFuncImpl::CheckInputShape(const PrimitivePtr &primitive, ShapeArray CustomRingMLAOpFuncImpl::InferShape(const PrimitivePtr &primitive, const InferInfoPtrList &input_infos) const { - auto calc_type = static_cast( + auto calc_type = static_cast( input_infos[kCalcTypeIdx]->GetScalarValueWithCheck()); - is_input_softmax_lse_ = (calc_type == internal::RingMLAParam::CalcType::CALC_TYPE_DEFAULT); + is_input_softmax_lse_ = (calc_type == mindspore::internal::RingMLAParam::CalcType::CALC_TYPE_DEFAULT); (void)CheckInputShape(primitive, input_infos); const auto &query_shape = input_infos[kQueryIdx]->GetShape(); const auto &value_shape = input_infos[kValueIdx]->GetShape(); @@ -112,9 +112,9 @@ std::vector CustomRingMLAOpFuncImpl::InferType(const PrimitivePtr &primi return {query_type, TypeId::kNumberTypeFloat32}; } -bool CustomRingMLA::RingMLAParamCheck(const internal::RingMLAParam &op_param) { - if (op_param.calcType != internal::RingMLAParam::CalcType::CALC_TYPE_DEFAULT && - op_param.calcType != internal::RingMLAParam::CalcType::CALC_TYPE_FISRT_RING) { +bool CustomRingMLA::RingMLAParamCheck(const mindspore::internal::RingMLAParam &op_param) { + if (op_param.calcType != mindspore::internal::RingMLAParam::CalcType::CALC_TYPE_DEFAULT && + op_param.calcType != mindspore::internal::RingMLAParam::CalcType::CALC_TYPE_FISRT_RING) { MS_LOG(ERROR) << "Ring MLA expects calcType to be one of CALC_TYPE_DEFAULT, CALC_TYPE_FISRT_RING. " << "But got param.calcType = " << op_param.calcType; return false; @@ -140,18 +140,18 @@ bool CustomRingMLA::RingMLAParamCheck(const internal::RingMLAParam &op_param) { << ", param.kvHeadNum = " << op_param.kvHeadNum; return false; } - if (op_param.maskType != internal::RingMLAParam::MaskType::NO_MASK && - op_param.maskType != internal::RingMLAParam::MaskType::MASK_TYPE_TRIU) { + if (op_param.maskType != mindspore::internal::RingMLAParam::MaskType::NO_MASK && + op_param.maskType != mindspore::internal::RingMLAParam::MaskType::MASK_TYPE_TRIU) { MS_LOG(ERROR) << "Ring MLA expects maskType as one of NO_MASK, MASK_TYPE_TRIU, " << "But got param.maskType = " << op_param.maskType; return false; } - if (op_param.inputLayout != internal::RingMLAParam::InputLayout::TYPE_BSND) { + if (op_param.inputLayout != mindspore::internal::RingMLAParam::InputLayout::TYPE_BSND) { MS_LOG(ERROR) << "Ring MLA only supports inputLayout as TYPE_BSND, " << "But got param.inputLayout = " << op_param.inputLayout; return false; } - if (op_param.kernelType != internal::RingMLAParam::KernelType::KERNELTYPE_HIGH_PRECISION) { + if (op_param.kernelType != mindspore::internal::RingMLAParam::KernelType::KERNELTYPE_HIGH_PRECISION) { MS_LOG(ERROR) << "Ring MLA only supports kernelType as KERNELTYPE_HIGH_PRECISION, " << "But got param.kernelType = " << op_param.kernelType; return false; @@ -220,17 +220,17 @@ static bool GetSeqLenFromInputAndCheckUpdate(const std::string &kernel_name, con return true; } -DynamicInternalOpPtr CustomRingMLA::CreateKernel(const internal::InputsImmutableInfoList &inputs_ii, - const internal::OutputsImmutableInfoList &outputs_ii, +DynamicInternalOpPtr CustomRingMLA::CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs_ii, + const mindspore::internal::OutputsImmutableInfoList &outputs_ii, const std::vector &ms_inputs, const std::vector &ms_outputs) { // Extract and set all required parameters from ms_inputs param_.headNum = static_cast(ms_inputs[kHeadNumIdx]->GetValueWithCheck()); param_.qkScale = ms_inputs[kQkScaleIdx]->GetValueWithCheck(); param_.kvHeadNum = static_cast(ms_inputs[kKvHeadNumIdx]->GetValueWithCheck()); - param_.maskType = static_cast( + param_.maskType = static_cast( ms_inputs[kMaskTypeIdx]->GetValueWithCheck()); - param_.calcType = static_cast( + param_.calcType = static_cast( ms_inputs[kCalcTypeIdx]->GetValueWithCheck()); // Update sequence lengths from input tensors @@ -243,7 +243,7 @@ DynamicInternalOpPtr CustomRingMLA::CreateKernel(const internal::InputsImmutable "parameters, kernel_name: ", kernel_name_)); created_flag_ = true; - return CALL_DYNAMIC_OP_INTERNAL(CreateRingMLAOp, RingMLAParam, inputs_ii, outputs_ii, param_, internal::kInternalRingMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateRingMLAOp, RingMLAParam, inputs_ii, outputs_ii, param_, mindspore::internal::kInternalRingMLAOpName); } bool CustomRingMLA::UpdateParam(const std::vector &inputs, @@ -261,7 +261,7 @@ bool CustomRingMLA::UpdateParam(const std::vector &inputs, inputs[kKVSeqLenIdx], ¶m_.kvSeqLen); if (q_need_update || kv_need_update) { auto ret = internal_op_->UpdateParam(¶m_); - if (ret != internal::kInternalOk) { + if (ret != mindspore::internal::kInternalOk) { MS_LOG(ERROR) << "CustomRingMLA UpdateParam failed, kernel_name: " << kernel_name_; return false; } diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h index c399489c7..09998b7ec 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla.h @@ -108,8 +108,8 @@ class CustomRingMLA : public InternalKernelMod { protected: DynamicInternalOpPtr CreateKernel( - const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override; bool UpdateParam(const std::vector &inputs, @@ -117,9 +117,9 @@ class CustomRingMLA : public InternalKernelMod { uint64_t GenerateTilingKey(const std::vector &inputs) override; private: - bool RingMLAParamCheck(const internal::RingMLAParam &op_param); + bool RingMLAParamCheck(const mindspore::internal::RingMLAParam &op_param); bool created_flag_{false}; - internal::RingMLAParam param_; + mindspore::internal::RingMLAParam param_; }; } // namespace ms_custom_ops diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc index 9c06fade7..8d1200524 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.cc @@ -57,8 +57,8 @@ void RingMLARunner::SetRingMLAParam(int64_t head_num, float scale_value, int64_t param_.headNum = static_cast(head_num); param_.qkScale = scale_value; param_.kvHeadNum = static_cast(kv_head_num); - param_.maskType = static_cast(mask_type); - param_.calcType = static_cast(calc_type); + param_.maskType = static_cast(mask_type); + param_.calcType = static_cast(calc_type); } bool RingMLARunner::UpdateParam() { @@ -71,17 +71,17 @@ bool RingMLARunner::UpdateParam() { return false; } auto ret = internal_op_->UpdateParam(¶m_); - if (ret != internal::kInternalOk) { + if (ret != mindspore::internal::kInternalOk) { MS_LOG(ERROR) << "RingMLARunner UpdateParam failed."; return false; } return true; } -DynamicInternalOpPtr RingMLARunner::CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) { +DynamicInternalOpPtr RingMLARunner::CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) { created_flag_ = true; - return CALL_DYNAMIC_OP_INTERNAL(CreateRingMLAOp, RingMLAParam, inputs, outputs, param_, internal::kInternalRingMLAOpName); + return CALL_DYNAMIC_OP_INTERNAL(CreateRingMLAOp, RingMLAParam, inputs, outputs, param_, mindspore::internal::kInternalRingMLAOpName); } namespace { @@ -89,7 +89,7 @@ ms::Tensor GenAttnOutTensor(const ms::Tensor &query) { return ms::Tensor(query.d ms::Tensor GenLseOutTensor(const ms::Tensor &query, const std::optional &lse_prev, const int64_t &calc_type) { - using CalcType = internal::RingMLAParam::CalcType; + using CalcType = mindspore::internal::RingMLAParam::CalcType; bool is_ring = static_cast(calc_type) == CalcType::CALC_TYPE_DEFAULT; if (is_ring && lse_prev.has_value()) { return ms::Tensor(lse_prev.value().data_type(), lse_prev.value().shape()); diff --git a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h index 89d96d9ed..e3a6118f3 100644 --- a/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h +++ b/ccsrc/ops/ms_kernels_internal/ring_mla/ring_mla_runner.h @@ -43,12 +43,12 @@ class RingMLARunner : public InternalPyboostRunner { protected: bool UpdateParam() override; - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override; + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override; private: bool created_flag_{false}; - internal::RingMLAParam param_; + mindspore::internal::RingMLAParam param_; }; } // namespace ms_custom_ops diff --git a/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc b/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc index 942256176..e45fed0c6 100644 --- a/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc +++ b/ccsrc/ops/ms_kernels_internal/trans_data/trans_data.cc @@ -47,25 +47,25 @@ enum class InputIndex : size_t { enum class OutputIndex : size_t { kOutputIndex = 0 }; -inline DynamicInternalOpPtr CreateTransDataOpWithParam(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, +inline DynamicInternalOpPtr CreateTransDataOpWithParam(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, int32_t transdata_type) { - internal::TransDataParam param; + mindspore::internal::TransDataParam param; // Map transdata_type to internal enum and set appropriate input format auto inputs_clone = inputs; auto outputs_clone = outputs; if (transdata_type == static_cast(TransdataType::FRACTAL_NZ_TO_ND)) { - param.transdataType = internal::TransDataParam::FRACTAL_NZ_TO_ND; + param.transdataType = mindspore::internal::TransDataParam::FRACTAL_NZ_TO_ND; // For FRACTAL_NZ_TO_ND: input should be FRACTAL_NZ format - inputs_clone[0].SetFormat(internal::kFormatFRACTAL_NZ); - outputs_clone[0].SetFormat(internal::kFormatND); + inputs_clone[0].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); + outputs_clone[0].SetFormat(mindspore::internal::kFormatND); } else if (transdata_type == static_cast(TransdataType::ND_TO_FRACTAL_NZ)) { - param.transdataType = internal::TransDataParam::ND_TO_FRACTAL_NZ; + param.transdataType = mindspore::internal::TransDataParam::ND_TO_FRACTAL_NZ; // For ND_TO_FRACTAL_NZ: input should be ND format - inputs_clone[0].SetFormat(internal::kFormatND); - outputs_clone[0].SetFormat(internal::kFormatFRACTAL_NZ); + inputs_clone[0].SetFormat(mindspore::internal::kFormatND); + outputs_clone[0].SetFormat(mindspore::internal::kFormatFRACTAL_NZ); } else { MS_LOG(EXCEPTION) << "TransData: Invalid transdata_type " << transdata_type << ", valid values are: 0 (FRACTAL_NZ_TO_ND), 1 (ND_TO_FRACTAL_NZ)"; @@ -73,10 +73,10 @@ inline DynamicInternalOpPtr CreateTransDataOpWithParam(const internal::InputsImm // Note: outCrops are handled internally by the ms_kernels_internal layer // Users do not need to specify outCrops - they are auto-calculated - param.specialTransdata = internal::TransDataParam::NORMAL; + param.specialTransdata = mindspore::internal::TransDataParam::NORMAL; return CALL_DYNAMIC_OP_INTERNAL(CreateTransDataOp, TransDataParam, inputs_clone, outputs_clone, param, - internal::kInternalTransDataOpName); + mindspore::internal::kInternalTransDataOpName); } // ============================================================================= @@ -137,8 +137,8 @@ class CustomTransData : public InternalKernelMod { } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs, + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs, const std::vector &ms_inputs, const std::vector &ms_outputs) override { auto transdata_type = ms_inputs.at(static_cast(InputIndex::kTransdataTypeIndex)); @@ -174,8 +174,8 @@ class TransDataRunner : public InternalPyboostRunner { void SetTransdataType(const int32_t &transdata_type) { this->transdata_type_ = transdata_type; } protected: - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { return CreateTransDataOpWithParam(inputs, outputs, this->transdata_type_); } diff --git a/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc b/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc index 2dd552436..63a53c2e5 100644 --- a/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc +++ b/ccsrc/ops/ms_kernels_internal/type_cast/type_cast.cc @@ -130,8 +130,8 @@ public: protected: size_t CalcWorkspace() override { return 0; } - DynamicInternalOpPtr CreateKernel(const internal::InputsImmutableInfoList &inputs, - const internal::OutputsImmutableInfoList &outputs) override { + DynamicInternalOpPtr CreateKernel(const mindspore::internal::InputsImmutableInfoList &inputs, + const mindspore::internal::OutputsImmutableInfoList &outputs) override { return nullptr; } diff --git a/cmake/find_ms_internal_kernels_lib.cmake b/cmake/find_ms_internal_kernels_lib.cmake index 05f8e9216..7dbe9bd4d 100644 --- a/cmake/find_ms_internal_kernels_lib.cmake +++ b/cmake/find_ms_internal_kernels_lib.cmake @@ -37,8 +37,16 @@ endif() # Set up include paths if MindSpore path is available if(MS_PATH AND NOT MS_PATH STREQUAL "") set(INTERNAL_KERNEL_ROOT_PATH "${MS_PATH}/lib/plugin/ascend/ms_kernels_internal/internal_kernel") - set(INTERNAL_KERNEL_LIB_PATH "${MS_PATH}/lib/plugin/ascend") - set(INTERNAL_KERNEL_INC_PATH "${INTERNAL_KERNEL_ROOT_PATH}" "${INTERNAL_KERNEL_ROOT_PATH}/include") + set(INTERNAL_KERNEL_LIB_PATH "${INTERNAL_KERNEL_ROOT_PATH}/lib") + # Add both MindSpore internal path and project root path for ms_kernels_internal + # The MindSpore installation path should take priority + set(INTERNAL_KERNEL_INC_PATH + "${INTERNAL_KERNEL_ROOT_PATH}/include" + "${INTERNAL_KERNEL_ROOT_PATH}" + "${CMAKE_CURRENT_SOURCE_DIR}/../../ms_kernels_internal/include" + "${CMAKE_CURRENT_SOURCE_DIR}/../../ms_kernels_internal" + "${CMAKE_CURRENT_SOURCE_DIR}/../../" + ) # Check if paths exist (non-fatal for dynamic loading) foreach(INCLUDE_PATH ${INTERNAL_KERNEL_INC_PATH}) @@ -63,9 +71,18 @@ endif() # 3. Environment variables (MINDSPORE_PATH) # 4. Common installation locations -# Make variables available to parent scope -set(INTERNAL_KERNEL_LIB_PATH "" PARENT_SCOPE) # Empty - runtime detection -set(MINDSPORE_INTERNAL_KERNELS_LIB "" PARENT_SCOPE) -set(INTERNAL_KERNEL_INC_PATH "${INTERNAL_KERNEL_INC_PATH}" PARENT_SCOPE) +# Make variables available to current and parent scope +set(INTERNAL_KERNEL_LIB_PATH "${INTERNAL_KERNEL_LIB_PATH}") +set(MINDSPORE_INTERNAL_KERNELS_LIB "") +set(INTERNAL_KERNEL_INC_PATH "${INTERNAL_KERNEL_INC_PATH}") + +# Also set for parent scope if it exists +# Check if we have a parent scope before setting PARENT_SCOPE variables +get_property(PARENT_DIRECTORY DIRECTORY PROPERTY PARENT_DIRECTORY) +if(PARENT_DIRECTORY) + set(INTERNAL_KERNEL_LIB_PATH "${INTERNAL_KERNEL_LIB_PATH}" PARENT_SCOPE) + set(MINDSPORE_INTERNAL_KERNELS_LIB "" PARENT_SCOPE) + set(INTERNAL_KERNEL_INC_PATH "${INTERNAL_KERNEL_INC_PATH}" PARENT_SCOPE) +endif() message(STATUS "Dynamic loading configured - library will be found at runtime") -- Gitee From f0acdd073dc47f0c4e689d98a49f5d9f17b15f32 Mon Sep 17 00:00:00 2001 From: mengyuanli Date: Wed, 10 Sep 2025 11:33:34 +0800 Subject: [PATCH 3/3] add md compile success but adapter stuck when run !! find adapter.so --- .commit_id | 44 +- ccsrc/adapter/CMakeLists.txt | 7 + ccsrc/adapter/internal_adapter_factory.cc | 65 ++ ccsrc/adapter/internal_adapter_factory.h | 24 + .../ms_kernels_internal/adapter_loader.cc | 45 +- setup.py | 19 + ...43\345\206\263\346\226\271\346\241\210.md" | 644 ++++++++++++++++++ 7 files changed, 800 insertions(+), 48 deletions(-) create mode 100644 "\345\215\225\344\276\213\345\273\266\350\277\237\345\210\235\345\247\213\345\214\226\350\247\243\345\206\263\346\226\271\346\241\210.md" diff --git a/.commit_id b/.commit_id index 81ea18bf2..151231256 100644 --- a/.commit_id +++ b/.commit_id @@ -1,44 +1,6 @@ rt_soc -commit f79ddc28 +commit 4b494dca Author: mengyuanli -Date: Mon Sep 8 14:38:51 2025 +0800 +Date: Tue Sep 9 21:51:16 2025 +0800 - dlopen ms_internal_kernels(compile success) - - use multy load - - not link and find ms_kernels_internal - - remove internal_kernels - - operator api fix - - DynamicInternalOp - - fix internal::DeviceAddressPtr to OutputsAddrList - - op_adapter - - createfunc return internal::InternalOpPtr - - go back to DynamicInternalOp, not use internal::InternalOpPtr - - Tiling bug fix - - add log for find ms lib path - - fix import bug - - add log - - use CreateKernel not use CreateInternalOp - - call cpp api; mapping cpp & c api - - replace c wrapper to call internal::InternalOp - - use c wrapper ---and --- call cpp api - - load so when init@graph setup@pynative - - fix tiling and workspace + use factory fit diff --git a/ccsrc/adapter/CMakeLists.txt b/ccsrc/adapter/CMakeLists.txt index 82b5a6d91..2002acad4 100644 --- a/ccsrc/adapter/CMakeLists.txt +++ b/ccsrc/adapter/CMakeLists.txt @@ -50,5 +50,12 @@ if(DEFINED INTERNAL_KERNEL_LIB_PATH AND INTERNAL_KERNEL_LIB_PATH) message(STATUS "ADAPTER_LIB_DIRS: ${ADAPTER_LIB_DIRS}") else() + # The adapter layer requires compile-time linking to ms_kernels_internal library + # This is intentional - the adapter serves as an intermediate layer that: + # 1. Compile-time links to libms_kernels_internal.so + # 2. Provides C interface for dynamic loading by main library + # 3. Eliminates complex symbol resolution at runtime + # If INTERNAL_KERNEL_LIB_PATH is not available, the adapter cannot be built + # and the system will fall back to pure dynamic loading mode message(FATAL_ERROR "INTERNAL_KERNEL_LIB_PATH not defined - adapter layer will not be built") endif() \ No newline at end of file diff --git a/ccsrc/adapter/internal_adapter_factory.cc b/ccsrc/adapter/internal_adapter_factory.cc index a9035f54f..39c49ee08 100644 --- a/ccsrc/adapter/internal_adapter_factory.cc +++ b/ccsrc/adapter/internal_adapter_factory.cc @@ -264,6 +264,71 @@ int GetWorkspaceSizes(void* handle, size_t* sizes, int max_count) { } } +size_t GetTilingSize(void* handle) { + if (!handle) { + return 0; + } + + try { + auto op = ms_custom_ops::OpHandleManager::GetInstance().GetOp(handle); + if (!op) { + return 0; + } + + return op->GetTilingSize(); + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to get tiling size: " << e.what() << std::endl; + return 0; + } +} + +int Tiling(void* handle, void* host_addr, void** host_run_info_ptr) { + if (!handle || !host_addr || !host_run_info_ptr) { + return -1; + } + + try { + auto op = ms_custom_ops::OpHandleManager::GetInstance().GetOp(handle); + if (!op) { + return -1; + } + + mindspore::internal::HostRunInfoPtr* typed_ptr = + reinterpret_cast(host_run_info_ptr); + + auto status = op->Tiling(host_addr, typed_ptr); + return (status == mindspore::internal::kInternalOk) ? 0 : -1; + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to perform tiling: " << e.what() << std::endl; + return -1; + } +} + +int SetTilingInfo(void* handle, void* tiling_info) { + if (!handle || !tiling_info) { + return -1; + } + + try { + auto op = ms_custom_ops::OpHandleManager::GetInstance().GetOp(handle); + if (!op) { + return -1; + } + + // Convert void* back to shared_ptr + auto typed_tiling_info = + std::shared_ptr( + static_cast(tiling_info), + [](mindspore::internal::TilingInfo*){}); // Empty deleter since we don't own this + + op->SetTilingInfo(typed_tiling_info); + return 0; + } catch (const std::exception& e) { + std::cerr << "[Adapter] Failed to set tiling info: " << e.what() << std::endl; + return -1; + } +} + void ReleaseOp(void* handle) { if (handle) { ms_custom_ops::OpHandleManager::GetInstance().UnregisterOp(handle); diff --git a/ccsrc/adapter/internal_adapter_factory.h b/ccsrc/adapter/internal_adapter_factory.h index 3b4a60755..e252f37a0 100644 --- a/ccsrc/adapter/internal_adapter_factory.h +++ b/ccsrc/adapter/internal_adapter_factory.h @@ -170,6 +170,30 @@ int LaunchOp(void* handle, */ int GetWorkspaceSizes(void* handle, size_t* sizes, int max_count); +/** + * @brief Get tiling size + * @param handle Operation handle + * @return Tiling size + */ +size_t GetTilingSize(void* handle); + +/** + * @brief Perform tiling operation + * @param handle Operation handle + * @param host_addr Host memory address for tiling + * @param host_run_info_ptr Pointer to host run info pointer + * @return 0 on success, -1 on failure + */ +int Tiling(void* handle, void* host_addr, void** host_run_info_ptr); + +/** + * @brief Set tiling info + * @param handle Operation handle + * @param tiling_info Tiling info pointer + * @return 0 on success, -1 on failure + */ +int SetTilingInfo(void* handle, void* tiling_info); + /** * @brief Release operation * @param handle Operation handle diff --git a/ccsrc/base/ms_kernels_internal/adapter_loader.cc b/ccsrc/base/ms_kernels_internal/adapter_loader.cc index 56491bd26..e3e982913 100644 --- a/ccsrc/base/ms_kernels_internal/adapter_loader.cc +++ b/ccsrc/base/ms_kernels_internal/adapter_loader.cc @@ -57,12 +57,40 @@ bool AdapterLoader::Initialize() { } bool AdapterLoader::LoadAdapter() { - // Find adapter library - std::vector search_paths = { - "./ms_custom_ops_adapter.so", - "../build/ms_custom_ops_adapter.so", - std::string(std::getenv("HOME") ? std::getenv("HOME") : ".") + "/.local/lib/ms_custom_ops_adapter.so" - }; + std::vector search_paths; + + // Method 1: Use dladdr to get current library path (similar to MindSpore) + Dl_info dl_info; + if (dladdr(reinterpret_cast(&AdapterLoader::LoadAdapter), &dl_info) != 0) { + std::string cur_so_path = dl_info.dli_fname; + auto pos = cur_so_path.find_last_of('/'); + if (pos != std::string::npos) { + std::string so_dir = cur_so_path.substr(0, pos); + search_paths.push_back(so_dir + "/libms_custom_ops_adapter.so"); + MS_LOG(INFO) << "Current library path: " << cur_so_path << ", searching in: " << so_dir; + } + } + + // Method 2: Query Python for ms_custom_ops package location + std::string python_cmd = "python3 -c \"import ms_custom_ops; import os; print(os.path.dirname(ms_custom_ops.__file__))\" 2>/dev/null"; + FILE* pipe = popen(python_cmd.c_str(), "r"); + if (pipe) { + char buffer[256]; + if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + std::string package_path(buffer); + // Remove trailing newline + if (!package_path.empty() && package_path.back() == '\n') { + package_path.pop_back(); + } + search_paths.push_back(package_path + "/libms_custom_ops_adapter.so"); + MS_LOG(INFO) << "Python package path: " << package_path; + } + pclose(pipe); + } + + // Method 3: Fallback paths + search_paths.push_back("./libms_custom_ops_adapter.so"); + search_paths.push_back("../libms_custom_ops_adapter.so"); std::string adapter_path; for (const auto& path : search_paths) { @@ -74,7 +102,10 @@ bool AdapterLoader::LoadAdapter() { } if (adapter_path.empty()) { - MS_LOG(ERROR) << "Cannot find ms_custom_ops_adapter.so"; + MS_LOG(ERROR) << "Cannot find libms_custom_ops_adapter.so in any of the search paths:"; + for (const auto& path : search_paths) { + MS_LOG(ERROR) << " " << path; + } return false; } diff --git a/setup.py b/setup.py index ee986948a..fcfdeda00 100644 --- a/setup.py +++ b/setup.py @@ -220,6 +220,23 @@ class CustomBuildExt(build_ext): so_name = os.path.basename(dst_so_path) shutil.copy(src_so_path, os.path.join(package_path, so_name)) logger.info(f"Copied {so_name} to {dst_so_path}") + + # Copy the adapter library if it exists + adapter_so_name = "libms_custom_ops_adapter.so" + adapter_src_path = os.path.join(BUILD_OPS_DIR, "adapter", adapter_so_name) + if os.path.exists(adapter_src_path): + # Copy to package build directory + adapter_dst_path = os.path.join(package_path, adapter_so_name) + shutil.copy(adapter_src_path, adapter_dst_path) + logger.info(f"Copied adapter library {adapter_so_name} to {adapter_dst_path}") + + # Also copy to Python package source directory for proper packaging + python_adapter_dst_path = os.path.join(python_package_path, adapter_so_name) + shutil.copy(adapter_src_path, python_adapter_dst_path) + logger.info(f"Copied adapter library {adapter_so_name} to Python package: {python_adapter_dst_path}") + else: + logger.info(f"Adapter library not built (INTERNAL_KERNEL_LIB_PATH may not be available). " + f"This is normal for pure dynamic loading mode. Expected path: {adapter_src_path}") # Copy generated Python files to Python package directory auto_generate_dir = os.path.join(build_extension_dir, ext_name + "_auto_generate") @@ -253,6 +270,8 @@ package_data = { ".commit_id" ], "ms_custom_ops": [ + "*.so", + "libms_custom_ops_adapter.so", "gen_ops_def.py", "gen_ops_prim.py" ] diff --git "a/\345\215\225\344\276\213\345\273\266\350\277\237\345\210\235\345\247\213\345\214\226\350\247\243\345\206\263\346\226\271\346\241\210.md" "b/\345\215\225\344\276\213\345\273\266\350\277\237\345\210\235\345\247\213\345\214\226\350\247\243\345\206\263\346\226\271\346\241\210.md" new file mode 100644 index 000000000..0761a3c19 --- /dev/null +++ "b/\345\215\225\344\276\213\345\273\266\350\277\237\345\210\235\345\247\213\345\214\226\350\247\243\345\206\263\346\226\271\346\241\210.md" @@ -0,0 +1,644 @@ +# 单例延迟初始化解决方案 + +## 1. 问题概述 + +### 1.1 核心问题 +`import ms_custom_ops`时会过早初始化ms_kernels_internal中的单例对象,导致aclrt运行时环境提前初始化,造成: +- 过早资源占用和性能影响 +- 与用户代码初始化时机冲突 +- 难以控制资源生命周期 + +### 1.2 问题根源 +位于`akg/python/ms_custom_ops/__init__.py`第22行: +```python +ctypes.CDLL(internal_lib_path) # 直接加载so,触发静态初始化 +``` + +## 2. 核心需求 + +### 2.1 功能需求 +- 延迟初始化:仅在首次使用时初始化 +- 线程安全:支持多线程并发访问 +- API兼容:保持现有接口不变 + +### 2.2 性能需求 +- 首次调用延迟增加<5ms +- 导入时间减少≥50% +- 后续调用性能无影响 + +## 3. 关键技术冲突点 + +### 3.1 编译时vs运行时依赖的根本矛盾 + +**架构问题根因**: +```cmake +# 编译时强链接导致运行时自动加载 +ms_custom_ops.so → libmindspore_internal_kernels.so (DT_NEEDED) +``` + +**依赖链分析**: +``` +编译时: ms_custom_ops.so → libmindspore_internal_kernels.so +运行时: import ms_custom_ops → 系统加载ms_custom_ops.so → 自动加载依赖库 +结果: 无法通过Python层延迟加载控制编译时强依赖 +``` + +### 3.2 gen_ops_prim循环依赖问题 + +**问题现象**: 即使Python层完全避免导入`.so`文件,系统仍会报错缺少libmindspore_internal_kernels.so + +**循环依赖链**: +``` +import ms_custom_ops 导入 __init__.py + ↓ +__init__.py 尝试导入 gen_ops_prim + ↓ +gen_ops_prim.py: from . import ms_custom_ops + ↓ +系统动态链接器加载 ms_custom_ops.so + ↓ +自动解析编译时依赖: libmindspore_internal_kernels.so +``` + +### 3.3 MindSpore静态图编译约束 + +MindSpore的`ms.jit`对动态语句(try-except)有限制,编译器会分析函数实现,如果包含try-except等动态语句,编译失败,错误信息: "Unsupported statement 'Try'" + +## 4. 解决方案:C++层动态加载 + +### 4.1 技术架构设计 + +将编译时静态链接改为运行时动态加载: + +```cpp +class DynamicInternalLoader { +private: + void* lib_handle_ = nullptr; + bool loaded_ = false; + + typedef internal::InternalOpPtr (*CreateInternalOpFunc)(...); + CreateInternalOpFunc create_internal_op_func_ = nullptr; + +public: + bool LoadLibrary() { + if (loaded_) return true; + + lib_handle_ = dlopen("libmindspore_internal_kernels.so", RTLD_LAZY); + if (!lib_handle_) return false; + + create_internal_op_func_ = (CreateInternalOpFunc)dlsym(lib_handle_, "CreateInternalOp"); + if (!create_internal_op_func_) { + dlclose(lib_handle_); + return false; + } + + loaded_ = true; + return true; + } + + internal::InternalOpPtr CreateInternalOp(...) { + if (!loaded_ && !LoadLibrary()) { + throw std::runtime_error("Failed to load internal library"); + } + return create_internal_op_func_(...); + } +}; +``` + +### 4.2 实施步骤 + +**步骤1: 修改CMake配置** +```cmake +# 修改 akg/ccsrc/CMakeLists.txt +# 原来: ldflags='-L${INTERNAL_KERNEL_LIB_PATH} -l${LIBS}' +# 修改为: ldflags='-ldl' # 只链接动态加载库 +``` + +**步骤2: 创建动态加载包装器** +```cpp +// 新文件: akg/ccsrc/base/dynamic_internal_loader.h +class DynamicInternalLoader { + // 实现dlopen方式的符号解析 +}; +``` + +**步骤3: 重构基类和算子实现** +- 创建InternalKernelModDynamic基类 +- 移除对internal.h头文件的直接依赖 +- 使用动态加载包装器替代直接库调用 + +### 4.3 实施状态 + +#### 已完成组件 + +1. **DynamicInternalLoader** - 动态加载包装器 + - 单例模式确保全局唯一 + - 线程安全的库加载管理 + - dlopen/dlsym API封装 + - 完整的错误处理和状态报告 + +2. **InternalKernelModDynamic** - 动态基类 + - 替代原有InternalKernelMod + - 移除编译时头文件依赖 + - 集成动态加载逻辑 + - 保持API兼容性 + +3. **InternalFunctionsWrapper** - 函数包装器 + - 封装所有需要动态加载的函数 + - 统一的初始化、错误处理和状态管理 + +4. **CMake配置修改** + - 移除静态链接 `-l${LIBS}` + - 添加动态加载支持 `-ldl` + - 保留必要的头文件路径 + +#### 头文件依赖解决方案 + +通过创建独立类型定义文件替代头文件依赖: + +```cpp +namespace ms_custom_ops { +namespace internal_types { + using ShapeInfo = std::vector; + + enum class DataType : int32_t { + kTypeUnknown = 0, + kTypeBool = 1, + kTypeInt8 = 2, + }; + + enum class TensorFormat : int32_t { + kFormatND = 0, + kFormatNHWC = 1, + }; +} +} +``` + +### 4.4 遇到的问题和解决过程 + +#### 4.4.1 V1方案:set_exception_func_ is null 问题 + +**问题现象**: +``` +执行时先打印: "Failed to load from: xxx" (L75 DEBUG) +然后打印: "Successfully loaded library from: xxx" (L72 INFO) +但后续执行报错: "set_exception_func_ is null" +``` + +**根因分析**: +- `libms_kernels_internal.so` 确实加载成功 +- `CreateApplyRotaryPosEmbOp` 等函数符号存在且可调用 +- 问题在于架构设计缺陷 + +**架构设计缺陷发现**: +当前实现试图调用不存在的C风格包装器函数: +```cpp +// 尝试调用不存在的函数 +typedef int (*InitOpFunc)(void* op_handle); +auto init_func = loader.GetOpCreateFunction("InternalOpInit"); // 此函数不存在 +``` + +**实际库接口分析**: +```bash +# 实际库中的符号 (通过nm -D分析) +mindspore::internal::InternalOp::Init() ✓ 存在 +mindspore::internal::InternalOp::Launch(...) ✓ 存在 +mindspore::internal::CreateApplyRotaryPosEmbOp(...) ✓ 存在 + +# 不存在的包装器函数 +InternalOpInit ✗ 不存在 +InternalOpUpdateParam ✗ 不存在 +InternalOpLaunch ✗ 不存在 +``` + +#### 4.4.2 V2方案:接口签名和返回类型错误 + +**当前宏定义错误**: +```cpp +typedef void *(*FUNC_NAME##_Type)(...); // 返回void* +``` + +**实际函数签名**: +```cpp +// 通过c++filt解析得到的真实签名 +mindspore::internal::InternalOp* CreateApplyRotaryPosEmbOp( + std::vector const&, + std::vector const&, + mindspore::internal::ApplyRotaryPosEmbParam const&, + std::string const& +); // 返回 InternalOp*,不是void* +``` + +**修复方案**: +修正返回类型为智能指针: +```cpp +typedef std::shared_ptr (*FUNC_NAME##_Type)(...); +``` + +#### 4.4.3 V3方案:C++符号解析问题 + +**问题发现**: +在其他环境测试时发现 `Failed to load CreateApplyRotaryPosEmbOp function` 错误。 + +**根因分析**: +- 代码尝试通过C风格符号名 `"CreateApplyRotaryPosEmbOp"` 查找函数 +- 但实际库中只有C++ mangled符号:`_ZN9mindspore8internal25CreateApplyRotaryPosEmbOp...` +- 不同编译器的mangling规则不同 + +**解决方案**: +实现通用符号解析: +```cpp +std::string GetMangledSymbolName(const std::string& func_name) { + // 1. 尝试直接符号名查找 + if (loader.GetFunctionPointer(func_name)) return func_name; + + // 2. 运行时符号发现 + return loader.FindMangledSymbol(func_name); +} +``` + +#### 4.4.4 V4方案:构造函数参数不匹配 + +**编译错误**: +```cpp +// 编译错误: no matching function for call ms_custom_ops::DynamicInternalOp +return CALL_DYNAMIC_OP_INTERNAL(CreateMlaPreprocessOp, ...); +``` + +**根本原因**: +架构不一致 - 新宏返回智能指针但构造函数期望void*句柄: +```cpp +// 新宏设计 +auto op_ptr = create_func(...); // std::shared_ptr +result = std::make_shared(op_ptr); // 传递智能指针 + +// 旧类实现 +DynamicInternalOp(void* op_handle) : op_handle_(op_handle) {} // 期望void* +``` + +**修复方案**: +统一使用智能指针: +```cpp +class DynamicInternalOp { +public: + DynamicInternalOp(std::shared_ptr op_ptr) : op_ptr_(op_ptr) {} +private: + std::shared_ptr op_ptr_; +}; +``` + +#### 4.4.5 编译时符号依赖问题(最终难题) + +**致命问题发现**: +```python +from .ms_custom_ops import * +# 报错: mindspore::internal::InternalOp::Init() 符号缺失 +``` + +**根因确认**: +直接调用 `op_ptr_->Init()` 会产生编译时符号依赖: +```cpp +internal::InternalStatus Init() { + return op_ptr_->Init(); // 链接器在编译时寻找 InternalOp::Init 符号 +} +``` + +**后果分析**: +1. 编译时:`ms_custom_ops.so`必须能找到`InternalOp::Init()`符号 +2. 链接依赖:违背延迟加载目标,产生`DT_NEEDED`依赖 +3. 部署失败:`import ms_custom_ops`时立即报符号缺失 + +#### 4.4.6 最终方案:Mangled符号直接调用 + +**核心发现**: +既然内部库提供了实际的C++方法符号,可以运行时通过mangled符号名直接调用: + +**方案设计**: +```cpp +internal::InternalStatus Init() { + // 编译时:无任何InternalOp符号引用 + typedef internal::InternalStatus (*InitFunc)(void*); + std::string mangled_name = "_ZN9mindspore8internal10InternalOp4InitEv"; + auto init_func = loader.GetOpCreateFunction(mangled_name); + + // 运行时:直接调用真实的C++方法 + return init_func(op_ptr_.get()); +} +``` + +**关键符号映射**: +- `Init()`: `_ZN9mindspore8internal10InternalOp4InitEv` +- `UpdateParam()`: `_ZN9mindspore8internal10InternalOp11UpdateParamEPKv` +- `Launch()`: `_ZN9mindspore8internal10InternalOp6LaunchERKSt6vectorIPvSaIS3_EES7_S7_S3_RKSs` + +### 4.5 当前状态总结 + +经过多轮迭代,发现了延迟加载的根本挑战: +1. **C包装器不存在** - 内部库没有提供C风格的包装器函数 +2. **直接C++调用产生编译依赖** - 违背了延迟加载的核心原则 +3. **Mangled符号方案理论可行** - 但需要处理编译器兼容性问题 + +**当前实施文件清单**: +``` +akg/ccsrc/base/ +├── dynamic_internal_loader.{h,cc} # 动态加载包装器 +└── ms_kernels_internal/ + ├── internal_functions_wrapper.{h,cc} # 动态加载包装器 + ├── internal_types.h # 独立类型定义 + ├── internal_helper.{h,cc} # 无头文件依赖版本 + ├── dynamic_op_helper.{h,cc} # 最终方案实现 + └── graphmode/ + └── internal_kernel_mod_dynamic.{h,cc} # 完整业务逻辑 +``` + +## 5. 验收标准 + +### 5.1 构建验证 +- readelf -d ms_custom_ops.so 中不再出现对 libmindspore_internal_kernels.so 的 DT_NEEDED + +### 5.2 功能验收 +- `import ms_custom_ops`不触发aclrt初始化 +- 首次算子调用时正常初始化 +- 所有现有功能测试通过 + +### 5.3 性能验收 +- 导入时间减少≥50% +- 首次调用延迟增加<5ms +- 后续调用性能无影响 + +### 5.4 稳定性验收 +- 并发场景下仅初始化一次 +- 异常路径错误处理完善 +- 多次 Init/Shutdown 循环正常 + +## 6. 部署建议 + +### 6.1 编译验证 +```bash +cd /home/lmy/custom_op/akg +bash build.sh # 使用修改后的CMakeLists.txt构建 +``` + +### 6.2 功能测试 +```python +# 测试动态加载是否工作 +import ms_custom_ops +status = ms_custom_ops._get_load_status() +print(f"库是否在导入时加载: {status['loaded']}") # 应该为False + +# 测试首次使用是否触发加载 +result = ms_custom_ops.type_cast_dynamic(tensor, dtype) +print("首次调用成功") +``` + +### 6.3 回滚方案 +如需回滚到原始实现: +```bash +cd /home/lmy/custom_op/akg/ccsrc +mv CMakeLists.txt.backup CMakeLists.txt +# 然后重新构建 +``` + +## 7. MindSpore官方集成方案参考 + +### 7.1 MindSpore的双重策略架构 + +通过分析MindSpore源码发现,官方采用了一个**编译时链接 + 运行时动态加载**的混合策略: + +#### 7.1.1 编译时动态链接 +```cmake +# mindspore/mindspore/ops/kernel/ascend/internal/CMakeLists.txt +if(DEFINED ENV{MS_INTERNAL_KERNEL_HOME}) + include_directories($ENV{MS_INTERNAL_KERNEL_HOME}/internal_kernel) + link_directories($ENV{MS_INTERNAL_KERNEL_HOME}/internal_kernel/lib) + + add_library(mindspore_internal_kernels SHARED ${INTERNAL_SRC_LIST}) + target_link_libraries(mindspore_internal_kernels PRIVATE mindspore_ascend ms_kernels_internal) +endif() +``` + +#### 7.1.2 运行时动态加载 +```cpp +// mindspore/mindspore/ops/kernel/ascend/internal/internal_kernel_build.cc +std::shared_ptr GetKernelPLugin() { + // 动态加载已经集成的so文件 + auto targe_so_path = cur_so_path + "/ascend/" + "libmindspore_internal_kernels.so"; + ret = plugin_loader::PluginLoader::LoadDynamicLib(targe_so_path, &plugin_maps, &dlopen_error_msg); + + // 创建插件对象 + k_internal_kernel_plugin_ptr = Factory::Instance().Create("InternalKernelPlugin"); +} +``` + +### 7.2 架构对比分析 + +#### MindSpore的分层架构 +``` +MindSpore Core + ↓ (dlopen) +libmindspore_internal_kernels.so ← 插件层(编译时链接了内部库) + ↓ (编译时链接) +libms_kernels_internal.so ← 内核实现层 +``` + +#### 我们的纯动态加载架构 +``` +ms_custom_ops + ↓ (dlopen + dlsym) +libms_kernels_internal.so ← 直接动态加载内核层 +``` + +### 7.3 MindSpore方案的优势 + +1. **避免符号解析问题**: `libmindspore_internal_kernels.so`已包含所有符号,无需运行时mangled符号查找 +2. **简化调用链**: 不需要复杂的函数指针转换和动态符号解析 +3. **保持类型安全**: 编译时确定所有函数签名,避免void*转换 +4. **单次加载**: 只需一次dlopen,避免多次符号查找开销 +5. **模块化**: MindSpore核心仍与内部库解耦,支持可选加载 + +### 7.4 对我们方案的启示 + +考虑采用类似的**两层架构**优化: + +#### 优化方案设计 +1. **创建中间层**: 构建 `libms_custom_ops_internal.so` +2. **动态链接**: 该中间层编译时动态链接 `libms_kernels_internal.so` +3. **延迟加载**: Python层延迟加载中间层,而非直接加载内部库 + +#### 潜在收益 +- 消除复杂的mangled符号解析 +- 避免函数指针类型转换错误 +- 利用编译时符号解析和类型检查 +- 保持延迟加载的模块化优势 +- 通过RPATH确保依赖库的正确加载 + +#### 实施考虑 +- 需要额外的构建步骤和中间层维护 +- 增加了部署复杂度 +- 但显著降低了运行时的技术风险 + +这个参考为我们提供了一个经过验证的、更稳定的集成模式选择。 + +## 8. MindSpore插件架构深度分析 + +### 8.1 符号映射机制的重要发现 + +通过深入分析MindSpore源码发现,**MindSpore插件架构并没有复杂的符号映射机制**,而是采用了一个**工厂注册模式**。 + +#### 8.1.1 核心发现 + +**传统认知误区**: +```cpp +// 错误假设:需要复杂的符号解析 +dlopen("libmindspore_internal_kernels.so"); +dlsym(handle, "_ZN9mindspore8internal25CreateApplyRotaryPosEmbOp..."); // 不需要! +``` + +**实际MindSpore机制**: +```cpp +// 实际机制:简单的工厂模式 +dlopen("libmindspore_internal_kernels.so"); // 加载时自动注册 +Factory::Instance().Create("InternalKernelPlugin"); // 直接创建 +``` + +#### 8.1.2 工厂注册流程 + +```cpp +// mindspore/mindspore/ops/kernel/ascend/internal/internal_kernel_build.cc:53 +k_internal_kernel_plugin_ptr = Factory::Instance().Create("InternalKernelPlugin"); +``` + +**关键优势**: +1. **无符号解析**: 不需要dlsym查找mangled符号 +2. **自动注册**: 插件库加载时静态初始化代码自动注册到工厂 +3. **字符串查找**: 通过简单的字符串名称创建对象 +4. **类型安全**: 编译时确定所有类型,无需void*转换 + +#### 8.1.3 工厂模式核心代码 + +```cpp +// mindspore/mindspore/ccsrc/include/runtime/hardware_abstract/kernel_base/ms_factory.h +template +class Factory : public FactoryBase { +public: + static Factory &Instance() { + std::string key = typeid(C).name(); + FactoryBase *instance = FactoryBase::GetInstance(key); + if (instance == nullptr) { + FactoryBase::CreateFactory(key, std::make_unique>()); + instance = FactoryBase::GetInstance(key); + } + return *static_cast *>(instance); + } + + C *Create(const std::string &name) { + auto iter = kernel_mod_creators_.find(name); + if (iter != kernel_mod_creators_.end()) { + return iter->second(); + } + return nullptr; + } +}; +``` + +### 8.2 对我们方案的启示 + +#### 8.2.1 简化设计策略 + +基于MindSpore的工厂模式,我们可以设计**极简适配器**: + +```cpp +namespace ms_custom_ops_adapter { + +// 工厂注册接口 +class InternalOpFactory { +public: + static InternalOpFactory& GetInstance(); + + // 注册操作创建函数 + void RegisterOpCreator(const std::string& name, + std::function creator); + + // 创建操作实例 + void* CreateOp(const std::string& name); +}; + +// 适配器接口(替代复杂的符号映射) +extern "C" { + // 初始化适配器(注册所有操作) + void InitInternalAdapter(); + + // 简单工厂接口 + void* CreateInternalOp(const char* op_name); + int CallOpMethod(void* handle, const char* method_name, void* args); + void ReleaseOp(void* handle); +} + +} // namespace ms_custom_ops_adapter +``` + +#### 8.2.2 实施优势 + +**技术简化**: +- ❌ 移除复杂的mangled符号处理 +- ❌ 移除函数指针类型转换 +- ❌ 移除编译器兼容性问题 +- ✅ 使用简单的字符串名字查找 +- ✅ 使用标准工厂模式 +- ✅ 使用自动注册机制 + +**架构对比**: +``` +原V4方案(复杂): +ms_custom_ops → dlsym("_ZN...") → 类型转换 → 调用 + +新工厂方案(简单): +ms_custom_ops → dlopen(adapter) → Factory.Create("OpName") → 调用 +``` + +### 8.3 最终推荐方案 + +#### 8.3.1 三层架构设计 + +``` +ms_custom_ops (主库) + ↓ dlopen延迟加载 +ms_custom_ops_adapter.so (适配器层) + ↓ 编译时链接 +libms_kernels_internal.so (内核层) +``` + +#### 8.3.2 核心实现 + +**适配器层**: +```cpp +// 编译时链接内核库,提供工厂接口 +class AdapterFactory { + void RegisterInternalOps(); // 静态初始化时注册所有操作 +public: + std::shared_ptr CreateOp(const std::string& name, ...); +}; +``` + +**主库层**: +```cpp +// 运行时动态加载适配器,调用工厂接口 +class LazyInternalLoader { + void* adapter_handle_ = nullptr; + CreateOpFunc create_op_func_ = nullptr; + +public: + bool LoadAdapter(); // 延迟加载适配器 + DynamicInternalOpPtr CreateOp(const std::string& name, ...); +}; +``` + +### 8.4 方案收益总结 + +1. **技术风险极低**: 无复杂符号处理,使用成熟的工厂模式 +2. **维护成本可控**: 新增操作只需在适配器层注册 +3. **性能表现优异**: 单次dlopen,直接工厂调用 +4. **延迟加载完整**: 完全避免导入时初始化 + +这个基于MindSpore工厂模式的方案,提供了一个**技术简单、风险可控、性能优异**的延迟初始化解决路径。 \ No newline at end of file -- Gitee