From 4dea4cbce6a383176a492d99cd82a27b61540d28 Mon Sep 17 00:00:00 2001 From: chenmingkai Date: Tue, 15 Oct 2024 11:32:06 +0800 Subject: [PATCH] Optimize build pipeline --- .gitignore | 4 +- CMakeLists.txt | 223 +----------------- CMakePresets.json | 4 +- MANIFEST.in | 1 - bind/CMakeLists.txt | 5 + bind/pybind.cpp | 1 - ci/build.sh | 38 +-- cmake/config.cmake | 6 +- cmake/func.cmake | 136 +++++------ cmake/stage_0.cmake | 11 + cmake/stage_1.cmake | 212 +++++++++++++++++ cmake/stage_2.cmake | 48 ++++ cmake/util/ascendc_bin_param_build.py | 2 + cmake/util/const_var.py | 1 + docs/api/README.md | 4 +- include/csrc/pybind.h | 2 +- .../bevformer/dense_heads/panoseg_occ_head.py | 8 +- mx_driving/__init__.py | 4 +- mx_driving/common/CMakeLists.txt | 7 + mx_driving/common/ops/assign_score_withk.py | 11 +- mx_driving/common/ops/csrc/CMakeLists.txt | 5 + mx_driving/common/ops/hypot.py | 14 +- mx_driving/common/ops/knn.py | 6 +- mx_driving/common/ops/npu_hypot.py | 18 ++ .../common/ops/npu_scatter_mean_grad.py | 4 +- mx_driving/common/ops/scatter_max.py | 6 +- mx_driving/common/ops/scatter_mean.py | 4 +- mx_driving/common/ops/sort_pairs.py | 6 +- mx_driving/common/ops/threeNN.py | 6 +- mx_driving/common/ops/three_interpolate.py | 6 +- mx_driving/detection/CMakeLists.txt | 6 +- mx_driving/detection/ops/border_align.py | 35 ++- mx_driving/detection/ops/box_iou.py | 21 +- mx_driving/detection/ops/boxes_overlap_bev.py | 5 +- mx_driving/detection/ops/csrc/CMakeLists.txt | 5 + mx_driving/detection/ops/nms3d_normal.py | 4 +- mx_driving/detection/ops/npu_nms3d.py | 4 +- mx_driving/detection/ops/roi_align_rotated.py | 6 +- mx_driving/detection/ops/rotated_iou.py | 4 +- mx_driving/detection/ops/rotated_overlaps.py | 4 +- mx_driving/fused/CMakeLists.txt | 6 +- mx_driving/fused/ops/csrc/CMakeLists.txt | 5 + mx_driving/fused/ops/deform_conv2d.py | 6 +- mx_driving/fused/ops/fused_bias_leaky_relu.py | 4 +- .../fused/ops/modulated_deform_conv2d.py | 6 +- mx_driving/fused/ops/npu_add_relu.py | 6 +- .../fused/ops/npu_deformable_aggregation.py | 6 +- mx_driving/fused/ops/npu_max_pool2d.py | 4 +- ...pu_multi_scale_deformable_attn_function.py | 6 +- mx_driving/point/CMakeLists.txt | 8 + mx_driving/point/ops/bev_pool.py | 7 +- mx_driving/point/ops/bev_pool_v2.py | 41 ++-- mx_driving/point/ops/csrc/CMakeLists.txt | 5 + .../point/ops/furthest_point_sampling.py | 4 +- .../ops/furthest_point_sampling_with_dist.py | 4 +- mx_driving/point/ops/group_points.py | 6 +- mx_driving/point/ops/npu_dynamic_scatter.py | 12 +- mx_driving/point/ops/voxel_pooling_train.py | 6 +- mx_driving/point/ops/voxelization.py | 6 +- mx_driving/preprocess/CMakeLists.txt | 6 +- mx_driving/preprocess/ops/csrc/CMakeLists.txt | 5 + .../preprocess/ops/npu_points_in_box.py | 4 +- .../preprocess/ops/npu_points_in_box_all.py | 4 +- .../preprocess/ops/npu_roipoint_pool3d.py | 4 +- mx_driving/spconv/CMakeLists.txt | 8 + mx_driving/spconv/ops/csrc/CMakeLists.txt | 5 + mx_driving/spconv/ops/sparse_functional.py | 20 +- setup.py | 213 ++++++++++++++--- tests/torch/test_bev_pool_v2.py | 32 ++- tests/torch/test_furthest_point_sampling.py | 3 +- tests/torch/test_group_points_grad.py | 5 +- tests/torch/test_hard_voxelize.py | 4 +- tests/torch/test_npu_dynamic_scatter.py | 8 +- tests/torch/test_point_to_voxel.py | 4 +- tests/torch/test_unique_voxel.py | 10 +- tests/torch/test_vec_pool_backward.py | 4 +- tests/torch/test_voxel_pooling_train.py | 64 +++-- tests/torch/test_voxel_to_point.py | 4 +- utils/extension.py | 85 ------- 79 files changed, 869 insertions(+), 668 deletions(-) delete mode 100644 MANIFEST.in create mode 100644 bind/CMakeLists.txt create mode 100644 cmake/stage_0.cmake create mode 100644 cmake/stage_1.cmake create mode 100644 cmake/stage_2.cmake create mode 100644 mx_driving/common/ops/csrc/CMakeLists.txt create mode 100644 mx_driving/common/ops/npu_hypot.py create mode 100644 mx_driving/detection/ops/csrc/CMakeLists.txt create mode 100644 mx_driving/fused/ops/csrc/CMakeLists.txt create mode 100644 mx_driving/point/ops/csrc/CMakeLists.txt create mode 100644 mx_driving/preprocess/ops/csrc/CMakeLists.txt create mode 100644 mx_driving/spconv/ops/csrc/CMakeLists.txt delete mode 100644 utils/extension.py diff --git a/.gitignore b/.gitignore index 414718b3..35e70a2c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ __pycache__/ .DS_Store .idea -cmake-build-debug \ No newline at end of file +cmake-build-debug +build +*.egg-info/ diff --git a/CMakeLists.txt b/CMakeLists.txt index a7ee7922..f58f410e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,11 @@ -cmake_minimum_required(VERSION 3.16.0) -project(opp) -set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) +cmake_minimum_required(VERSION 3.19.0) +project(mx_driving) include(cmake/config.cmake) include(cmake/func.cmake) include(cmake/intf.cmake) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/bind) set(MX_DRIVING_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mx_driving) add_subdirectory(${MX_DRIVING_DIR}/common) add_subdirectory(${MX_DRIVING_DIR}/preprocess) @@ -14,215 +14,10 @@ add_subdirectory(${MX_DRIVING_DIR}/point) add_subdirectory(${MX_DRIVING_DIR}/detection) add_subdirectory(${MX_DRIVING_DIR}/spconv) -opbuild(OPS_SRC ${ASCEND_HOST_SRC} OUT_DIR ${ASCEND_AUTOGEN_PATH}) - -add_library(cust_op_proto SHARED ${ASCEND_HOST_SRC} - ${ASCEND_AUTOGEN_PATH}/op_proto.cc) -target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) -target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden) -target_link_libraries( - cust_op_proto - PRIVATE intf_pub - exe_graph - register - tiling_api - -Wl,--whole-archive - rt2_registry - -Wl,--no-whole-archive) -set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME cust_opsproto_rt2.0) -install_target( - TRG cust_op_proto DST - packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) -install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST - packages/vendors/${vendor_name}/op_proto/inc) - -add_library(cust_optiling SHARED ${ASCEND_HOST_SRC}) -target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) -target_compile_options(cust_optiling PRIVATE -fvisibility=hidden) -target_link_libraries( - cust_optiling - PRIVATE intf_pub - exe_graph - register - tiling_api - -Wl,--whole-archive - rt2_registry - -Wl,--no-whole-archive) -set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME cust_opmaster_rt2.0) -install_target( - TRG - cust_optiling - DST - packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR} -) -# create liboptiling.so link -add_custom_command( - TARGET cust_optiling - POST_BUILD - COMMAND - ${CMAKE_COMMAND} -E chdir - ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling - ${CMAKE_COMMAND} -E create_symlink - lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ - liboptiling.so) -install( - FILES - ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so - DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) - -if(${ENABLE_ONNX}) - if(CANN_PATHS) - if(${ARCH} STREQUAL "aarch64") - protobuf_generate( - PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto - OUT_DIR ${ASCEND_AUTOGEN_PATH}) - else() - protobuf_generate( - PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto - OUT_DIR ${ASCEND_AUTOGEN_PATH}) - endif() - else() - protobuf_generate( - PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto - OUT_DIR ${ASCEND_AUTOGEN_PATH}) - endif() - - add_library(cust_onnx_parsers SHARED ${ASCEND_ONNX_SRC}) - target_compile_options( - cust_onnx_parsers - PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private - "-fno-common" "-fno-strict-aliasing") - target_link_libraries(cust_onnx_parsers PRIVATE intf_pub) - target_include_directories( - cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include - ${ASCEND_AUTOGEN_PATH}) - - install_target(TRG cust_onnx_parsers DST - packages/vendors/${vendor_name}/framework/onnx/) -endif() - -# ===================Build ACLNN=================== -file(GLOB ACLNN_SRC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) -file(GLOB ACLNN_INC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) -set(ACLNN_SRC ${ACLNN_SRC_GEN} ${ACLNN_SRC_CUSTOM}) -set(ACLNN_INC ${ACLNN_INC_GEN} ${ACLNN_INC_CUSTOM}) -add_library(cust_opapi SHARED ${ACLNN_SRC}) -target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi) -install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib) -install_file(TRG cust_opapi SRC ${ACLNN_INC} DST - packages/vendors/${vendor_name}/op_api/include) - -# ===================Build Kernel=================== -# set custom compile options -if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") - add_ops_compile_options(ALL OPTIONS -g -O0) -endif() - -file(COPY ${ASCEND_KERNEL_SRC} DESTINATION ${ASCEND_KERNEL_PATH}) - -foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) - if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini) - # generate aic-${compute_unit}-ops-info.json - add_ops_info_target( - TARGET - ops_info_gen_${compute_unit} - OUTPUT - ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json - OPS_INFO - ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini - INSTALL_DIR - packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} - ) - - # generate ascendc impl py once - if(NOT TARGET ascendc_impl_gen) - add_ops_impl_target( - TARGET - ascendc_impl_gen - OPS_INFO - ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini - IMPL_DIR - ${ASCEND_KERNEL_PATH} - OUT_DIR - ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl - ) - install_file( - TRG - ascendc_impl_gen - SRC - ${ASCEND_KERNEL_SRC} - DST - packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic - ) - endif() - - # dynamic shape binary compile - if(${ENABLE_BINARY_PACKAGE}) - add_bin_compile_target( - TARGET - ascendc_bin_${compute_unit} - OPS_INFO - ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini - IMPL_DIR - ${ASCEND_KERNEL_PATH} - ADP_DIR - ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic - OUT_DIR - ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} - KERNEL_DIR - ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel - INSTALL_DIR - packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel - COMPUTE_UNIT - ${compute_unit}) - add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen cust_optiling) - endif() - endif() -endforeach() - -# generate npu_supported_ops.json -add_npu_support_target( - TARGET - npu_supported_ops - OPS_INFO_DIR - ${ASCEND_AUTOGEN_PATH} - OUT_DIR - ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core - INSTALL_DIR - packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE}) - -# ===================Build test=================== -# WARN: WIP -if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) - add_subdirectory(testcases) -endif() - -get_system_info(SYSTEM_INFO) - -# gen version.info -add_custom_target( - gen_version_info ALL - COMMAND - bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh - ${ASCEND_CANN_PACKAGE_PATH} ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}) - -install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info - DESTINATION packages/vendors/${vendor_name}) - -if(COMPILE_OPP_PACKAGE) - # CPack config - set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) - set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) - set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") - set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") - set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) - set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") - set(CPACK_GENERATOR External) - set(CPACK_CMAKE_GENERATOR "Unix Makefiles") - set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) - set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) - set(CPACK_EXTERNAL_BUILT_PACKAGES - ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME} - ) - include(CPack) +if(BUILD_STAGE EQUAL 0) + include(cmake/stage_0.cmake) +elseif(BUILD_STAGE EQUAL 1) + include(cmake/stage_1.cmake) +elseif(BUILD_STAGE EQUAL 2) + include(cmake/stage_2.cmake) endif() diff --git a/CMakePresets.json b/CMakePresets.json index dd0b3d58..abe6215a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -11,7 +11,7 @@ "displayName": "Default Config", "description": "Default build using Unix Makefiles generator", "generator": "Unix Makefiles", - "binaryDir": "${sourceDir}/build_out", + "binaryDir": "${sourceDir}/build", "cacheVariables": { "CMAKE_BUILD_TYPE": { "type": "STRING", @@ -43,7 +43,7 @@ }, "CMAKE_INSTALL_PREFIX": { "type": "PATH", - "value": "${sourceDir}/build_out" + "value": "${sourceDir}/build" }, "ENABLE_ONNX": { "type": "BOOL", diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 3450ea38..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include mx_driving/packages/ * diff --git a/bind/CMakeLists.txt b/bind/CMakeLists.txt new file mode 100644 index 00000000..4a75d495 --- /dev/null +++ b/bind/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_CSRC_SRC + ${ASCEND_CSRC_SRC} ${CSRC_SRC} + CACHE INTERNAL "") diff --git a/bind/pybind.cpp b/bind/pybind.cpp index b12f0644..a227bee8 100644 --- a/bind/pybind.cpp +++ b/bind/pybind.cpp @@ -1,5 +1,4 @@ #include "csrc/pybind.h" - #include #include diff --git a/ci/build.sh b/ci/build.sh index 2dc41f8a..f0d8254e 100644 --- a/ci/build.sh +++ b/ci/build.sh @@ -15,8 +15,7 @@ function check_python_version() { return 0 fi done - if [ "${matched_py_version}" = 'false' ]; then - echo "${PY_VERSION} is an unsupported python version, we suggest ${SUPPORTED_PY_VERSION[*]}" + if [ "${matched_py_version}" = 'false' ]; then echo "${PY_VERSION} is an unsupported python version, we suggest ${SUPPORTED_PY_VERSION[*]}" exit 1 fi } @@ -67,40 +66,7 @@ function main() export BUILD_PYTHON_VERSION=${PY_VERSION} rm -rf ${BUILD_PACKAGES_DIR} - if [ "x${SINGLE_OP}" != "x" ]; then - if [ -z "$ASCEND_CUSTOM_OPP_PATH" ]; then - echo "ASCEND_CUSTOM_OPP_PATH is not set. Please set the path of the custom op kernel code." - exit 1 - fi - bash ${SCRIPTS_DIR}/build_kernel.sh --single_op=${SINGLE_OP} --build_type=${BUILD_TYPE} - - if [ $? != 0 ]; then - echo "Failed to compile the wheel file. Please check the source code by yourself." - exit 1 - fi - - echo "Successfully compiled the single op: ${SINGLE_OP}" - echo "copying the custom op kernel code to the custom opp path: ${ASCEND_CUSTOM_OPP_PATH}" - cp -ruf ${BUILD_PACKAGES_DIR}/vendors/customize/op_impl/ai_core/tbe/kernel/* ${ASCEND_CUSTOM_OPP_PATH}/op_impl/ai_core/tbe/kernel/ - exit 0 - else - bash ${SCRIPTS_DIR}/build_kernel.sh --build_type=${BUILD_TYPE} - fi - - if [ $? != 0 ]; then - echo "Failed to compile the wheel file. Please check the source code by yourself." - exit 1 - fi - cd ${CUR_DIR}/.. - rm -rf build - if [ -d "mx_driving.egg-info" ]; then - echo "mx_driving.egg-info exist" - rm -rf mx_driving.egg-info - else - echo "mx_driving.egg-info not exist" - fi - - python"${PY_VERSION}" setup.py build bdist_wheel + python"${PY_VERSION}" setup.py bdist_wheel if [ $? != 0 ]; then echo "Failed to compile the wheel file. Please check the source code by yourself." exit 1 diff --git a/cmake/config.cmake b/cmake/config.cmake index d4fe96a2..eef45176 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -29,7 +29,7 @@ if(EXISTS ${ASCEND_PATH}/latest/compiler) endif() if("${CANN_PATHS}x" STREQUAL "x") - # read vertion from `latest/version.cfg` + # read version from `latest/version.cfg` file(READ "${ASCEND_PATH}/latest/version.cfg" ASCEND_VERSION_CFG) string(REGEX MATCH "(CANN-[0-9]\.[0-9]+)\]\n$" _ ${ASCEND_VERSION_CFG}) message(STATUS "ASCEND_VERSION: ${CMAKE_MATCH_1}") @@ -59,7 +59,9 @@ set(ASCEND_TENSOR_COMPILER_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler) set(ASCEND_CCEC_COMPILER_PATH ${ASCEND_TENSOR_COMPILER_PATH}/ccec_compiler/bin) set(ASCEND_AUTOGEN_PATH ${CMAKE_BINARY_DIR}/autogen) set(ASCEND_KERNEL_PATH ${CMAKE_BINARY_DIR}/kernels) -set(MX_DRIVING_PATH ${PROJECT_SOURCE_DIR}/mx_driving) +set(ASCEND_CSRC_SRC + "" + CACHE STRING "csrc source files") set(ASCEND_HOST_SRC "" CACHE STRING "host source files") diff --git a/cmake/func.cmake b/cmake/func.cmake index 3f532ae4..9acc3038 100644 --- a/cmake/func.cmake +++ b/cmake/func.cmake @@ -1,9 +1,8 @@ function(install_target) cmake_parse_arguments(INSTALL_TARGET "" "DST;TRG" "" ${ARGN}) set_target_properties( - ${INSTALL_TARGET_TRG} - PROPERTIES LIBRARY_OUTPUT_DIRECTORY - ${MX_DRIVING_PATH}/${INSTALL_TARGET_DST}) + ${INSTALL_TARGET_TRG} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${MX_DRIVING_PATH}/${INSTALL_TARGET_DST}) install(TARGETS ${INSTALL_TARGET_TRG} LIBRARY DESTINATION ${INSTALL_TARGET_DST}) endfunction() @@ -41,33 +40,31 @@ function(opbuild) "OPS_SRC" ${ARGN}) set(CANN_INCLUDE_PATH "") set(CANN_LIB_PATH "") - ## if the CANN_PATHS not empty + # if the CANN_PATHS not empty if(CANN_PATHS) - ## if the arch is aarch64, add the include path + # if the arch is aarch64, add the include path if(${ARCH} STREQUAL "aarch64") set(CANN_INCLUDE_PATH ${CANN_PATHS}/aarch64-linux/include) set(CANN_LIB_PATH ${CANN_PATHS}/aarch64-linux/lib64) - else () + else() set(CANN_INCLUDE_PATH ${CANN_PATHS}/x86_64-linux/include) set(CANN_LIB_PATH ${CANN_PATHS}/x86_64-linux/lib64) endif() endif() if(NOT EXISTS ${CANN_INCLUDE_PATH}) - message(FATAL_ERROR "CANN include path not found: ${CANN_PATHS}") + message(FATAL_ERROR "CANN include path not found: ${CANN_PATHS}") endif() - if(NOT EXISTS ${CANN_LIB_PATH}) - message(FATAL_ERROR "CANN lib path not found: ${CANN_PATHS}") + if(NOT EXISTS ${CANN_LIB_PATH}) + message(FATAL_ERROR "CANN lib path not found: ${CANN_PATHS}") endif() message(STATUS "CANN include path: ${CANN_INCLUDE_PATH}") message(STATUS "CANN lib path: ${CANN_LIB_PATH}") # filter single op - if (NOT "${SINGLE_OP}x" STREQUAL "x") - list(FILTER OPBUILD_OPS_SRC INCLUDE REGEX ${SINGLE_OP}) - endif() execute_process( COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++11 ${OPBUILD_OPS_SRC} - -D_GLIBCXX_USE_CXX11_ABI=0 -I ${CANN_INCLUDE_PATH} -L ${CANN_LIB_PATH} -lexe_graph -lregister -ltiling_api -o + -D_GLIBCXX_USE_CXX11_ABI=0 -I ${CANN_INCLUDE_PATH} -L ${CANN_LIB_PATH} + -lexe_graph -lregister -ltiling_api -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so RESULT_VARIABLE EXEC_RESULT OUTPUT_VARIABLE EXEC_INFO @@ -180,61 +177,68 @@ function(add_bin_compile_target) endif() add_custom_target(${BINCMP_TARGET} COMMAND cp -r ${BINCMP_IMPL_DIR}/*.* ${BINCMP_OUT_DIR}/src) - add_custom_target( - ${BINCMP_TARGET}_gen_ops_config ALL - COMMAND - ${ASCEND_PYTHON_EXECUTABLE} - ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py -p - ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT} - COMMAND - ${ASCEND_PYTHON_EXECUTABLE} - ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py -p - ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT} -s ${BINCMP_COMPUTE_UNIT}) - file(GLOB bin_scripts ${BINCMP_OUT_DIR}/gen/*.sh) - foreach(bin_script ${bin_scripts}) - get_filename_component(bin_file ${bin_script} NAME_WE) - string(REPLACE "-" ";" bin_sep ${bin_file}) - list(GET bin_sep 0 op_type) - list(GET bin_sep 1 op_file) - list(GET bin_sep 2 op_index) - if(NOT TARGET ${BINCMP_TARGET}_${op_file}_copy) - add_custom_target( - ${BINCMP_TARGET}_${op_file}_copy - COMMAND cp ${BINCMP_ADP_DIR}/${op_file}.py - ${BINCMP_OUT_DIR}/src/${op_type}.py - DEPENDS ascendc_impl_gen) - install( - DIRECTORY ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file} - DESTINATION ${BINCMP_INSTALL_DIR}/${BINCMP_COMPUTE_UNIT} - OPTIONAL) - install( - FILES ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/${op_file}.json - DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT} - OPTIONAL) - endif() + + file(GLOB bin_scripts ${BINCMP_OUT_DIR}/gen/*${KERNEL_NAME}*.sh) + # if bin_scripts not empty + if(bin_scripts) add_custom_target( - ${BINCMP_TARGET}_${op_file}_${op_index} + ${BINCMP_TARGET}_gen_ops_config ALL COMMAND - export HI_PYTHON=${ASCEND_PYTHON_EXECUTABLE} && export - ASCEND_CUSTOM_OPP_PATH=${MX_DRIVING_PATH}/packages/vendors/${vendor_name} - && bash ${CMAKE_SOURCE_DIR}/scripts/retry.sh \"bash ${bin_script} ${BINCMP_OUT_DIR}/src/${op_type}.py -${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file}\" - WORKING_DIRECTORY ${BINCMP_OUT_DIR}) - add_dependencies(${BINCMP_TARGET}_${op_file}_${op_index} ${BINCMP_TARGET} - ${BINCMP_TARGET}_${op_file}_copy) - add_dependencies(${BINCMP_TARGET}_gen_ops_config - ${BINCMP_TARGET}_${op_file}_${op_index}) - endforeach() - add_custom_command( - TARGET ${BINCMP_TARGET}_gen_ops_config - POST_BUILD - COMMAND mv ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/*.json - ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}) - install( - FILES - ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/binary_info_config.json - DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT} - OPTIONAL) + ${ASCEND_PYTHON_EXECUTABLE} + ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py -p + ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT} + COMMAND + ${ASCEND_PYTHON_EXECUTABLE} + ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py -p + ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT} -s ${BINCMP_COMPUTE_UNIT}) + + foreach(bin_script ${bin_scripts}) + get_filename_component(bin_file ${bin_script} NAME_WE) + string(REPLACE "-" ";" bin_sep ${bin_file}) + list(GET bin_sep 0 op_type) + list(GET bin_sep 1 op_file) + list(GET bin_sep 2 op_index) + if(NOT TARGET ${BINCMP_TARGET}_${op_file}_copy) + add_custom_target( + ${BINCMP_TARGET}_${op_file}_copy + COMMAND cp ${BINCMP_ADP_DIR}/${op_file}.py + ${BINCMP_OUT_DIR}/src/${op_type}.py + DEPENDS ascendc_impl_gen) + install( + DIRECTORY ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file} + DESTINATION ${BINCMP_INSTALL_DIR}/${BINCMP_COMPUTE_UNIT} + OPTIONAL) + install( + FILES + ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/${op_file}.json + DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT} + OPTIONAL) + endif() + add_custom_target( + ${BINCMP_TARGET}_${op_file}_${op_index} + COMMAND + export HI_PYTHON=${ASCEND_PYTHON_EXECUTABLE} && export + ASCEND_CUSTOM_OPP_PATH=${MX_DRIVING_PATH}/packages/vendors/${vendor_name} + && bash ${CMAKE_SOURCE_DIR}/scripts/retry.sh \"bash ${bin_script} + ${BINCMP_OUT_DIR}/src/${op_type}.py + ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file}\" + WORKING_DIRECTORY ${BINCMP_OUT_DIR}) + add_dependencies(${BINCMP_TARGET}_${op_file}_${op_index} ${BINCMP_TARGET} + ${BINCMP_TARGET}_${op_file}_copy) + add_dependencies(${BINCMP_TARGET}_gen_ops_config + ${BINCMP_TARGET}_${op_file}_${op_index}) + endforeach() + add_custom_command( + TARGET ${BINCMP_TARGET}_gen_ops_config + POST_BUILD + COMMAND mv ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/*.json + ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}) + install( + FILES + ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/binary_info_config.json + DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT} + OPTIONAL) + endif() endfunction() function(protobuf_generate) diff --git a/cmake/stage_0.cmake b/cmake/stage_0.cmake new file mode 100644 index 00000000..426e8015 --- /dev/null +++ b/cmake/stage_0.cmake @@ -0,0 +1,11 @@ +add_library(ascend_all_ops SHARED ${ASCEND_HOST_SRC}) +target_compile_options(ascend_all_ops PRIVATE -g -fPIC -std=c++11 + -D_GLIBCXX_USE_CXX11_ABI=0) +target_include_directories(ascend_all_ops PRIVATE ${CANN_INCLUDE_PATH}) +target_link_libraries(ascend_all_ops PRIVATE intf_pub exe_graph register + tiling_api) +add_custom_command( + TARGET ascend_all_ops + POST_BUILD + COMMAND ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + $ ${ASCEND_AUTOGEN_PATH}) diff --git a/cmake/stage_1.cmake b/cmake/stage_1.cmake new file mode 100644 index 00000000..9a176248 --- /dev/null +++ b/cmake/stage_1.cmake @@ -0,0 +1,212 @@ +# ===================Build proto =================== +add_library(cust_op_proto SHARED ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden) +target_link_libraries( + cust_op_proto + PRIVATE intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME cust_opsproto_rt2.0) +install_target( + TRG cust_op_proto DST + packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST + packages/vendors/${vendor_name}/op_proto/inc) + +add_library(cust_optiling SHARED ${ASCEND_HOST_SRC}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE -fvisibility=hidden) +target_link_libraries( + cust_optiling + PRIVATE intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME cust_opmaster_rt2.0) +install_target( + TRG + cust_optiling + DST + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR} +) +# create liboptiling.so link +add_custom_command( + TARGET cust_optiling + POST_BUILD + COMMAND + ${CMAKE_COMMAND} -E chdir + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling + ${CMAKE_COMMAND} -E create_symlink + lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + liboptiling.so) +install( + FILES + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + +if(${ENABLE_ONNX}) + if(CANN_PATHS) + if(${ARCH} STREQUAL "aarch64") + protobuf_generate( + PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + else() + protobuf_generate( + PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + endif() + else() + protobuf_generate( + PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto + OUT_DIR ${ASCEND_AUTOGEN_PATH}) + endif() + + add_library(cust_onnx_parsers SHARED ${ASCEND_ONNX_SRC}) + target_compile_options( + cust_onnx_parsers + PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private + "-fno-common" "-fno-strict-aliasing") + target_link_libraries(cust_onnx_parsers PRIVATE intf_pub) + target_include_directories( + cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include + ${ASCEND_AUTOGEN_PATH}) + + install_target(TRG cust_onnx_parsers DST + packages/vendors/${vendor_name}/framework/onnx/) +endif() + +# ===================Build ACLNN=================== +file(GLOB ACLNN_SRC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB ACLNN_INC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +set(ACLNN_SRC ${ACLNN_SRC_GEN} ${ACLNN_SRC_CUSTOM}) +set(ACLNN_INC ${ACLNN_INC_GEN} ${ACLNN_INC_CUSTOM}) +add_library(cust_opapi SHARED ${ACLNN_SRC}) +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi) +install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib) +install_file(TRG cust_opapi SRC ${ACLNN_INC} DST + packages/vendors/${vendor_name}/op_api/include) + +# ===================Build Kernel=================== +# set custom compile options +if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() + +file(COPY ${ASCEND_KERNEL_SRC} DESTINATION ${ASCEND_KERNEL_PATH}) + +foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target( + TARGET + ops_info_gen_${compute_unit} + OUTPUT + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if(NOT TARGET ascendc_impl_gen) + add_ops_impl_target( + TARGET + ascendc_impl_gen + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR + ${ASCEND_KERNEL_PATH} + OUT_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + install_file( + TRG + ascendc_impl_gen + SRC + ${ASCEND_KERNEL_SRC} + DST + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) + endif() + + # dynamic shape binary compile + if(${ENABLE_BINARY_PACKAGE}) + add_bin_compile_target( + TARGET + ascendc_bin_${compute_unit} + OPS_INFO + ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR + ${ASCEND_KERNEL_PATH} + ADP_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + OUT_DIR + ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + KERNEL_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + INSTALL_DIR + packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT + ${compute_unit}) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen + cust_optiling) + endif() + endif() +endforeach() + +# generate npu_supported_ops.json +add_npu_support_target( + TARGET + npu_supported_ops + OPS_INFO_DIR + ${ASCEND_AUTOGEN_PATH} + OUT_DIR + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core + INSTALL_DIR + packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE}) + +# ===================Build test=================== +# WARN: WIP +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target( + gen_version_info ALL + COMMAND + bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh + ${ASCEND_CANN_PACKAGE_PATH} + ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}) + +install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info + DESTINATION packages/vendors/${vendor_name}) + +if(COMPILE_OPP_PACKAGE) + # CPack config + set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) + set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) + set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") + set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) + set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") + set(CPACK_GENERATOR External) + set(CPACK_CMAKE_GENERATOR "Unix Makefiles") + set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) + set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) + set(CPACK_EXTERNAL_BUILT_PACKAGES + ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME} + ) + include(CPack) +endif() diff --git a/cmake/stage_2.cmake b/cmake/stage_2.cmake new file mode 100644 index 00000000..2f736ef3 --- /dev/null +++ b/cmake/stage_2.cmake @@ -0,0 +1,48 @@ +set(Python3_USE_STATIC_LIBS FALSE) +find_package(Python3 COMPONENTS Interpreter Development) + +execute_process( + COMMAND ${Python3_EXECUTABLE} -c + "import os; import torch; print(os.path.dirname(torch.__file__))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_PATH) +execute_process( + COMMAND + ${Python3_EXECUTABLE} -c + "import os; import site; print(site.getsitepackages()[0] + '/torch_npu')" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_NPU_PATH) +message("TORCH_PATH is ${TORCH_PATH}") +message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") + +set(EXT_CXX_FLAGS "${EXT_CXX_FLAGS}") +separate_arguments(EXT_CXX_FLAGS) +add_library(_C SHARED ${ASCEND_CSRC_SRC}) +if(${COMPILE_WITH_XLA}) + target_compile_definitions(_C PRIVATE COMPILE_WITH_XLA) +endif() +target_compile_options( + _C + PRIVATE -fprofile-arcs + -ftest-coverage + -fPIC + -g + -O3 + -fstack-protector-all + -DTORCH_API_INCLUDE_EXTENSION_H + -DTORCH_EXTENSION_NAME=_C + -D_GLIBCXX_USE_CXX11_ABI=0 + -D__FILENAME__=__FILE__ + ${EXT_CXX_FLAGS}) +target_link_directories(_C PRIVATE ${TORCH_PATH}/lib ${TORCH_NPU_PATH}/lib) +target_include_directories( + _C + PRIVATE ${Python3_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/include + ${TORCH_NPU_PATH}/include ${TORCH_PATH}/include + ${TORCH_PATH}/include/torch/csrc/api/include) +target_link_libraries(_C PRIVATE gcov c10 torch torch_python torch_npu) +set_target_properties( + _C + PROPERTIES OUTPUT_NAME "${MX_DRIVING_PATH}/_C.${Python3_SOABI}" + PREFIX "" + SUFFIX ".so") diff --git a/cmake/util/ascendc_bin_param_build.py b/cmake/util/ascendc_bin_param_build.py index 63b15c90..a46d397f 100644 --- a/cmake/util/ascendc_bin_param_build.py +++ b/cmake/util/ascendc_bin_param_build.py @@ -110,6 +110,8 @@ class BinParamBuilder(opdesc_parser.OpDesc): fd.write(chk) chk = const_var.CHK_CMD.format(res_file=bin_file + '.o') fd.write(chk) + chm = const_var.CHM_CMD + fd.write(chm) fd.write('echo "[{}] Generating {} Done"\n'.format(hard_soc, bin_file)) diff --git a/cmake/util/const_var.py b/cmake/util/const_var.py index 85988e9e..f0d28170 100644 --- a/cmake/util/const_var.py +++ b/cmake/util/const_var.py @@ -29,5 +29,6 @@ if ! test -f $2/{res_file} ; then exit 1 fi ''' +CHM_CMD = 'chmod -R 755 $2\n' ATTR_DEF_VAL = {'str' : '', 'int': 0, 'float': 0.0, 'bool': False, 'list_bool': [], 'list_int': [], 'list_float': [], 'list_list_int': [[]]} diff --git a/docs/api/README.md b/docs/api/README.md index 3afd2d59..381bb23d 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -871,7 +871,7 @@ voxel_feats, voxel_coors = npu_dynamic_scatter(feats, coors, 'max') ## unique_voxel ### 接口原型 ```python -ads_c.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor +mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor ``` ### 功能描述 对输入的点云数据进行去重处理。 @@ -894,7 +894,7 @@ N的大小受限于内存大小,建议N小于等于2^32。 import torch import torch_npu import numpy as np -from ads_c import unique_voxel +from mx_driving._C import unique_voxel voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32) voxels_npu = torch.from_numpy(voxels).npu() num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu) diff --git a/include/csrc/pybind.h b/include/csrc/pybind.h index b5eb79e4..49ac2037 100644 --- a/include/csrc/pybind.h +++ b/include/csrc/pybind.h @@ -15,7 +15,7 @@ // limitations under the License. #ifndef CSRC_PYBIND_H_ #define CSRC_PYBIND_H_ -#include +#include void init_common(pybind11::module& m); void init_fused(pybind11::module& m); diff --git a/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py b/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py index 48447994..c5c69fe3 100644 --- a/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py +++ b/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py @@ -26,16 +26,16 @@ import mmcv import cv2 as cv from projects.mmdet3d_plugin.models.utils.visual import save_tensor import mx_driving.common -import ads_c +import mx_driving._C from mmdet.models.builder import build_loss def custom_unique_n3(coors, return_inverse, return_counts, dim): # assert dim == 0 - voxels = ads_c.point_to_voxel(coors, [], [], "ZYX") - cnt, unq_voxels, unq_ind, argsort_ind, _ = ads_c.unique_voxel(voxels) - unq_coors = ads_c.voxel_to_point(unq_voxels, [], [], "ZYX") + voxels = mx_driving._C.point_to_voxel(coors, [], [], "ZYX") + cnt, unq_voxels, unq_ind, argsort_ind, _ = mx_driving._C.unique_voxel(voxels) + unq_coors = mx_driving._C.voxel_to_point(unq_voxels, [], [], "ZYX") if return_inverse: sorted_ind = torch.argsort(argsort_ind.to(torch.float32), dim=dim).to(torch.long) diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py index 3b73dec6..3ab0077d 100644 --- a/mx_driving/__init__.py +++ b/mx_driving/__init__.py @@ -2,7 +2,7 @@ import os import torch import torch_npu -import ads_c +import mx_driving._C def _set_env(): @@ -15,7 +15,7 @@ def _set_env(): os.environ["ASCEND_CUSTOM_OPP_PATH"] = ascend_custom_opp_path mx_driving_op_api_so_path = os.path.join(mx_driving_opp_path, "op_api", "lib", "libcust_opapi.so") - ads_c._init_op_api_so_path(mx_driving_op_api_so_path) + mx_driving._C._init_op_api_so_path(mx_driving_op_api_so_path) _set_env() diff --git a/mx_driving/common/CMakeLists.txt b/mx_driving/common/CMakeLists.txt index 0a8fa99d..807aa0c6 100644 --- a/mx_driving/common/CMakeLists.txt +++ b/mx_driving/common/CMakeLists.txt @@ -2,3 +2,10 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) add_subdirectory(ops/kernels) endif() +if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) + add_subdirectory(ops/onnx/plugin) +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) + add_subdirectory(ops/csrc) +endif() diff --git a/mx_driving/common/ops/assign_score_withk.py b/mx_driving/common/ops/assign_score_withk.py index c375eaf4..f17773ad 100644 --- a/mx_driving/common/ops/assign_score_withk.py +++ b/mx_driving/common/ops/assign_score_withk.py @@ -6,24 +6,25 @@ Modification date: 2024-10-06 Modification Description: Modification 1. Add support for Ascend NPU """ + import torch +import torch_npu from torch.autograd import Function from torch.nn import Module -import torch_npu -import ads_c +import mx_driving._C class AssignScoreWithkFunction(Function): @staticmethod def forward(ctx, *args): scores, point_features, center_features, knn_idx, aggregate = args - agg = {'sum': 0, 'avg': 1, 'max': 2} + agg = {"sum": 0, "avg": 1, "max": 2} B, N, M, out_dim = point_features.size() _, npoint, K, _ = scores.size() agg_idx = 0 if aggregate not in agg.keys() else agg[aggregate] output = point_features.new_zeros((B, out_dim, npoint, K)) - ads_c.assign_score_withk( + mx_driving._C.assign_score_withk( point_features.contiguous(), center_features.contiguous(), scores.contiguous(), @@ -38,4 +39,4 @@ class AssignScoreWithkFunction(Function): agg_idx ) return output -assign_score_withk = AssignScoreWithkFunction.apply \ No newline at end of file +assign_score_withk = AssignScoreWithkFunction.apply diff --git a/mx_driving/common/ops/csrc/CMakeLists.txt b/mx_driving/common/ops/csrc/CMakeLists.txt new file mode 100644 index 00000000..4a75d495 --- /dev/null +++ b/mx_driving/common/ops/csrc/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_CSRC_SRC + ${ASCEND_CSRC_SRC} ${CSRC_SRC} + CACHE INTERNAL "") diff --git a/mx_driving/common/ops/hypot.py b/mx_driving/common/ops/hypot.py index df4315f0..0eb760bd 100644 --- a/mx_driving/common/ops/hypot.py +++ b/mx_driving/common/ops/hypot.py @@ -1,26 +1,29 @@ """ Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. """ + import torch +import torch_npu from torch.autograd import Function -import torch_npu -import ads_c +import mx_driving._C class Hypot(Function): @staticmethod def forward(ctx, x, y): x_broadcasted, y_broadcasted = torch.broadcast_tensors(x, y) - out = ads_c.npu_hypot(x_broadcasted.contiguous(), y_broadcasted.contiguous()) - ctx.save_for_backward(x, y, out); + out = mx_driving._C.npu_hypot(x_broadcasted.contiguous(), y_broadcasted.contiguous()) + ctx.save_for_backward(x, y, out) return out @staticmethod def backward(ctx, out_grad): x, y, out = ctx.saved_tensors x_broadcasted, y_broadcasted = torch.broadcast_tensors(x, y) - x_grad, y_grad = ads_c.npu_hypot_grad(x_broadcasted.contiguous(), y_broadcasted.contiguous(), out, out_grad) + x_grad, y_grad = mx_driving._C.npu_hypot_grad( + x_broadcasted.contiguous(), y_broadcasted.contiguous(), out, out_grad + ) # reshape the broadcasted tensors to origin tensors and sum the grad for dim, size in enumerate(x.shape): @@ -32,4 +35,5 @@ class Hypot(Function): return x_grad, y_grad + hypot = Hypot.apply diff --git a/mx_driving/common/ops/knn.py b/mx_driving/common/ops/knn.py index 0079669c..bbd9cd5a 100644 --- a/mx_driving/common/ops/knn.py +++ b/mx_driving/common/ops/knn.py @@ -12,7 +12,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class AdsKnn(Function): @@ -43,7 +43,7 @@ class AdsKnn(Function): print('center_xyz and xyz should be on the same device.') return None - dist2, idx = ads_c.knn(xyz, center_xyz, k, True) + dist2, idx = mx_driving._C.knn(xyz, center_xyz, k, True) zeros_idx = torch.zeros(xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu() idx.where(dist2 >= 1e10, zeros_idx) idx = idx.transpose(2, 1).contiguous() # [B, k, npoint] @@ -51,4 +51,4 @@ class AdsKnn(Function): return idx.int() -knn = AdsKnn.apply \ No newline at end of file +knn = AdsKnn.apply diff --git a/mx_driving/common/ops/npu_hypot.py b/mx_driving/common/ops/npu_hypot.py new file mode 100644 index 00000000..25cbfe8b --- /dev/null +++ b/mx_driving/common/ops/npu_hypot.py @@ -0,0 +1,18 @@ +""" +Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. +""" +import torch +from torch.autograd import Function + +import torch_npu +import mx_driving._C + + +class Hypot(Function): + @staticmethod + def forward(ctx, x, y): + x_broadcasted, y_broadcasted = torch.broadcast_tensors(x, y) + out = mx_driving._C.npu_hypot(x_broadcasted.contiguous(), y_broadcasted.contiguous()) + return out + +npu_hypot = Hypot.apply \ No newline at end of file diff --git a/mx_driving/common/ops/npu_scatter_mean_grad.py b/mx_driving/common/ops/npu_scatter_mean_grad.py index 57a597cf..d460f7bf 100644 --- a/mx_driving/common/ops/npu_scatter_mean_grad.py +++ b/mx_driving/common/ops/npu_scatter_mean_grad.py @@ -6,12 +6,12 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class ScatterMeanGradFunction(Function): @staticmethod def forward(ctx, grad_out, index, dim): - result = ads_c.npu_scatter_mean_grad(grad_out, index, dim) + result = mx_driving._C.npu_scatter_mean_grad(grad_out, index, dim) return result npu_scatter_mean_grad = ScatterMeanGradFunction.apply \ No newline at end of file diff --git a/mx_driving/common/ops/scatter_max.py b/mx_driving/common/ops/scatter_max.py index 2cba77bb..b30c6139 100644 --- a/mx_driving/common/ops/scatter_max.py +++ b/mx_driving/common/ops/scatter_max.py @@ -11,13 +11,13 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class ScatterMaxFunction(Function): @staticmethod def forward(ctx, updates, indices, out=None): - func = ads_c.scatter_max_with_argmax_v2 + func = mx_driving._C.scatter_max_with_argmax_v2 out, argmax = func(updates, indices, out) ctx.save_for_backward(argmax, updates) return out, argmax @@ -33,7 +33,7 @@ class ScatterMaxFunction(Function): grad_updates_indices_uss = grad_updates_indices[..., 0] * grad_updates_indices.shape[1] + grad_updates_indices[..., 1] num_segments = torch.tensor(updates.shape[0] * updates.shape[1]).to(device) - grad = ads_c.npu_scatter_max_backward(grad_output, grad_updates_indices_uss, num_segments) + grad = mx_driving._C.npu_scatter_max_backward(grad_output, grad_updates_indices_uss, num_segments) return grad.reshape(updates.shape), None, None diff --git a/mx_driving/common/ops/scatter_mean.py b/mx_driving/common/ops/scatter_mean.py index 61d8019d..cd35b8bb 100644 --- a/mx_driving/common/ops/scatter_mean.py +++ b/mx_driving/common/ops/scatter_mean.py @@ -3,13 +3,13 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class ScatterMeanFunction(Function): @staticmethod def forward(ctx, src, index, out=None, dim=0, dim_size=None): - func = ads_c.npu_scatter_mean + func = mx_driving._C.npu_scatter_mean res, count = func(src, index, out, dim, dim_size) return res diff --git a/mx_driving/common/ops/sort_pairs.py b/mx_driving/common/ops/sort_pairs.py index 99577885..e6c6f0b4 100644 --- a/mx_driving/common/ops/sort_pairs.py +++ b/mx_driving/common/ops/sort_pairs.py @@ -1,11 +1,13 @@ -import ads_c import torch +import mx_driving._C + class SortPairs(torch.autograd.Function): @staticmethod def forward(ctx, keys_in, values_in, dim, descending=False): - res = ads_c.npu_sort_pairs(keys_in, values_in, dim, descending) + res = mx_driving._C.npu_sort_pairs(keys_in, values_in, dim, descending) return res + sort_pairs = SortPairs.apply diff --git a/mx_driving/common/ops/threeNN.py b/mx_driving/common/ops/threeNN.py index 11047daf..d259e5d0 100644 --- a/mx_driving/common/ops/threeNN.py +++ b/mx_driving/common/ops/threeNN.py @@ -12,7 +12,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class AdsThreeNN(Function): @@ -27,11 +27,11 @@ class AdsThreeNN(Function): target = target.float() source = source.float() - dist2, idx = ads_c.knn(source, target, 3, False) + dist2, idx = mx_driving._C.knn(source, target, 3, False) dist2 = torch.sqrt(dist2) if dtype_ == torch.float16: dist2 = dist2.half() return dist2, idx.int() -three_nn = AdsThreeNN.apply \ No newline at end of file +three_nn = AdsThreeNN.apply diff --git a/mx_driving/common/ops/three_interpolate.py b/mx_driving/common/ops/three_interpolate.py index 05526652..ed237c10 100644 --- a/mx_driving/common/ops/three_interpolate.py +++ b/mx_driving/common/ops/three_interpolate.py @@ -14,7 +14,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class ThreeInterpolateFunction(Function): @@ -27,7 +27,7 @@ class ThreeInterpolateFunction(Function): n = indices.size(1) ctx.three_interpolate_for_backward = (indices, weight, m) - func = ads_c.npu_three_interpolate + func = mx_driving._C.npu_three_interpolate out = func(b, c, m, n, features, indices, weight) return out @@ -42,7 +42,7 @@ class ThreeInterpolateFunction(Function): grad_out_data = grad_out.data.contiguous().to(torch.float) weight = weight.to(torch.float) - grad_features = ads_c.npu_three_interpolate_backward(b, c, n, m, grad_out_data, idx, weight) + grad_features = mx_driving._C.npu_three_interpolate_backward(b, c, n, m, grad_out_data, idx, weight) if grad_out_dtype == torch.half: grad_features = grad_features.to(torch.half) diff --git a/mx_driving/detection/CMakeLists.txt b/mx_driving/detection/CMakeLists.txt index 3f1ac043..63ebf651 100644 --- a/mx_driving/detection/CMakeLists.txt +++ b/mx_driving/detection/CMakeLists.txt @@ -2,6 +2,10 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) add_subdirectory(ops/kernels) endif() -if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) +if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) add_subdirectory(ops/onnx/plugin) endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) + add_subdirectory(ops/csrc) +endif() diff --git a/mx_driving/detection/ops/border_align.py b/mx_driving/detection/ops/border_align.py index 242b6dbe..92fac131 100644 --- a/mx_driving/detection/ops/border_align.py +++ b/mx_driving/detection/ops/border_align.py @@ -1,14 +1,15 @@ """ Copyright (c) OpenMMLab. All rights reserved. """ + from typing import Any, Optional, Tuple, Union import torch -import torch_npu import torch.nn as nn +import torch_npu from torch.autograd import Function -import ads_c +import mx_driving._C class BorderAlignFunction(Function): @@ -17,19 +18,27 @@ class BorderAlignFunction(Function): ctx.pooled_size = pooled_size ctx.feature_size = feature_map.size() batch_size, num_channels, data_height, data_width = feature_map.size() - output = torch.zeros([batch_size, data_height * data_width, ctx.pooled_size + 1, num_channels]).to(feature_map.device) - - ads_c.border_align_forward_npu( - feature_map, - rois, - output, - ctx.pooled_size) - + output = torch.zeros([batch_size, data_height * data_width, ctx.pooled_size + 1, num_channels]).to( + feature_map.device + ) + + mx_driving._C.border_align_forward_npu(feature_map, rois, output, ctx.pooled_size) + npu_outputs, index = output.max(dim=-2) - npu_outputs = npu_outputs.reshape([batch_size, data_height * data_width, 4, num_channels // 4]).permute([0, 3, 1, 2]).contiguous() - index = index.int().reshape([batch_size, data_height * data_width, 4, num_channels // 4]).permute([0, 3, 1, 2]).contiguous() + npu_outputs = ( + npu_outputs.reshape([batch_size, data_height * data_width, 4, num_channels // 4]) + .permute([0, 3, 1, 2]) + .contiguous() + ) + index = ( + index.int() + .reshape([batch_size, data_height * data_width, 4, num_channels // 4]) + .permute([0, 3, 1, 2]) + .contiguous() + ) ctx.save_for_backward(rois, index) return npu_outputs -border_align = BorderAlignFunction.apply \ No newline at end of file + +border_align = BorderAlignFunction.apply diff --git a/mx_driving/detection/ops/box_iou.py b/mx_driving/detection/ops/box_iou.py index 6c455f52..dd3cb320 100644 --- a/mx_driving/detection/ops/box_iou.py +++ b/mx_driving/detection/ops/box_iou.py @@ -1,27 +1,27 @@ -import ads_c import torch +import mx_driving._C + class BoxIouQuadri(torch.autograd.Function): @staticmethod def forward(ctx, boxes_a, boxes_b, mode, aligned): - mode_dict = {'iou': 0, 'iof': 1} + mode_dict = {"iou": 0, "iof": 1} mode_flag = mode_dict[mode] - + boxes_a = boxes_a.contiguous() boxes_b = boxes_b.contiguous() - - ious = ads_c.npu_box_iou_quadri( - boxes_a, boxes_b, mode_flag, aligned) + + ious = mx_driving._C.npu_box_iou_quadri(boxes_a, boxes_b, mode_flag, aligned) return ious class BoxIouRotated(torch.autograd.Function): @staticmethod def forward(ctx, boxes_a, boxes_b, mode, aligned, clockwise): - mode_dict = {'iou': 0, 'iof': 1} + mode_dict = {"iou": 0, "iof": 1} mode_flag = mode_dict[mode] - + if not clockwise: flip_mat = boxes_a.new_ones(boxes_a.shape[-1]) flip_mat[-1] = -1 @@ -29,9 +29,8 @@ class BoxIouRotated(torch.autograd.Function): boxes_b = boxes_b * flip_mat boxes_a = boxes_a.contiguous() boxes_b = boxes_b.contiguous() - - ious = ads_c.npu_box_iou_rotated( - boxes_a, boxes_b, mode_flag, aligned) + + ious = mx_driving._C.npu_box_iou_rotated(boxes_a, boxes_b, mode_flag, aligned) return ious diff --git a/mx_driving/detection/ops/boxes_overlap_bev.py b/mx_driving/detection/ops/boxes_overlap_bev.py index 6b6d1874..cd92e3ab 100644 --- a/mx_driving/detection/ops/boxes_overlap_bev.py +++ b/mx_driving/detection/ops/boxes_overlap_bev.py @@ -1,13 +1,14 @@ import warnings -import ads_c import torch +import mx_driving._C + class BoxesOverlapBev(torch.autograd.Function): @staticmethod def forward(ctx, boxes_a, boxes_b): - area_overlap = ads_c.npu_boxes_overlap_bev(boxes_a, boxes_b) + area_overlap = mx_driving._C.npu_boxes_overlap_bev(boxes_a, boxes_b) return area_overlap diff --git a/mx_driving/detection/ops/csrc/CMakeLists.txt b/mx_driving/detection/ops/csrc/CMakeLists.txt new file mode 100644 index 00000000..4a75d495 --- /dev/null +++ b/mx_driving/detection/ops/csrc/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_CSRC_SRC + ${ASCEND_CSRC_SRC} ${CSRC_SRC} + CACHE INTERNAL "") diff --git a/mx_driving/detection/ops/nms3d_normal.py b/mx_driving/detection/ops/nms3d_normal.py index 6e604f38..c6b297cc 100644 --- a/mx_driving/detection/ops/nms3d_normal.py +++ b/mx_driving/detection/ops/nms3d_normal.py @@ -10,7 +10,7 @@ import torch import torch_npu from torch.autograd import Function from torch.nn import Module -import ads_c +import mx_driving._C class AdsNms3dNormalFunction(Function): @@ -21,7 +21,7 @@ class AdsNms3dNormalFunction(Function): order = scores.sort(0, descending=True)[1] boxes = boxes[order].contiguous() - keep, num_out = ads_c.nms3d_normal(boxes, iou_threshold) + keep, num_out = mx_driving._C.nms3d_normal(boxes, iou_threshold) return order[keep[:num_out].long()].contiguous() npu_nms3d_normal = AdsNms3dNormalFunction.apply diff --git a/mx_driving/detection/ops/npu_nms3d.py b/mx_driving/detection/ops/npu_nms3d.py index 744dfe98..0935ee9b 100644 --- a/mx_driving/detection/ops/npu_nms3d.py +++ b/mx_driving/detection/ops/npu_nms3d.py @@ -10,7 +10,7 @@ import torch from torch.autograd import Function import torch_npu -import ads_c +import mx_driving._C class Nms3dFunction(Function): @@ -21,7 +21,7 @@ class Nms3dFunction(Function): order = scores.sort(0, descending=True)[1] boxes = boxes[order].contiguous() - keep, num_out = ads_c.nms3d(boxes, iou_threshold) + keep, num_out = mx_driving._C.nms3d(boxes, iou_threshold) return order[keep[:num_out].long()].contiguous() diff --git a/mx_driving/detection/ops/roi_align_rotated.py b/mx_driving/detection/ops/roi_align_rotated.py index 7183c78a..3ed1c3ea 100644 --- a/mx_driving/detection/ops/roi_align_rotated.py +++ b/mx_driving/detection/ops/roi_align_rotated.py @@ -8,7 +8,7 @@ import torch_npu import torch.nn as nn from torch.autograd import Function -import ads_c +import mx_driving._C class RoIAlignRotatedFunction(Function): @@ -28,7 +28,7 @@ class RoIAlignRotatedFunction(Function): output = feature_map.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width, num_channels).to(feature_map.device) - ads_c.roi_align_rotated_v2_forward_npu( + mx_driving._C.roi_align_rotated_v2_forward_npu( feature_map, rois, output, @@ -46,7 +46,7 @@ class RoIAlignRotatedFunction(Function): feature_map, rois = ctx.saved_tensors rois_trans = torch.permute(rois, (1, 0)).contiguous() grad_output_trans = torch.permute(grad_output, (0, 2, 3, 1)).contiguous() - grad_feature_map = ads_c.npu_roi_align_rotated_grad_v2( + grad_feature_map = mx_driving._C.npu_roi_align_rotated_grad_v2( feature_map, rois_trans, grad_output_trans, ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale, ctx.sampling_ratio, ctx.aligned, ctx.clockwise) diff --git a/mx_driving/detection/ops/rotated_iou.py b/mx_driving/detection/ops/rotated_iou.py index e82c7a09..a09d88c9 100644 --- a/mx_driving/detection/ops/rotated_iou.py +++ b/mx_driving/detection/ops/rotated_iou.py @@ -3,6 +3,6 @@ Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. """ import torch import torch_npu -import ads_c +import mx_driving._C -npu_rotated_iou = ads_c.npu_rotated_iou +npu_rotated_iou = mx_driving._C.npu_rotated_iou diff --git a/mx_driving/detection/ops/rotated_overlaps.py b/mx_driving/detection/ops/rotated_overlaps.py index b992a72f..5afab83d 100644 --- a/mx_driving/detection/ops/rotated_overlaps.py +++ b/mx_driving/detection/ops/rotated_overlaps.py @@ -3,6 +3,6 @@ Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. """ import torch import torch_npu -import ads_c +import mx_driving._C -npu_rotated_overlaps = ads_c.npu_rotated_overlaps +npu_rotated_overlaps = mx_driving._C.npu_rotated_overlaps diff --git a/mx_driving/fused/CMakeLists.txt b/mx_driving/fused/CMakeLists.txt index 4b3aa985..807aa0c6 100644 --- a/mx_driving/fused/CMakeLists.txt +++ b/mx_driving/fused/CMakeLists.txt @@ -4,4 +4,8 @@ endif() if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) add_subdirectory(ops/onnx/plugin) -endif() \ No newline at end of file +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) + add_subdirectory(ops/csrc) +endif() diff --git a/mx_driving/fused/ops/csrc/CMakeLists.txt b/mx_driving/fused/ops/csrc/CMakeLists.txt new file mode 100644 index 00000000..4a75d495 --- /dev/null +++ b/mx_driving/fused/ops/csrc/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_CSRC_SRC + ${ASCEND_CSRC_SRC} ${CSRC_SRC} + CACHE INTERNAL "") diff --git a/mx_driving/fused/ops/deform_conv2d.py b/mx_driving/fused/ops/deform_conv2d.py index 6f191942..dcddc6ee 100644 --- a/mx_driving/fused/ops/deform_conv2d.py +++ b/mx_driving/fused/ops/deform_conv2d.py @@ -13,7 +13,7 @@ from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair import torch_npu -import ads_c +import mx_driving._C class DeformConv2dFunction(Function): @@ -41,7 +41,7 @@ class DeformConv2dFunction(Function): nhwc_offset = offset.permute(0, 2, 3, 1).contiguous() nhwc_weight = weight.permute(0, 2, 3, 1).contiguous() - out, offset_output = ads_c.deformable_conv2d( + out, offset_output = mx_driving._C.deformable_conv2d( nhwc_x, nhwc_offset, nhwc_weight, @@ -61,7 +61,7 @@ class DeformConv2dFunction(Function): def backward(ctx, grad_out): nhwc_x, nhwc_offset, nhwc_weight, offset_output = ctx.saved_tensors nhwc_grad_out = grad_out.permute(0, 2, 3, 1).contiguous() - grad_x, grad_weight, grad_offset = ads_c.deformable_conv2d_backward( + grad_x, grad_weight, grad_offset = mx_driving._C.deformable_conv2d_backward( nhwc_x, nhwc_weight, nhwc_offset, diff --git a/mx_driving/fused/ops/fused_bias_leaky_relu.py b/mx_driving/fused/ops/fused_bias_leaky_relu.py index aa2a7f79..1ed057af 100644 --- a/mx_driving/fused/ops/fused_bias_leaky_relu.py +++ b/mx_driving/fused/ops/fused_bias_leaky_relu.py @@ -11,13 +11,13 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class FusedBiasLeakyReluFunction(Function): @staticmethod def forward(ctx, x, bias, negative_slop=0.2, scale=2**0.5): - y = ads_c.fused_bias_leaky_relu(x, bias, negative_slop, scale) + y = mx_driving._C.fused_bias_leaky_relu(x, bias, negative_slop, scale) return y npu_fused_bias_leaky_relu = FusedBiasLeakyReluFunction.apply \ No newline at end of file diff --git a/mx_driving/fused/ops/modulated_deform_conv2d.py b/mx_driving/fused/ops/modulated_deform_conv2d.py index b9030cda..8e5c1ead 100644 --- a/mx_driving/fused/ops/modulated_deform_conv2d.py +++ b/mx_driving/fused/ops/modulated_deform_conv2d.py @@ -14,7 +14,7 @@ from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair import torch_npu -import ads_c +import mx_driving._C class ModulatedDeformConv2dFunction(Function): @@ -45,7 +45,7 @@ class ModulatedDeformConv2dFunction(Function): nhwc_weight = weight.permute(0, 2, 3, 1).contiguous() nhwc_mask = mask.permute(0, 2, 3, 1).contiguous() - out, offset_output = ads_c.modulated_deformable_conv2d( + out, offset_output = mx_driving._C.modulated_deformable_conv2d( nhwc_x, nhwc_offset, nhwc_mask, @@ -68,7 +68,7 @@ class ModulatedDeformConv2dFunction(Function): def backward(ctx, grad_out): nhwc_x, nhwc_offset, nhwc_weight, nhwc_mask, offset_output = ctx.saved_tensors nhwc_grad_out = grad_out.permute(0, 2, 3, 1).contiguous() - grad_x, grad_weight, _, grad_offset, grad_mask = ads_c.modulated_deformable_conv2d_backward( + grad_x, grad_weight, _, grad_offset, grad_mask = mx_driving._C.modulated_deformable_conv2d_backward( nhwc_x, nhwc_offset, nhwc_mask, diff --git a/mx_driving/fused/ops/npu_add_relu.py b/mx_driving/fused/ops/npu_add_relu.py index e04feba8..aa08f38a 100644 --- a/mx_driving/fused/ops/npu_add_relu.py +++ b/mx_driving/fused/ops/npu_add_relu.py @@ -11,20 +11,20 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class AddReluFunction(Function): @staticmethod def forward(ctx, x, y): - x = ads_c.npu_add_relu(x, y) + x = mx_driving._C.npu_add_relu(x, y) ctx.save_for_backward(x) return x @staticmethod def backward(ctx, grad_output): x, = ctx.saved_tensors - result = ads_c.npu_add_relu_grad(x, grad_output) + result = mx_driving._C.npu_add_relu_grad(x, grad_output) return result, result npu_add_relu = AddReluFunction.apply \ No newline at end of file diff --git a/mx_driving/fused/ops/npu_deformable_aggregation.py b/mx_driving/fused/ops/npu_deformable_aggregation.py index 417faa49..d6076fb4 100644 --- a/mx_driving/fused/ops/npu_deformable_aggregation.py +++ b/mx_driving/fused/ops/npu_deformable_aggregation.py @@ -4,7 +4,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class AdsDeformableAggregation(Function): @@ -26,7 +26,7 @@ class AdsDeformableAggregation(Function): sampling_location = sampling_location.contiguous().float() weights = weights.contiguous().float() - output = ads_c.npu_deformable_aggregation( + output = mx_driving._C.npu_deformable_aggregation( mc_ms_feat, spatial_shape, scale_start_index, @@ -60,7 +60,7 @@ class AdsDeformableAggregation(Function): grad_mc_ms_feat = torch.zeros_like(mc_ms_feat) grad_sampling_location = torch.zeros_like(sampling_location) grad_weights = torch.zeros_like(weights) - grad_mc_ms_feat, grad_sampling_location, grad_weights = ads_c.npu_deformable_aggregation_grad( + grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_grad( mc_ms_feat, spatial_shape, scale_start_index, diff --git a/mx_driving/fused/ops/npu_max_pool2d.py b/mx_driving/fused/ops/npu_max_pool2d.py index 38b68afb..fa4e72f0 100644 --- a/mx_driving/fused/ops/npu_max_pool2d.py +++ b/mx_driving/fused/ops/npu_max_pool2d.py @@ -7,14 +7,14 @@ Modification Description: Modification 1. Add support for Ascend NPU """ from torch.autograd import Function -import ads_c +import mx_driving._C class MaxPool2d(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def forward(ctx, x, kernel_size, stride, padding): - y = ads_c.npu_max_pool2d(x, kernel_size, stride, padding) + y = mx_driving._C.npu_max_pool2d(x, kernel_size, stride, padding) return y npu_max_pool2d = MaxPool2d.apply diff --git a/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py b/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py index 9097d080..ffdb66c5 100644 --- a/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py +++ b/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py @@ -12,21 +12,21 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class MultiScaleDeformableAttnFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def forward(ctx, value, shape, offset, locations, weight): - result = ads_c.npu_multi_scale_deformable_attn_function(value, shape, offset, locations, weight) + result = mx_driving._C.npu_multi_scale_deformable_attn_function(value, shape, offset, locations, weight) ctx.save_for_backward(value, shape, offset, locations, weight) return result @staticmethod def backward(ctx, grad_output): value, shape, offset, locations, weight = ctx.saved_tensors - grad_value, grad_locations, grad_weight = ads_c.multi_scale_deformable_attn_grad( + grad_value, grad_locations, grad_weight = mx_driving._C.multi_scale_deformable_attn_grad( value, shape, offset, locations, weight, grad_output ) return grad_value, None, None, grad_locations, grad_weight diff --git a/mx_driving/point/CMakeLists.txt b/mx_driving/point/CMakeLists.txt index 621d1fa9..63ebf651 100644 --- a/mx_driving/point/CMakeLists.txt +++ b/mx_driving/point/CMakeLists.txt @@ -1,3 +1,11 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) add_subdirectory(ops/kernels) endif() + +if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) + add_subdirectory(ops/onnx/plugin) +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) + add_subdirectory(ops/csrc) +endif() diff --git a/mx_driving/point/ops/bev_pool.py b/mx_driving/point/ops/bev_pool.py index 245533d7..4747b852 100644 --- a/mx_driving/point/ops/bev_pool.py +++ b/mx_driving/point/ops/bev_pool.py @@ -1,6 +1,7 @@ -import ads_c import torch +import mx_driving._C + class BEVPool(torch.autograd.Function): @staticmethod @@ -14,7 +15,7 @@ class BEVPool(torch.autograd.Function): interval_lengths[-1] = feat.shape[0] - interval_starts[-1] geom_feat = geom_feat.int() - out = ads_c.npu_bev_pool( + out = mx_driving._C.npu_bev_pool( feat, geom_feat, interval_lengths, @@ -36,7 +37,7 @@ class BEVPool(torch.autograd.Function): B, D, H, W = ctx.saved_shapes grad_out = grad_out.contiguous() - grad_feat = ads_c.npu_bev_pool_backward( + grad_feat = mx_driving._C.npu_bev_pool_backward( grad_out, geom_feat, interval_lengths, diff --git a/mx_driving/point/ops/bev_pool_v2.py b/mx_driving/point/ops/bev_pool_v2.py index e8895480..c21ba995 100644 --- a/mx_driving/point/ops/bev_pool_v2.py +++ b/mx_driving/point/ops/bev_pool_v2.py @@ -13,15 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import ads_c import torch +import mx_driving._C + class BEVPoolV2(torch.autograd.Function): @staticmethod # pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, - bev_feat_shape, interval_starts, interval_lengths): + def forward( + ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths + ): ranks_bev = ranks_bev.int() depth = depth.contiguous().float() feat = feat.contiguous().float() @@ -31,18 +33,8 @@ class BEVPoolV2(torch.autograd.Function): interval_starts = interval_starts.contiguous().int() (B, D, H, W, C) = bev_feat_shape - out = ads_c.npu_bev_pool_v2( - depth, - feat, - ranks_depth, - ranks_feat, - ranks_bev, - interval_lengths, - interval_starts, - B, - D, - H, - W + out = mx_driving._C.npu_bev_pool_v2( + depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_lengths, interval_starts, B, D, H, W ) ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth) @@ -56,15 +48,12 @@ class BEVPoolV2(torch.autograd.Function): B, D, H, W = ctx.saved_shapes order = ranks_feat.argsort() - ranks_feat, ranks_depth, ranks_bev = \ - ranks_feat[order], ranks_depth[order], ranks_bev[order] - kept = torch.ones( - ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + ranks_feat, ranks_depth, ranks_bev = ranks_feat[order], ranks_depth[order], ranks_bev[order] + kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) kept[1:] = ranks_feat[1:] != ranks_feat[:-1] interval_starts_bp = torch.where(kept)[0].int() interval_lengths_bp = torch.zeros_like(interval_starts_bp) - interval_lengths_bp[:-1] = interval_starts_bp[ - 1:] - interval_starts_bp[:-1] + interval_lengths_bp[:-1] = interval_starts_bp[1:] - interval_starts_bp[:-1] interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1] depth = depth.contiguous() @@ -76,7 +65,7 @@ class BEVPoolV2(torch.autograd.Function): interval_starts_bp = interval_starts_bp.contiguous() grad_out = grad_out.contiguous() - grad_depth, grad_feat = ads_c.npu_bev_pool_v2_backward( + grad_depth, grad_feat = mx_driving._C.npu_bev_pool_v2_backward( grad_out, depth, feat, @@ -88,14 +77,13 @@ class BEVPoolV2(torch.autograd.Function): B, D, H, - W + W, ) return grad_depth, grad_feat, None, None, None, None, None, None # pylint: disable=too-many-arguments,huawei-too-many-arguments -def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, - bev_feat_shape, interval_starts, interval_lengths): +def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths): """ bev_pool_v2 is a function that performs a pooling operation on the BEV. Please refer to the paper `BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View` @@ -132,8 +120,7 @@ def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, >>> loss.backward() """ x = BEVPoolV2.apply( - depth, feat, ranks_depth, ranks_feat, ranks_bev, - bev_feat_shape, interval_starts, interval_lengths + depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths ) x = x.permute(0, 4, 1, 2, 3).contiguous() return x diff --git a/mx_driving/point/ops/csrc/CMakeLists.txt b/mx_driving/point/ops/csrc/CMakeLists.txt new file mode 100644 index 00000000..4a75d495 --- /dev/null +++ b/mx_driving/point/ops/csrc/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_CSRC_SRC + ${ASCEND_CSRC_SRC} ${CSRC_SRC} + CACHE INTERNAL "") diff --git a/mx_driving/point/ops/furthest_point_sampling.py b/mx_driving/point/ops/furthest_point_sampling.py index 2b505b62..708251db 100644 --- a/mx_driving/point/ops/furthest_point_sampling.py +++ b/mx_driving/point/ops/furthest_point_sampling.py @@ -12,7 +12,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class AdsFurthestPointSampling(Function): @@ -22,7 +22,7 @@ class AdsFurthestPointSampling(Function): point_xyz = point_xyz.permute(0, 2, 1).contiguous() nearest_dist = torch.tensor(np.ones((B, N)) * 1e10, dtype=torch.float32, device='npu').contiguous() - output = ads_c.npu_furthest_point_sampling(point_xyz, nearest_dist, num_points) + output = mx_driving._C.npu_furthest_point_sampling(point_xyz, nearest_dist, num_points) return output diff --git a/mx_driving/point/ops/furthest_point_sampling_with_dist.py b/mx_driving/point/ops/furthest_point_sampling_with_dist.py index ad9b906e..f56f104c 100644 --- a/mx_driving/point/ops/furthest_point_sampling_with_dist.py +++ b/mx_driving/point/ops/furthest_point_sampling_with_dist.py @@ -11,7 +11,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class AdsFurthestPointSamplingWithDistFunction(Function): @@ -19,7 +19,7 @@ class AdsFurthestPointSamplingWithDistFunction(Function): def forward(ctx, points_dist, num_points): B, N = points_dist.size()[:2] nearest_temp = points_dist.new_zeros([B, N]).fill_(1e10) - result = ads_c.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points) + result = mx_driving._C.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points) return result furthest_point_sample_with_dist = AdsFurthestPointSamplingWithDistFunction.apply diff --git a/mx_driving/point/ops/group_points.py b/mx_driving/point/ops/group_points.py index b6b005f4..523ef73a 100644 --- a/mx_driving/point/ops/group_points.py +++ b/mx_driving/point/ops/group_points.py @@ -20,7 +20,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class AdsGroupPoints(Function): @@ -45,7 +45,7 @@ class AdsGroupPoints(Function): B, C, N = features.size() _, npoints, nsample = indices.size() - output = ads_c.group_points( + output = mx_driving._C.group_points( features, indices, B, @@ -70,7 +70,7 @@ class AdsGroupPoints(Function): idx, N = ctx.for_backwards B, C, npoints, nsample = grad_out.size() - grad_features = ads_c.group_points_backward( + grad_features = mx_driving._C.group_points_backward( grad_out, idx, B, diff --git a/mx_driving/point/ops/npu_dynamic_scatter.py b/mx_driving/point/ops/npu_dynamic_scatter.py index f1598c94..81ae7ff6 100644 --- a/mx_driving/point/ops/npu_dynamic_scatter.py +++ b/mx_driving/point/ops/npu_dynamic_scatter.py @@ -13,7 +13,7 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class DynamicScatterFunction(Function): @@ -24,10 +24,10 @@ class DynamicScatterFunction(Function): if reduce_type not in ('max', 'sum', 'mean'): raise ValueError("reduce_type should be 'max', 'sum' or 'mean', but now is %s." % reduce_type) - voxel_idx = ads_c.point_to_voxel(coors, [], [], "XYZ") - num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = ads_c.unique_voxel(voxel_idx) - voxel_coors = ads_c.voxel_to_point(uniqued_voxel_idx, [], [], "XYZ") - voxel_feats, compare_mask = ads_c.npu_dynamic_scatter(feats, coors, prefix_sum_point_per_voxel, + voxel_idx = mx_driving._C.point_to_voxel(coors, [], [], "XYZ") + num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = mx_driving._C.unique_voxel(voxel_idx) + voxel_coors = mx_driving._C.voxel_to_point(uniqued_voxel_idx, [], [], "XYZ") + voxel_feats, compare_mask = mx_driving._C.npu_dynamic_scatter(feats, coors, prefix_sum_point_per_voxel, argsort_coor, num_voxels, reduce_type) ctx.reduce_type = reduce_type @@ -44,7 +44,7 @@ class DynamicScatterFunction(Function): grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple: (prefix_sum_point_per_voxel, argsort_coor, compare_mask) = ctx.saved_tensors grad_point_feats = torch.zeros(ctx.feats_shape, dtype=grad_voxel_feats.dtype, device=grad_voxel_feats.device) - ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel, + mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel, argsort_coor, compare_mask, ctx.reduce_type) return grad_point_feats, None, None diff --git a/mx_driving/point/ops/voxel_pooling_train.py b/mx_driving/point/ops/voxel_pooling_train.py index 898c3930..12c61c9d 100644 --- a/mx_driving/point/ops/voxel_pooling_train.py +++ b/mx_driving/point/ops/voxel_pooling_train.py @@ -9,7 +9,7 @@ Modification 1. Add support for Ascend NPU import torch from torch.autograd import Function from torch.nn import Module -import ads_c +import mx_driving._C class AdsVoxelPoolingFunction(Function): @@ -25,7 +25,7 @@ class AdsVoxelPoolingFunction(Function): output_features = input_features.new_zeros(batch_size, voxel_num[1], voxel_num[0], num_channels) pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1 - pos, result = ads_c.voxel_pooling_train( + pos, result = mx_driving._C.voxel_pooling_train( input_features, geom_xyz, output_features, @@ -51,7 +51,7 @@ class AdsVoxelPoolingFunction(Function): H = grad_output_features.shape[2] W = grad_output_features.shape[3] - result = ads_c.voxel_pool_train_backward( + result = mx_driving._C.voxel_pool_train_backward( grad_output_features, pos_memo, batch_size, diff --git a/mx_driving/point/ops/voxelization.py b/mx_driving/point/ops/voxelization.py index 22b7e6a0..3710dda8 100644 --- a/mx_driving/point/ops/voxelization.py +++ b/mx_driving/point/ops/voxelization.py @@ -6,7 +6,7 @@ from typing import Union, Tuple import torch from torch.autograd import Function from torch.nn import Module -import ads_c +import mx_driving._C class _Voxelization(Function): @@ -17,7 +17,7 @@ class _Voxelization(Function): ): if max_points != -1 and max_voxels != -1: - return ads_c.hard_voxelize(points, voxel_size, coors_range, max_points, max_voxels) + return mx_driving._C.hard_voxelize(points, voxel_size, coors_range, max_points, max_voxels) float_espolin = 1e-9 if voxel_size[0] < float_espolin or voxel_size[1] < float_espolin or voxel_size[2] < float_espolin: @@ -30,7 +30,7 @@ class _Voxelization(Function): # create coors coors = points.new_zeros(size=(3, points.size(0)), dtype=torch.int) - result = ads_c.dynamic_voxelization( + result = mx_driving._C.dynamic_voxelization( points, coors, grid_x, diff --git a/mx_driving/preprocess/CMakeLists.txt b/mx_driving/preprocess/CMakeLists.txt index 3f1ac043..63ebf651 100644 --- a/mx_driving/preprocess/CMakeLists.txt +++ b/mx_driving/preprocess/CMakeLists.txt @@ -2,6 +2,10 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) add_subdirectory(ops/kernels) endif() -if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) +if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) add_subdirectory(ops/onnx/plugin) endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) + add_subdirectory(ops/csrc) +endif() diff --git a/mx_driving/preprocess/ops/csrc/CMakeLists.txt b/mx_driving/preprocess/ops/csrc/CMakeLists.txt new file mode 100644 index 00000000..4a75d495 --- /dev/null +++ b/mx_driving/preprocess/ops/csrc/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_CSRC_SRC + ${ASCEND_CSRC_SRC} ${CSRC_SRC} + CACHE INTERNAL "") diff --git a/mx_driving/preprocess/ops/npu_points_in_box.py b/mx_driving/preprocess/ops/npu_points_in_box.py index df56e264..056df051 100644 --- a/mx_driving/preprocess/ops/npu_points_in_box.py +++ b/mx_driving/preprocess/ops/npu_points_in_box.py @@ -11,13 +11,13 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class PointsInBoxFunction(Function): @staticmethod def forward(ctx, boxes, pts): - result = ads_c.npu_points_in_box(boxes, pts) + result = mx_driving._C.npu_points_in_box(boxes, pts) ctx.save_for_backward(result) return result diff --git a/mx_driving/preprocess/ops/npu_points_in_box_all.py b/mx_driving/preprocess/ops/npu_points_in_box_all.py index 93120303..8f31e175 100644 --- a/mx_driving/preprocess/ops/npu_points_in_box_all.py +++ b/mx_driving/preprocess/ops/npu_points_in_box_all.py @@ -12,13 +12,13 @@ from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class PointsInBoxAllFunction(Function): @staticmethod def forward(ctx, boxes, pts): - result = ads_c.npu_points_in_box_all(boxes, pts) + result = mx_driving._C.npu_points_in_box_all(boxes, pts) ctx.save_for_backward(result) return result diff --git a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py b/mx_driving/preprocess/ops/npu_roipoint_pool3d.py index c5c3d6e5..eae3c110 100644 --- a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py +++ b/mx_driving/preprocess/ops/npu_roipoint_pool3d.py @@ -10,7 +10,7 @@ import torch from torch.autograd import Function from torch.nn import Module import torch_npu -import ads_c +import mx_driving._C class RoipointPool3dFunction(Function): @@ -43,7 +43,7 @@ class RoipointPool3dFunction(Function): # pooled_features = points.new_zeros((batch_size, boxes_num, num_sampled_points, 3 + feature_len)) # pooled_empty_flag = points.new_zeros((batch_size, boxes_num), dtype=torch.int) pooled_features, pooled_empty_flag = \ - ads_c.npu_roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d) + mx_driving._C.npu_roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d) return pooled_features, pooled_empty_flag diff --git a/mx_driving/spconv/CMakeLists.txt b/mx_driving/spconv/CMakeLists.txt index 621d1fa9..63ebf651 100644 --- a/mx_driving/spconv/CMakeLists.txt +++ b/mx_driving/spconv/CMakeLists.txt @@ -1,3 +1,11 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels) add_subdirectory(ops/kernels) endif() + +if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) + add_subdirectory(ops/onnx/plugin) +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc) + add_subdirectory(ops/csrc) +endif() diff --git a/mx_driving/spconv/ops/csrc/CMakeLists.txt b/mx_driving/spconv/ops/csrc/CMakeLists.txt new file mode 100644 index 00000000..4a75d495 --- /dev/null +++ b/mx_driving/spconv/ops/csrc/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_CSRC_SRC + ${ASCEND_CSRC_SRC} ${CSRC_SRC} + CACHE INTERNAL "") diff --git a/mx_driving/spconv/ops/sparse_functional.py b/mx_driving/spconv/ops/sparse_functional.py index eeae65a3..65008c8b 100644 --- a/mx_driving/spconv/ops/sparse_functional.py +++ b/mx_driving/spconv/ops/sparse_functional.py @@ -19,7 +19,7 @@ import torch import numpy as np from torch.autograd import Function from torch.autograd.function import once_differentiable -import ads_c +import mx_driving._C from . import sparse_ops as ops @@ -34,7 +34,7 @@ class SparseConvFunction(Function): device = features.device # calculate the index pair - outidx_pair, ouidx_offset = ads_c.npu_sparse_conv3d(indices, kernel_size, stride, padding, + outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_conv3d(indices, kernel_size, stride, padding, out_channels, out_spatial_shape, batch_size) # sort and nonezero to_insert = torch.tensor(-1).to(device) @@ -44,7 +44,7 @@ class SparseConvFunction(Function): sub_result = new_sorted_idx - new_sorted_idx_2 unique_indices_offset = torch.nonzero(sub_result != 0) # index_put and matmul - out_features, outidx = ads_c.multi_to_sparse_v2(features, weight, unique_indices_offset.int(), + out_features, outidx = mx_driving._C.multi_to_sparse_v2(features, weight, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int()) outidx, outidx_ = torch.chunk(outidx, 2, dim=1) if bias is not None: @@ -57,7 +57,7 @@ class SparseConvFunction(Function): # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - weight_grad, feature_grad = ads_c.npu_sparse_conv3d_grad(unique_indices_offset, + weight_grad, feature_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features) @@ -75,7 +75,7 @@ class SparseInverseConvFunction(Function): groups, bias) -> torch.Tensor: device = features.device # calculate the index pair - out_features, outidx_pair, ouidx_offset = ads_c.npu_sparse_inverse_conv3d(features, indices, weight, + out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d(features, indices, weight, kernel_size, stride, padding, dilation, output_padding, out_channels, out_spatial_shape, batch_size) # sort and nonezero @@ -86,7 +86,7 @@ class SparseInverseConvFunction(Function): sub_result = new_sorted_idx - new_sorted_idx_2 unique_indices_offset = torch.nonzero(sub_result != 0) # matmul - out_features, outidx = ads_c.multi_to_sparse(out_features, unique_indices_offset.int(), + out_features, outidx = mx_driving._C.multi_to_sparse(out_features, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int()) outidx, outidx_ = torch.chunk(outidx, 2, dim=1) if bias is not None: @@ -99,7 +99,7 @@ class SparseInverseConvFunction(Function): # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - weight_grad, feature_grad = ads_c.npu_sparse_conv3d_grad(unique_indices_offset, + weight_grad, feature_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None @@ -119,7 +119,7 @@ class SubMConvFunction(Function): # calculate the index pair hh = indices[:, 0] * out_spatial_shape[0] * out_spatial_shape[1] * out_spatial_shape[2] + \ indices[:, 1] * out_spatial_shape[1] * out_spatial_shape[2] + indices[:, 2] * out_spatial_shape[2] + indices[:, 3] - temp, hh2 = ads_c.npu_prepare_subm_conv3d(hh, out_spatial_shape, batch_size) + temp, hh2 = mx_driving._C.npu_prepare_subm_conv3d(hh, out_spatial_shape, batch_size) temp[hh] = hh2 # pad the feature and weight become align feature_align = features.shape[1] % 8 @@ -128,7 +128,7 @@ class SubMConvFunction(Function): zero_tensor = torch.zeros((kernel_size[0], kernel_size[0], kernel_size[0], 8 - feature_align, out_channels)).to(device) weight_pad = torch.cat((weight, zero_tensor), 3) # calculate the out_feature - out_features, outidx_pair, ouidx_offset = ads_c.npu_subm_sparse_conv3d(features, indices, weight_pad, + out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_subm_sparse_conv3d(features, indices, weight_pad, kernel_size, out_channels, out_spatial_shape, batch_size, temp) @@ -148,7 +148,7 @@ class SubMConvFunction(Function): # 'pylint: disable=too-many-arguments,huawei-too-many-arguments def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple: features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - weight_grad, feature_grad = ads_c.npu_sparse_conv3d_grad(unique_indices_offset, + weight_grad, feature_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features) return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None diff --git a/setup.py b/setup.py index 7850aae5..dd3f6ebd 100644 --- a/setup.py +++ b/setup.py @@ -1,43 +1,179 @@ -import glob +import multiprocessing import os +import platform +import stat import subprocess +import sys from pathlib import Path from typing import Union import torch -from setuptools import find_packages, setup -from torch.utils.cpp_extension import BuildExtension - -from utils import extension +from setuptools import Extension, find_packages, setup +from setuptools._distutils.version import LooseVersion +from setuptools.command.build_clib import build_clib +from setuptools.command.build_ext import build_ext +from setuptools.command.develop import develop BASE_DIR = os.path.dirname(os.path.realpath(__file__)) -VERSION = torch.__version__ -full_components = ["common", "preprocess", "fused", "point", "detection", "spconv"] -source_file = glob.glob(os.path.join("./bind/", "*.cpp")) -include_dirs = [os.path.join(BASE_DIR, "include")] -for part in full_components: - source_file += glob.glob(os.path.join(f"./mx_driving/{part}/ops/csrc/", "*.cpp")) - -exts = [] -ext1 = extension.NpuExtension( - name="ads_c", - sources=source_file, - include_dirs=include_dirs, - extra_compile_args=[ - '-D__FILENAME__="$$(notdir $$(abspath $$<))"', - "-fprofile-arcs", - "-ftest-coverage", - "-fPIC", - "-fstack-protector-all", - ], - extra_link_args=[ - "-Wl,-z,relro", - "-Wl,-z,now", - "-s" - ], - libraries=["gcov"], -) -exts.append(ext1) +VERSION = "1.0.0" + + +def which(thefile): + path = os.environ.get("PATH", os.defpath).split(os.pathsep) + for d in path: + fname = os.path.join(d, thefile) + fnames = [fname] + if sys.platform == "win32": + exts = os.environ.get("PATHEXT", "").split(os.pathsep) + fnames += [fname + ext for ext in exts] + for name in fnames: + if os.access(name, os.F_OK | os.X_OK) and not os.path.isdir(name): + return name + return None + + +def get_cmake_command(): + def _get_version(cmd): + for line in subprocess.check_output([cmd, "--version"]).decode("utf-8").split("\n"): + if "version" in line: + return LooseVersion(line.strip().split(" ")[2]) + raise RuntimeError("no version found") + + "Returns cmake command." + cmake_command = "cmake" + if platform.system() == "Windows": + return cmake_command + cmake3 = which("cmake3") + cmake = which("cmake") + if cmake3 is not None and _get_version(cmake3) >= LooseVersion("3.19.0"): + cmake_command = "cmake3" + return cmake_command + elif cmake is not None and _get_version(cmake) >= LooseVersion("3.19.0"): + return cmake_command + else: + raise RuntimeError("no cmake or cmake3 with version >= 3.19.0 found") + + +def get_build_type(): + build_type = "Release" + if os.getenv("DEBUG", default="0").upper() in ["ON", "1", "YES", "TRUE", "Y"]: + build_type = "Debug" + + if os.getenv("REL_WITH_DEB_INFO", default="0").upper() in ["ON", "1", "YES", "TRUE", "Y"]: + build_type = "RelWithDebInfo" + + return build_type + + +class CPPLibBuild(build_clib): + def initialize_options(self) -> None: + super().initialize_options() + self.kernel_name = None + + def run(self) -> None: + cmake = get_cmake_command() + if not cmake: + raise RuntimeError("CMake must be installed to build the libraries") + self.cmake = cmake + + build_py = self.get_finalized_command("build_py") + mx_driving_dir = os.path.join(BASE_DIR, build_py.build_lib, build_py.get_package_dir("mx_driving")) + if not os.path.exists(mx_driving_dir): + os.makedirs(mx_driving_dir) + + cmake_args = [ + "--preset=default", + "-DCMAKE_BUILD_TYPE=Release", + "-B", + self.build_temp, + f"-DMX_DRIVING_PATH={mx_driving_dir}", + f"-DKERNEL_NAME={self.kernel_name if self.kernel_name else '*'}", + ] + build_args = ["--build", self.build_temp, f"-j{multiprocessing.cpu_count()}"] + + for stage in range(2): + subprocess.check_call( + [self.cmake, BASE_DIR] + cmake_args + ["-DBUILD_STAGE=" + str(stage)], + cwd=BASE_DIR, + env=os.environ, + ) + subprocess.check_call( + [self.cmake] + build_args, + cwd=BASE_DIR, + env=os.environ, + ) + + +class ExtBuild(build_ext): + def run(self) -> None: + cmake = get_cmake_command() + if not cmake: + raise RuntimeError("CMake must be installed to build the libraries") + self.cmake = cmake + + build_py = self.get_finalized_command("build_py") + mx_driving_dir = os.path.join(BASE_DIR, build_py.build_lib, build_py.get_package_dir("mx_driving")) + if not os.path.exists(mx_driving_dir): + os.makedirs(mx_driving_dir) + + ext_cxx_flags = ["-std=c++17"] + for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]: + val = getattr(torch._C, f"_PYBIND11_{name}") + if val: + ext_cxx_flags.append(f"-D_PYBIND11_{name}={val}") + + cmake_args = [ + "--preset=default", + "-DCMAKE_BUILD_TYPE=Release", + "-B", + self.build_temp, + f"-DMX_DRIVING_PATH={mx_driving_dir}", + f"-DEXT_CXX_FLAGS={' '.join(ext_cxx_flags)}", + f"-DPython3_EXECUTABLE={sys.executable}", + ] + if LooseVersion(torch.__version__) < LooseVersion("2.1.0"): + cmake_args.append("-DCOMPILE_WITH_XLA:BOOL=ON") + build_args = ["--build", self.build_temp, f"-j{multiprocessing.cpu_count()}"] + + subprocess.check_call( + [self.cmake, BASE_DIR] + cmake_args + ["-DBUILD_STAGE=2"], + cwd=BASE_DIR, + env=os.environ, + ) + subprocess.check_call( + [self.cmake] + build_args, + cwd=BASE_DIR, + env=os.environ, + ) + + +class DevelopBuild(develop): + user_options = develop.user_options + [("kernel-name=", None, "Build the single kernel with the specified name")] + + def initialize_options(self) -> None: + super().initialize_options() + self.kernel_name = None + + def install_for_development(self) -> None: + self.reinitialize_command("build_py", build_lib="") + self.reinitialize_command("build_clib", kernel_name=self.kernel_name) + + if self.kernel_name: + self.run_command("build_clib") + return + + self.run_command("egg_info") + self.run_command("build_clib") + self.run_command("build_ext") + + if not self.dry_run: + with os.fdopen( + os.open(self.egg_link, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR), + "w", + encoding="utf-8", + ) as f: + f.write(self.egg_path + "\n" + self.setup_path) + self.process_distribution(None, self.dist, not self.no_deps) def get_sha(pytorch_root: Union[str, Path]) -> str: @@ -49,9 +185,7 @@ def get_sha(pytorch_root: Union[str, Path]) -> str: return "Unknown" -VERSION = "1.0.0" -ads_root = Path(__file__).parent -sha = get_sha(ads_root) +sha = get_sha(BASE_DIR) if not os.getenv("BUILD_WITHOUT_SHA"): VERSION += "+git" + sha[:7] @@ -60,9 +194,14 @@ setup( version=VERSION, description="A Library of acceleration for autonomous driving systems on Ascend-NPU.", keywords="mx_driving", - ext_modules=exts, + ext_modules=[Extension("mx_driving._C", sources=[])], author="Ascend Contributors", - cmdclass={"build_ext": BuildExtension}, + libraries=[("mx_driving", {"sources": []})], + cmdclass={ + "build_clib": CPPLibBuild, + "build_ext": ExtBuild, + "develop": DevelopBuild, + }, packages=find_packages(), include_package_data=True, ) diff --git a/tests/torch/test_bev_pool_v2.py b/tests/torch/test_bev_pool_v2.py index 9870e4ff..ca27c9f5 100644 --- a/tests/torch/test_bev_pool_v2.py +++ b/tests/torch/test_bev_pool_v2.py @@ -3,9 +3,9 @@ import unittest import numpy as np import torch import torch_npu -from ads_c import npu_bev_pool_v2_backward from torch_npu.testing.testcase import TestCase, run_tests +from mx_driving._C import npu_bev_pool_v2_backward from mx_driving.point import bev_pool_v2 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -13,7 +13,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] # pylint: disable=too-many-arguments,huawei-too-many-arguments def golden_bev_pool_v2( - depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c + depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c ): output = np.zeros((b, d, h, w, c), dtype=np.float32) depth = depth.flatten() @@ -29,7 +29,7 @@ def golden_bev_pool_v2( # pylint: disable=too-many-arguments,huawei-too-many-arguments def golden_bev_pool_v2_grad( - grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c + grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c ): grad_depth = np.zeros_like(depth).flatten() grad_feat = np.zeros_like(feat).reshape((-1, c)) @@ -63,10 +63,18 @@ class TestBEVPoolV2(TestCase): @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `bev_pool` is only supported on 910B, skip this ut!") def test_bev_pool_v2(self): - shapes = [[1, 1, 1, 1, 1, 1], [3, 3, 3, 3, 3, 3], [3, 3, 15, 15, 17, 33], [1, 5, 128, 128, 31, 777], [32, 4, 128, 128, 64, 9999]] + shapes = [ + [1, 1, 1, 1, 1, 1], + [3, 3, 3, 3, 3, 3], + [3, 3, 15, 15, 17, 33], + [1, 5, 128, 128, 31, 777], + [32, 4, 128, 128, 64, 9999], + ] for shape in shapes: B, D, H, W, C, N_RANKS = shape - feat, depth, grad_out, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape = generate_bev_pool_data(B, D, H, W, C, N_RANKS) + feat, depth, grad_out, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape = generate_bev_pool_data( + B, D, H, W, C, N_RANKS + ) kept = np.ones(ranks_bev.shape[0], dtype=bool) kept[1:] = ranks_feat[1:] != ranks_feat[:-1] interval_starts = np.where(kept)[0].astype(np.int32) @@ -111,7 +119,19 @@ class TestBEVPoolV2(TestCase): W, ) grad_feat = golden_bev_pool_v2_grad( - grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, B, D, H, W, C + grad_out, + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_starts, + interval_lengths, + B, + D, + H, + W, + C, ) self.assertRtolEqual(bev_feat.detach().cpu().numpy(), bev_feat_cpu) self.assertRtolEqual(grad_feat_npu.cpu().numpy(), grad_feat) diff --git a/tests/torch/test_furthest_point_sampling.py b/tests/torch/test_furthest_point_sampling.py index a637d8eb..5bb718e1 100644 --- a/tests/torch/test_furthest_point_sampling.py +++ b/tests/torch/test_furthest_point_sampling.py @@ -155,8 +155,7 @@ class TestFurthestPointSample(TestCase): self.compare_res(test1) self.compare_res(test2) self.compare_res(test3) - self.compare_res(test4) if __name__ == "__main__": - run_tests() \ No newline at end of file + run_tests() diff --git a/tests/torch/test_group_points_grad.py b/tests/torch/test_group_points_grad.py index 2dd7d3b9..8bf61d3e 100644 --- a/tests/torch/test_group_points_grad.py +++ b/tests/torch/test_group_points_grad.py @@ -4,8 +4,7 @@ import numpy as np import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -import ads_c -import mx_driving +import mx_driving._C DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -50,7 +49,7 @@ class TestGroupPointsGrad(TestCase): golden_grad_features = self.golden_group_points_grad( np_grad_out, np_indices, np_grad_features, B, npoints, nsample) - npu_grad_features = ads_c.group_points_backward(torch_grad_out, torch_indices, B, C, N, + npu_grad_features = mx_driving._C.group_points_backward(torch_grad_out, torch_indices, B, C, N, npoints, nsample) self.assertRtolEqual(golden_grad_features, npu_grad_features.cpu().numpy()) diff --git a/tests/torch/test_hard_voxelize.py b/tests/torch/test_hard_voxelize.py index de10230e..b58dc41b 100644 --- a/tests/torch/test_hard_voxelize.py +++ b/tests/torch/test_hard_voxelize.py @@ -4,7 +4,7 @@ import numpy as np import torch import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -import ads_c +import mx_driving._C DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -22,7 +22,7 @@ class TestHardVoxelize(TestCase): def npu_hard_voxelize(self, points): points_npu = torch.from_numpy(points.astype(np.float32)).npu() - cnt, pts, voxs, num_per_vox = ads_c.hard_voxelize( + cnt, pts, voxs, num_per_vox = mx_driving._C.hard_voxelize( points_npu, [0.075, 0.075, 0.2], [-54, -54, -5, 54, 54, 5], 10, 1000 ) return cnt, voxs.cpu().numpy() diff --git a/tests/torch/test_npu_dynamic_scatter.py b/tests/torch/test_npu_dynamic_scatter.py index bca00da3..eb6a15d1 100644 --- a/tests/torch/test_npu_dynamic_scatter.py +++ b/tests/torch/test_npu_dynamic_scatter.py @@ -6,7 +6,7 @@ import torch_npu from torch_npu.testing.testcase import TestCase, run_tests from torch_npu.testing.common_utils import create_common_tensor -import ads_c +import mx_driving._C import mx_driving.point @@ -119,7 +119,7 @@ class TestDynamicScatter(TestCase): self.grad_cpu_op_exec([golden_result, grad_voxel_feats.contiguous().cpu(), prefix_sum_point_per_voxel.cpu(), argsort_coor.cpu(), compare_mask.cpu()], reduce_type) - ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), + mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type) self.assertRtolEqual(golden_result.cpu().numpy(), grad_point_feats.cpu().numpy()) @@ -141,7 +141,7 @@ class TestDynamicScatter(TestCase): self.grad_cpu_op_exec([golden_result, grad_voxel_feats.contiguous().cpu(), prefix_sum_point_per_voxel.cpu(), argsort_coor.cpu(), compare_mask.cpu()], reduce_type) - ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), + mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type) self.assertRtolEqual(golden_result.cpu().numpy(), grad_point_feats.cpu().numpy()) @@ -163,7 +163,7 @@ class TestDynamicScatter(TestCase): self.grad_cpu_op_exec([golden_result, grad_voxel_feats.contiguous().cpu(), prefix_sum_point_per_voxel.cpu(), argsort_coor.cpu(), compare_mask.cpu()], reduce_type) - ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), + mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type) self.assertRtolEqual(golden_result.cpu().numpy(), grad_point_feats.cpu().numpy()) diff --git a/tests/torch/test_point_to_voxel.py b/tests/torch/test_point_to_voxel.py index c01d134d..ce8a0578 100644 --- a/tests/torch/test_point_to_voxel.py +++ b/tests/torch/test_point_to_voxel.py @@ -4,7 +4,7 @@ import numpy as np import torch import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -import ads_c +import mx_driving._C DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -32,7 +32,7 @@ class TestPointToVoxel(TestCase): def npu_encode(self, coords): coords_npu = torch.from_numpy(coords.view(np.float32)).npu() - return ads_c.point_to_voxel(coords_npu, [], [], "XYZ").cpu().numpy().view(np.int32) + return mx_driving._C.point_to_voxel(coords_npu, [], [], "XYZ").cpu().numpy().view(np.int32) @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `PointToVoxel` is only supported on 910B, skip this ut!") def test_point_to_voxel(self): diff --git a/tests/torch/test_unique_voxel.py b/tests/torch/test_unique_voxel.py index d49cbe68..ba0e36b5 100644 --- a/tests/torch/test_unique_voxel.py +++ b/tests/torch/test_unique_voxel.py @@ -4,7 +4,7 @@ import numpy as np import torch import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -import ads_c +import mx_driving._C DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -24,7 +24,7 @@ class TestUniqueVoxel(TestCase): def npu_unique(self, voxels): voxels_npu = torch.from_numpy(voxels).npu() - cnt, uni_vox, _, _, _ = ads_c.unique_voxel(voxels_npu) + cnt, uni_vox, _, _, _ = mx_driving._C.unique_voxel(voxels_npu) return cnt, uni_vox.cpu().numpy() def gen_integration(self, point_num): @@ -55,9 +55,9 @@ class TestUniqueVoxel(TestCase): def npu_integration(self, coords): coords_npu = torch.from_numpy(coords.view(np.float32)).npu() - voxels_npu = ads_c.point_to_voxel(coords_npu, [], [], "XYZ") - cnt, uni_vox, _, _, _ = ads_c.unique_voxel(voxels_npu) - dec = ads_c.voxel_to_point(uni_vox, [], [], "XYZ") + voxels_npu = mx_driving._C.point_to_voxel(coords_npu, [], [], "XYZ") + cnt, uni_vox, _, _, _ = mx_driving._C.unique_voxel(voxels_npu) + dec = mx_driving._C.voxel_to_point(uni_vox, [], [], "XYZ") return cnt, dec.cpu().numpy() @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `PointToVoxel` is only supported on 910B, skip this ut!") diff --git a/tests/torch/test_vec_pool_backward.py b/tests/torch/test_vec_pool_backward.py index 2125ba11..ce128a21 100644 --- a/tests/torch/test_vec_pool_backward.py +++ b/tests/torch/test_vec_pool_backward.py @@ -4,7 +4,7 @@ import numpy as np import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -import ads_c +import mx_driving._C DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -60,7 +60,7 @@ class TestVecPoolGrad(TestCase): golden_grad_support_features = self.golden_vec_pool_backward( np_grad_new_features, np_point_cnt_of_grid, np_grouped_idxs, np_grad_support_features) - real_grad_support_features = ads_c.vec_pool_backward( + real_grad_support_features = mx_driving._C.vec_pool_backward( torch_grad_new_features, torch_point_cnt_of_grid, torch_grouped_idxs, n, c_in) self.assertRtolEqual(golden_grad_support_features, real_grad_support_features.cpu().numpy()) diff --git a/tests/torch/test_voxel_pooling_train.py b/tests/torch/test_voxel_pooling_train.py index ac372802..e713523d 100644 --- a/tests/torch/test_voxel_pooling_train.py +++ b/tests/torch/test_voxel_pooling_train.py @@ -1,25 +1,26 @@ -import unittest import copy +import unittest + +import numpy as np import torch import torch_npu - from torch_npu.testing.testcase import TestCase, run_tests -import ads_c -import numpy as np + import mx_driving.point DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] # pylint: disable=too-many-arguments,huawei-too-many-arguments -def voxel_pooling_train_cpu_forward(batch_size, num_points, num_channels, num_voxel_x, - num_voxel_y, num_voxel_z, geom_xyz, input_features): +def voxel_pooling_train_cpu_forward( + batch_size, num_points, num_channels, num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, input_features +): dtype = input_features.dtype pos_memo = torch.zeros((batch_size, num_points, 3), dtype=torch.int32) * -1 output_features = torch.zeros((batch_size, num_voxel_y, num_voxel_x, num_channels), dtype=dtype) for i in range(batch_size): for j in range(num_points): - + sample_x = geom_xyz[i][j][0] sample_y = geom_xyz[i][j][1] sample_z = geom_xyz[i][j][2] @@ -29,7 +30,7 @@ def voxel_pooling_train_cpu_forward(batch_size, num_points, num_channels, num_vo if sample_y < 0 or sample_y >= num_voxel_y: continue if sample_z < 0 or sample_z >= num_voxel_z: - continue + continue pos_memo[i][j][0] = i pos_memo[i][j][1] = geom_xyz[i][j][1] @@ -44,14 +45,11 @@ def voxel_pooling_train_cpu_backward(pos, result_cpu, grad_features): features_shape = grad_features.shape mask = (pos != -1)[..., 0] - grad_features = grad_features.reshape( - grad_features.shape[0], -1, grad_features.shape[-1]) + grad_features = grad_features.reshape(grad_features.shape[0], -1, grad_features.shape[-1]) - grad_features[mask] = result_cpu[pos[mask][..., 0].long( - ), :, pos[mask][..., 1].long(), pos[mask][..., 2].long()] + grad_features[mask] = result_cpu[pos[mask][..., 0].long(), :, pos[mask][..., 1].long(), pos[mask][..., 2].long()] - grad_features = grad_features.reshape( - features_shape) + grad_features = grad_features.reshape(features_shape) return grad_features @@ -60,24 +58,23 @@ class TestVoxelPoolingTrain(TestCase): batch_size = input_features.shape[0] num_points = input_features.shape[1] num_channels = input_features.shape[2] - pos, result = voxel_pooling_train_cpu_forward(batch_size, num_points, num_channels, voxel_num[0], - voxel_num[1], voxel_num[2], geom_xyz, input_features) + pos, result = voxel_pooling_train_cpu_forward( + batch_size, num_points, num_channels, voxel_num[0], voxel_num[1], voxel_num[2], geom_xyz, input_features + ) pos_memo = pos grad_features_cpu = torch.zeros_like(input_features) - grad_features_cpu = voxel_pooling_train_cpu_backward( - pos_memo, result, grad_features_cpu) + grad_features_cpu = voxel_pooling_train_cpu_backward(pos_memo, result, grad_features_cpu) return pos, result, grad_features_cpu def npu_to_exec(self, geom_xyz, input_features, voxel_num): - result = mx_driving.point.npu_voxel_pooling_train( - geom_xyz, input_features, voxel_num) + result = mx_driving.point.npu_voxel_pooling_train(geom_xyz, input_features, voxel_num) result.backward(result) grad_features_npu = input_features.grad return result, grad_features_npu - + def gen_data(self, geom_shape, feature_shape, coeff, batch_size, num_channels, dtype): geom_xyz = torch.rand(geom_shape) * coeff geom_xyz = geom_xyz.reshape(batch_size, -1, 3) @@ -89,18 +86,16 @@ class TestVoxelPoolingTrain(TestCase): features_npu = features_cpu.npu() features_npu.requires_grad = True return geom_xyz_cpu, features_cpu, geom_xyz_npu, features_npu - - @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `VoxelPoolingTrain` is only supported on 910B, skip this ut!") + + @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `VoxelPoolingTrain` is only supported on 910B, skip this ut!") def test_voxel_pooling_train(self): - torch.npu.set_device('npu:0') - types = [torch.float32, ] + torch.npu.set_device("npu:0") + types = [ + torch.float32, + ] batch_size_list = [1, 2] num_channels_list = [32, 80] - shape_list = [ - [30, 25], - [25, 12, 40], - [20] - ] + shape_list = [[30, 25], [25, 12, 40], [20]] coeff = 90 voxel_num = [128, 128, 1] # test @@ -114,18 +109,19 @@ class TestVoxelPoolingTrain(TestCase): feature_shape.append(num_channel) geom_shape.append(3) geom_cpu, feature_cpu, geom_npu, feature_npu = self.gen_data( - geom_shape, feature_shape, coeff, batch_size, num_channel, dtype) + geom_shape, feature_shape, coeff, batch_size, num_channel, dtype + ) pos, cpu_result, cpu_grad_features = self.cpu_to_exec(geom_cpu, feature_cpu, voxel_num) npu_result, npu_grad_features = self.npu_to_exec(geom_npu, feature_npu, voxel_num) - + cpu_result = cpu_result.numpy() npu_result = npu_result.detach().cpu().numpy() self.assertRtolEqual(cpu_result, npu_result) - + cpu_grad_features = cpu_grad_features.numpy() npu_grad_features = npu_grad_features.cpu().numpy() self.assertRtolEqual(cpu_grad_features, npu_grad_features) if __name__ == "__main__": - run_tests() \ No newline at end of file + run_tests() diff --git a/tests/torch/test_voxel_to_point.py b/tests/torch/test_voxel_to_point.py index 9dc1d0c6..35b21bff 100644 --- a/tests/torch/test_voxel_to_point.py +++ b/tests/torch/test_voxel_to_point.py @@ -4,7 +4,7 @@ import numpy as np import torch import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -import ads_c +import mx_driving._C DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] @@ -29,7 +29,7 @@ class TestVoxelToPoint(TestCase): def npu_decode(self, voxels): voxels_npu = torch.from_numpy(voxels.view(np.int32)).npu() - return ads_c.voxel_to_point(voxels_npu, [], [], "XYZ").cpu().numpy() + return mx_driving._C.voxel_to_point(voxels_npu, [], [], "XYZ").cpu().numpy() @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `PointToVoxel` is only supported on 910B, skip this ut!") def test_point_to_voxel(self): diff --git a/utils/extension.py b/utils/extension.py deleted file mode 100644 index 57908424..00000000 --- a/utils/extension.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2022 Huawei Technologies Co., Ltd -# All rights reserved. -# -# Licensed under the BSD 3-Clause License (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://opensource.org/licenses/BSD-3-Clause -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -import site -from pkg_resources import parse_version - -import setuptools -import torch -import torch.utils.cpp_extension as TorchExtension - -try: - import torch_npu - PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.realpath(torch_npu.__file__)) -except: - site_packages_path = site.getsitepackages() - PYTORCH_NPU_INSTALL_PATH = site_packages_path[0] + "/torch_npu/" - - -def NpuExtension(name, sources, *args, **kwargs): - r''' - Creates a :class:`setuptools.Extension` for C++. - - Convenience method that creates a :class:`setuptools.Extension` with the - bare minimum (but often sufficient) arguments to build a C++ extension. - - All arguments are forwarded to the :class:`setuptools.Extension` - constructor. - - Example: - >>> from setuptools import setup - >>> from torch_npu.utils.cpp_extension import NpuExtension - >>> setup( - name='extension', - ext_modules=[ - NpuExtension( - name='extension', - sources=['extension.cpp'], - extra_compile_args=['-g']), - ], - cmdclass={ - 'build_ext': BuildExtension - }) - ''' - - torch_npu_dir = PYTORCH_NPU_INSTALL_PATH - include_dirs = kwargs.get('include_dirs', []) - include_dirs.append(os.path.join(torch_npu_dir, 'include')) - include_dirs += TorchExtension.include_paths() - kwargs['include_dirs'] = include_dirs - - library_dirs = kwargs.get('library_dirs', []) - library_dirs.append(os.path.join(torch_npu_dir, 'lib')) - library_dirs += TorchExtension.library_paths() - kwargs['library_dirs'] = library_dirs - - libraries = kwargs.get('libraries', []) - libraries.append('c10') - libraries.append('torch') - libraries.append('torch_cpu') - libraries.append('torch_python') - libraries.append('torch_npu') - kwargs['libraries'] = libraries - - kwargs['language'] = 'c++' - - define_macros = [] - if parse_version(torch.__version__) < parse_version('2.1.0'): - define_macros += [('COMPILE_WITH_XLA', None)] - kwargs['define_macros'] = define_macros - - return setuptools.Extension(name, sources, *args, **kwargs) -- Gitee