From 4dea4cbce6a383176a492d99cd82a27b61540d28 Mon Sep 17 00:00:00 2001
From: chenmingkai <chenmingkai3@huawei.com>
Date: Tue, 15 Oct 2024 11:32:06 +0800
Subject: [PATCH] Optimize build pipeline

---
 .gitignore                                    |   4 +-
 CMakeLists.txt                                | 223 +-----------------
 CMakePresets.json                             |   4 +-
 MANIFEST.in                                   |   1 -
 bind/CMakeLists.txt                           |   5 +
 bind/pybind.cpp                               |   1 -
 ci/build.sh                                   |  38 +--
 cmake/config.cmake                            |   6 +-
 cmake/func.cmake                              | 136 +++++------
 cmake/stage_0.cmake                           |  11 +
 cmake/stage_1.cmake                           | 212 +++++++++++++++++
 cmake/stage_2.cmake                           |  48 ++++
 cmake/util/ascendc_bin_param_build.py         |   2 +
 cmake/util/const_var.py                       |   1 +
 docs/api/README.md                            |   4 +-
 include/csrc/pybind.h                         |   2 +-
 .../bevformer/dense_heads/panoseg_occ_head.py |   8 +-
 mx_driving/__init__.py                        |   4 +-
 mx_driving/common/CMakeLists.txt              |   7 +
 mx_driving/common/ops/assign_score_withk.py   |  11 +-
 mx_driving/common/ops/csrc/CMakeLists.txt     |   5 +
 mx_driving/common/ops/hypot.py                |  14 +-
 mx_driving/common/ops/knn.py                  |   6 +-
 mx_driving/common/ops/npu_hypot.py            |  18 ++
 .../common/ops/npu_scatter_mean_grad.py       |   4 +-
 mx_driving/common/ops/scatter_max.py          |   6 +-
 mx_driving/common/ops/scatter_mean.py         |   4 +-
 mx_driving/common/ops/sort_pairs.py           |   6 +-
 mx_driving/common/ops/threeNN.py              |   6 +-
 mx_driving/common/ops/three_interpolate.py    |   6 +-
 mx_driving/detection/CMakeLists.txt           |   6 +-
 mx_driving/detection/ops/border_align.py      |  35 ++-
 mx_driving/detection/ops/box_iou.py           |  21 +-
 mx_driving/detection/ops/boxes_overlap_bev.py |   5 +-
 mx_driving/detection/ops/csrc/CMakeLists.txt  |   5 +
 mx_driving/detection/ops/nms3d_normal.py      |   4 +-
 mx_driving/detection/ops/npu_nms3d.py         |   4 +-
 mx_driving/detection/ops/roi_align_rotated.py |   6 +-
 mx_driving/detection/ops/rotated_iou.py       |   4 +-
 mx_driving/detection/ops/rotated_overlaps.py  |   4 +-
 mx_driving/fused/CMakeLists.txt               |   6 +-
 mx_driving/fused/ops/csrc/CMakeLists.txt      |   5 +
 mx_driving/fused/ops/deform_conv2d.py         |   6 +-
 mx_driving/fused/ops/fused_bias_leaky_relu.py |   4 +-
 .../fused/ops/modulated_deform_conv2d.py      |   6 +-
 mx_driving/fused/ops/npu_add_relu.py          |   6 +-
 .../fused/ops/npu_deformable_aggregation.py   |   6 +-
 mx_driving/fused/ops/npu_max_pool2d.py        |   4 +-
 ...pu_multi_scale_deformable_attn_function.py |   6 +-
 mx_driving/point/CMakeLists.txt               |   8 +
 mx_driving/point/ops/bev_pool.py              |   7 +-
 mx_driving/point/ops/bev_pool_v2.py           |  41 ++--
 mx_driving/point/ops/csrc/CMakeLists.txt      |   5 +
 .../point/ops/furthest_point_sampling.py      |   4 +-
 .../ops/furthest_point_sampling_with_dist.py  |   4 +-
 mx_driving/point/ops/group_points.py          |   6 +-
 mx_driving/point/ops/npu_dynamic_scatter.py   |  12 +-
 mx_driving/point/ops/voxel_pooling_train.py   |   6 +-
 mx_driving/point/ops/voxelization.py          |   6 +-
 mx_driving/preprocess/CMakeLists.txt          |   6 +-
 mx_driving/preprocess/ops/csrc/CMakeLists.txt |   5 +
 .../preprocess/ops/npu_points_in_box.py       |   4 +-
 .../preprocess/ops/npu_points_in_box_all.py   |   4 +-
 .../preprocess/ops/npu_roipoint_pool3d.py     |   4 +-
 mx_driving/spconv/CMakeLists.txt              |   8 +
 mx_driving/spconv/ops/csrc/CMakeLists.txt     |   5 +
 mx_driving/spconv/ops/sparse_functional.py    |  20 +-
 setup.py                                      | 213 ++++++++++++++---
 tests/torch/test_bev_pool_v2.py               |  32 ++-
 tests/torch/test_furthest_point_sampling.py   |   3 +-
 tests/torch/test_group_points_grad.py         |   5 +-
 tests/torch/test_hard_voxelize.py             |   4 +-
 tests/torch/test_npu_dynamic_scatter.py       |   8 +-
 tests/torch/test_point_to_voxel.py            |   4 +-
 tests/torch/test_unique_voxel.py              |  10 +-
 tests/torch/test_vec_pool_backward.py         |   4 +-
 tests/torch/test_voxel_pooling_train.py       |  64 +++--
 tests/torch/test_voxel_to_point.py            |   4 +-
 utils/extension.py                            |  85 -------
 79 files changed, 869 insertions(+), 668 deletions(-)
 delete mode 100644 MANIFEST.in
 create mode 100644 bind/CMakeLists.txt
 create mode 100644 cmake/stage_0.cmake
 create mode 100644 cmake/stage_1.cmake
 create mode 100644 cmake/stage_2.cmake
 create mode 100644 mx_driving/common/ops/csrc/CMakeLists.txt
 create mode 100644 mx_driving/common/ops/npu_hypot.py
 create mode 100644 mx_driving/detection/ops/csrc/CMakeLists.txt
 create mode 100644 mx_driving/fused/ops/csrc/CMakeLists.txt
 create mode 100644 mx_driving/point/ops/csrc/CMakeLists.txt
 create mode 100644 mx_driving/preprocess/ops/csrc/CMakeLists.txt
 create mode 100644 mx_driving/spconv/ops/csrc/CMakeLists.txt
 delete mode 100644 utils/extension.py

diff --git a/.gitignore b/.gitignore
index 414718b3..35e70a2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 __pycache__/
 .DS_Store
 .idea
-cmake-build-debug
\ No newline at end of file
+cmake-build-debug
+build
+*.egg-info/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a7ee7922..f58f410e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,11 @@
-cmake_minimum_required(VERSION 3.16.0)
-project(opp)
-set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER})
+cmake_minimum_required(VERSION 3.19.0)
+project(mx_driving)
 
 include(cmake/config.cmake)
 include(cmake/func.cmake)
 include(cmake/intf.cmake)
 
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/bind)
 set(MX_DRIVING_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mx_driving)
 add_subdirectory(${MX_DRIVING_DIR}/common)
 add_subdirectory(${MX_DRIVING_DIR}/preprocess)
@@ -14,215 +14,10 @@ add_subdirectory(${MX_DRIVING_DIR}/point)
 add_subdirectory(${MX_DRIVING_DIR}/detection)
 add_subdirectory(${MX_DRIVING_DIR}/spconv)
 
-opbuild(OPS_SRC ${ASCEND_HOST_SRC} OUT_DIR ${ASCEND_AUTOGEN_PATH})
-
-add_library(cust_op_proto SHARED ${ASCEND_HOST_SRC}
-                                 ${ASCEND_AUTOGEN_PATH}/op_proto.cc)
-target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB)
-target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden)
-target_link_libraries(
-  cust_op_proto
-  PRIVATE intf_pub
-          exe_graph
-          register
-          tiling_api
-          -Wl,--whole-archive
-          rt2_registry
-          -Wl,--no-whole-archive)
-set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME cust_opsproto_rt2.0)
-install_target(
-  TRG cust_op_proto DST
-  packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR})
-install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST
-             packages/vendors/${vendor_name}/op_proto/inc)
-
-add_library(cust_optiling SHARED ${ASCEND_HOST_SRC})
-target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB)
-target_compile_options(cust_optiling PRIVATE -fvisibility=hidden)
-target_link_libraries(
-  cust_optiling
-  PRIVATE intf_pub
-          exe_graph
-          register
-          tiling_api
-          -Wl,--whole-archive
-          rt2_registry
-          -Wl,--no-whole-archive)
-set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME cust_opmaster_rt2.0)
-install_target(
-  TRG
-  cust_optiling
-  DST
-  packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}
-)
-# create liboptiling.so link
-add_custom_command(
-  TARGET cust_optiling
-  POST_BUILD
-  COMMAND
-    ${CMAKE_COMMAND} -E chdir
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling
-    ${CMAKE_COMMAND} -E create_symlink
-    lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$<TARGET_FILE_NAME:cust_optiling>
-    liboptiling.so)
-install(
-  FILES
-    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so
-  DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling)
-
-if(${ENABLE_ONNX})
-  if(CANN_PATHS)
-    if(${ARCH} STREQUAL "aarch64")
-      protobuf_generate(
-        PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto
-        OUT_DIR ${ASCEND_AUTOGEN_PATH})
-    else()
-      protobuf_generate(
-        PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto
-        OUT_DIR ${ASCEND_AUTOGEN_PATH})
-    endif()
-  else()
-    protobuf_generate(
-      PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto
-      OUT_DIR ${ASCEND_AUTOGEN_PATH})
-  endif()
-
-  add_library(cust_onnx_parsers SHARED ${ASCEND_ONNX_SRC})
-  target_compile_options(
-    cust_onnx_parsers
-    PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private
-            "-fno-common" "-fno-strict-aliasing")
-  target_link_libraries(cust_onnx_parsers PRIVATE intf_pub)
-  target_include_directories(
-    cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include
-                              ${ASCEND_AUTOGEN_PATH})
-
-  install_target(TRG cust_onnx_parsers DST
-                 packages/vendors/${vendor_name}/framework/onnx/)
-endif()
-
-# ===================Build ACLNN===================
-file(GLOB ACLNN_SRC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
-file(GLOB ACLNN_INC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.h)
-set(ACLNN_SRC ${ACLNN_SRC_GEN} ${ACLNN_SRC_CUSTOM})
-set(ACLNN_INC ${ACLNN_INC_GEN} ${ACLNN_INC_CUSTOM})
-add_library(cust_opapi SHARED ${ACLNN_SRC})
-target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi)
-install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib)
-install_file(TRG cust_opapi SRC ${ACLNN_INC} DST
-             packages/vendors/${vendor_name}/op_api/include)
-
-# ===================Build Kernel===================
-# set custom compile options
-if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
-  add_ops_compile_options(ALL OPTIONS -g -O0)
-endif()
-
-file(COPY ${ASCEND_KERNEL_SRC} DESTINATION ${ASCEND_KERNEL_PATH})
-
-foreach(compute_unit ${ASCEND_COMPUTE_UNIT})
-  if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini)
-    # generate aic-${compute_unit}-ops-info.json
-    add_ops_info_target(
-      TARGET
-      ops_info_gen_${compute_unit}
-      OUTPUT
-      ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json
-      OPS_INFO
-      ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-      INSTALL_DIR
-      packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}
-    )
-
-    # generate ascendc impl py once
-    if(NOT TARGET ascendc_impl_gen)
-      add_ops_impl_target(
-        TARGET
-        ascendc_impl_gen
-        OPS_INFO
-        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-        IMPL_DIR
-        ${ASCEND_KERNEL_PATH}
-        OUT_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl
-      )
-      install_file(
-        TRG
-        ascendc_impl_gen
-        SRC
-        ${ASCEND_KERNEL_SRC}
-        DST
-        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
-      )
-    endif()
-
-    # dynamic shape binary compile
-    if(${ENABLE_BINARY_PACKAGE})
-      add_bin_compile_target(
-        TARGET
-        ascendc_bin_${compute_unit}
-        OPS_INFO
-        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
-        IMPL_DIR
-        ${ASCEND_KERNEL_PATH}
-        ADP_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
-        OUT_DIR
-        ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
-        KERNEL_DIR
-        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
-        INSTALL_DIR
-        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
-        COMPUTE_UNIT
-        ${compute_unit})
-      add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen cust_optiling)
-    endif()
-  endif()
-endforeach()
-
-# generate npu_supported_ops.json
-add_npu_support_target(
-  TARGET
-  npu_supported_ops
-  OPS_INFO_DIR
-  ${ASCEND_AUTOGEN_PATH}
-  OUT_DIR
-  ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core
-  INSTALL_DIR
-  packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE})
-
-# ===================Build test===================
-# WARN: WIP
-if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
-  add_subdirectory(testcases)
-endif()
-
-get_system_info(SYSTEM_INFO)
-
-# gen version.info
-add_custom_target(
-  gen_version_info ALL
-  COMMAND
-    bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh
-    ${ASCEND_CANN_PACKAGE_PATH} ${MX_DRIVING_PATH}/packages/vendors/${vendor_name})
-
-install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info
-        DESTINATION packages/vendors/${vendor_name})
-
-if(COMPILE_OPP_PACKAGE)
-  # CPack config
-  set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
-  set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION})
-  set(CPACK_PACKAGE_DESCRIPTION "CPack opp project")
-  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project")
-  set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX})
-  set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run")
-  set(CPACK_GENERATOR External)
-  set(CPACK_CMAKE_GENERATOR "Unix Makefiles")
-  set(CPACK_EXTERNAL_ENABLE_STAGING TRUE)
-  set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake)
-  set(CPACK_EXTERNAL_BUILT_PACKAGES
-      ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}
-  )
-  include(CPack)
+if(BUILD_STAGE EQUAL 0)
+  include(cmake/stage_0.cmake)
+elseif(BUILD_STAGE EQUAL 1)
+  include(cmake/stage_1.cmake)
+elseif(BUILD_STAGE EQUAL 2)
+  include(cmake/stage_2.cmake)
 endif()
diff --git a/CMakePresets.json b/CMakePresets.json
index dd0b3d58..abe6215a 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -11,7 +11,7 @@
             "displayName": "Default Config",
             "description": "Default build using Unix Makefiles generator",
             "generator": "Unix Makefiles",
-            "binaryDir": "${sourceDir}/build_out",
+            "binaryDir": "${sourceDir}/build",
             "cacheVariables": {
                 "CMAKE_BUILD_TYPE": {
                     "type": "STRING",
@@ -43,7 +43,7 @@
                 },
                 "CMAKE_INSTALL_PREFIX": {
                     "type": "PATH",
-                    "value": "${sourceDir}/build_out"
+                    "value": "${sourceDir}/build"
                 },
                 "ENABLE_ONNX": {
                     "type": "BOOL",
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 3450ea38..00000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1 +0,0 @@
-recursive-include mx_driving/packages/ *
diff --git a/bind/CMakeLists.txt b/bind/CMakeLists.txt
new file mode 100644
index 00000000..4a75d495
--- /dev/null
+++ b/bind/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Collect the binding sources of this directory and append them to the
+# ASCEND_CSRC_SRC cache variable, which cmake/stage_2.cmake compiles into _C.
+file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+set(ASCEND_CSRC_SRC
+    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
+    CACHE INTERNAL "")
diff --git a/bind/pybind.cpp b/bind/pybind.cpp
index b12f0644..a227bee8 100644
--- a/bind/pybind.cpp
+++ b/bind/pybind.cpp
@@ -1,5 +1,4 @@
 #include "csrc/pybind.h"
-
 #include <torch/extension.h>
 
 #include <mutex>
diff --git a/ci/build.sh b/ci/build.sh
index 2dc41f8a..f0d8254e 100644
--- a/ci/build.sh
+++ b/ci/build.sh
@@ -67,40 +67,7 @@ function main()
     export BUILD_PYTHON_VERSION=${PY_VERSION}
     rm -rf ${BUILD_PACKAGES_DIR}
 
-    if [ "x${SINGLE_OP}" != "x" ]; then
-      if [ -z "$ASCEND_CUSTOM_OPP_PATH" ]; then
-        echo "ASCEND_CUSTOM_OPP_PATH is not set. Please set the path of the custom op kernel code."
-        exit 1
-      fi
-      bash ${SCRIPTS_DIR}/build_kernel.sh --single_op=${SINGLE_OP} --build_type=${BUILD_TYPE}
-
-      if [ $? != 0 ]; then
-          echo "Failed to compile the wheel file. Please check the source code by yourself."
-          exit 1
-      fi
-
-      echo "Successfully compiled the single op: ${SINGLE_OP}"
-      echo "copying the custom op kernel code to the custom opp path: ${ASCEND_CUSTOM_OPP_PATH}"
-      cp -ruf ${BUILD_PACKAGES_DIR}/vendors/customize/op_impl/ai_core/tbe/kernel/* ${ASCEND_CUSTOM_OPP_PATH}/op_impl/ai_core/tbe/kernel/
-      exit 0
-    else
-      bash ${SCRIPTS_DIR}/build_kernel.sh --build_type=${BUILD_TYPE}
-    fi
-
-    if [ $? != 0 ]; then
-        echo "Failed to compile the wheel file. Please check the source code by yourself."
-        exit 1
-    fi
-    cd ${CUR_DIR}/..
-    rm -rf build
-    if [ -d "mx_driving.egg-info" ]; then
-        echo "mx_driving.egg-info exist"
-        rm -rf mx_driving.egg-info
-    else
-        echo "mx_driving.egg-info not exist"
-    fi
-
-    python"${PY_VERSION}" setup.py build bdist_wheel
+    python"${PY_VERSION}" setup.py bdist_wheel
     if [ $? != 0 ]; then
         echo "Failed to compile the wheel file. Please check the source code by yourself."
         exit 1
diff --git a/cmake/config.cmake b/cmake/config.cmake
index d4fe96a2..eef45176 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -29,7 +29,7 @@ if(EXISTS ${ASCEND_PATH}/latest/compiler)
 endif()
 
 if("${CANN_PATHS}x" STREQUAL "x")
-  # read vertion from `latest/version.cfg`
+  # read version from `latest/version.cfg`
   file(READ "${ASCEND_PATH}/latest/version.cfg" ASCEND_VERSION_CFG)
   string(REGEX MATCH "(CANN-[0-9]\.[0-9]+)\]\n$" _ ${ASCEND_VERSION_CFG})
   message(STATUS "ASCEND_VERSION: ${CMAKE_MATCH_1}")
@@ -59,7 +59,9 @@ set(ASCEND_TENSOR_COMPILER_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler)
 set(ASCEND_CCEC_COMPILER_PATH ${ASCEND_TENSOR_COMPILER_PATH}/ccec_compiler/bin)
 set(ASCEND_AUTOGEN_PATH ${CMAKE_BINARY_DIR}/autogen)
 set(ASCEND_KERNEL_PATH ${CMAKE_BINARY_DIR}/kernels)
-set(MX_DRIVING_PATH ${PROJECT_SOURCE_DIR}/mx_driving)
+set(ASCEND_CSRC_SRC
+    ""
+    CACHE STRING "csrc source files")
 set(ASCEND_HOST_SRC
     ""
     CACHE STRING "host source files")
diff --git a/cmake/func.cmake b/cmake/func.cmake
index 3f532ae4..9acc3038 100644
--- a/cmake/func.cmake
+++ b/cmake/func.cmake
@@ -1,9 +1,8 @@
 function(install_target)
   cmake_parse_arguments(INSTALL_TARGET "" "DST;TRG" "" ${ARGN})
   set_target_properties(
-    ${INSTALL_TARGET_TRG}
-    PROPERTIES LIBRARY_OUTPUT_DIRECTORY
-               ${MX_DRIVING_PATH}/${INSTALL_TARGET_DST})
+    ${INSTALL_TARGET_TRG} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
+                                     ${MX_DRIVING_PATH}/${INSTALL_TARGET_DST})
   install(TARGETS ${INSTALL_TARGET_TRG}
           LIBRARY DESTINATION ${INSTALL_TARGET_DST})
 endfunction()
@@ -41,33 +40,31 @@ function(opbuild)
                         "OPS_SRC" ${ARGN})
   set(CANN_INCLUDE_PATH "")
   set(CANN_LIB_PATH "")
-  ## if the CANN_PATHS not empty
+  # if the CANN_PATHS not empty
   if(CANN_PATHS)
-    ## if the arch is aarch64, add the include path
+    # if the arch is aarch64, add the include path
     if(${ARCH} STREQUAL "aarch64")
       set(CANN_INCLUDE_PATH ${CANN_PATHS}/aarch64-linux/include)
       set(CANN_LIB_PATH ${CANN_PATHS}/aarch64-linux/lib64)
-    else ()
+    else()
       set(CANN_INCLUDE_PATH ${CANN_PATHS}/x86_64-linux/include)
       set(CANN_LIB_PATH ${CANN_PATHS}/x86_64-linux/lib64)
     endif()
   endif()
   if(NOT EXISTS ${CANN_INCLUDE_PATH})
-      message(FATAL_ERROR "CANN include path not found: ${CANN_PATHS}")
+    message(FATAL_ERROR "CANN include path not found: ${CANN_PATHS}")
   endif()
-    if(NOT EXISTS ${CANN_LIB_PATH})
-        message(FATAL_ERROR "CANN lib path not found: ${CANN_PATHS}")
+  if(NOT EXISTS ${CANN_LIB_PATH})
+    message(FATAL_ERROR "CANN lib path not found: ${CANN_PATHS}")
   endif()
   message(STATUS "CANN include path: ${CANN_INCLUDE_PATH}")
   message(STATUS "CANN lib path: ${CANN_LIB_PATH}")
   # filter single op
-  if (NOT "${SINGLE_OP}x" STREQUAL "x")
-    list(FILTER OPBUILD_OPS_SRC INCLUDE REGEX ${SINGLE_OP})
-  endif()
   execute_process(
     COMMAND
       ${CMAKE_COMPILE} -g -fPIC -shared -std=c++11 ${OPBUILD_OPS_SRC}
-      -D_GLIBCXX_USE_CXX11_ABI=0 -I ${CANN_INCLUDE_PATH} -L ${CANN_LIB_PATH} -lexe_graph -lregister -ltiling_api -o
+      -D_GLIBCXX_USE_CXX11_ABI=0 -I ${CANN_INCLUDE_PATH} -L ${CANN_LIB_PATH}
+      -lexe_graph -lregister -ltiling_api -o
       ${OPBUILD_OUT_DIR}/libascend_all_ops.so
     RESULT_VARIABLE EXEC_RESULT
     OUTPUT_VARIABLE EXEC_INFO
@@ -180,61 +177,68 @@ function(add_bin_compile_target)
   endif()
   add_custom_target(${BINCMP_TARGET} COMMAND cp -r ${BINCMP_IMPL_DIR}/*.*
                                              ${BINCMP_OUT_DIR}/src)
-  add_custom_target(
-    ${BINCMP_TARGET}_gen_ops_config ALL
-    COMMAND
-      ${ASCEND_PYTHON_EXECUTABLE}
-      ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py -p
-      ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}
-    COMMAND
-      ${ASCEND_PYTHON_EXECUTABLE}
-      ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py -p
-      ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT} -s ${BINCMP_COMPUTE_UNIT})
-  file(GLOB bin_scripts ${BINCMP_OUT_DIR}/gen/*.sh)
-  foreach(bin_script ${bin_scripts})
-    get_filename_component(bin_file ${bin_script} NAME_WE)
-    string(REPLACE "-" ";" bin_sep ${bin_file})
-    list(GET bin_sep 0 op_type)
-    list(GET bin_sep 1 op_file)
-    list(GET bin_sep 2 op_index)
-    if(NOT TARGET ${BINCMP_TARGET}_${op_file}_copy)
-      add_custom_target(
-        ${BINCMP_TARGET}_${op_file}_copy
-        COMMAND cp ${BINCMP_ADP_DIR}/${op_file}.py
-                ${BINCMP_OUT_DIR}/src/${op_type}.py
-        DEPENDS ascendc_impl_gen)
-      install(
-        DIRECTORY ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file}
-        DESTINATION ${BINCMP_INSTALL_DIR}/${BINCMP_COMPUTE_UNIT}
-        OPTIONAL)
-      install(
-        FILES ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/${op_file}.json
-        DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT}
-        OPTIONAL)
-    endif()
+
+  file(GLOB bin_scripts ${BINCMP_OUT_DIR}/gen/*${KERNEL_NAME}*.sh)
+  # Only define the build targets when at least one generated script matched.
+  if(bin_scripts)
     add_custom_target(
-      ${BINCMP_TARGET}_${op_file}_${op_index}
+      ${BINCMP_TARGET}_gen_ops_config ALL
       COMMAND
-        export HI_PYTHON=${ASCEND_PYTHON_EXECUTABLE} && export
-        ASCEND_CUSTOM_OPP_PATH=${MX_DRIVING_PATH}/packages/vendors/${vendor_name}
-        && bash ${CMAKE_SOURCE_DIR}/scripts/retry.sh \"bash ${bin_script} ${BINCMP_OUT_DIR}/src/${op_type}.py
-${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file}\"
-      WORKING_DIRECTORY ${BINCMP_OUT_DIR})
-    add_dependencies(${BINCMP_TARGET}_${op_file}_${op_index} ${BINCMP_TARGET}
-                     ${BINCMP_TARGET}_${op_file}_copy)
-    add_dependencies(${BINCMP_TARGET}_gen_ops_config
-                     ${BINCMP_TARGET}_${op_file}_${op_index})
-  endforeach()
-  add_custom_command(
-    TARGET ${BINCMP_TARGET}_gen_ops_config
-    POST_BUILD
-    COMMAND mv ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/*.json
-            ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT})
-  install(
-    FILES
-      ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/binary_info_config.json
-    DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT}
-    OPTIONAL)
+        ${ASCEND_PYTHON_EXECUTABLE}
+        ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py -p
+        ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}
+      COMMAND
+        ${ASCEND_PYTHON_EXECUTABLE}
+        ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py -p
+        ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT} -s ${BINCMP_COMPUTE_UNIT})
+
+    foreach(bin_script ${bin_scripts})
+      get_filename_component(bin_file ${bin_script} NAME_WE)
+      string(REPLACE "-" ";" bin_sep ${bin_file})
+      list(GET bin_sep 0 op_type)
+      list(GET bin_sep 1 op_file)
+      list(GET bin_sep 2 op_index)
+      if(NOT TARGET ${BINCMP_TARGET}_${op_file}_copy)
+        add_custom_target(
+          ${BINCMP_TARGET}_${op_file}_copy
+          COMMAND cp ${BINCMP_ADP_DIR}/${op_file}.py
+                  ${BINCMP_OUT_DIR}/src/${op_type}.py
+          DEPENDS ascendc_impl_gen)
+        install(
+          DIRECTORY ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file}
+          DESTINATION ${BINCMP_INSTALL_DIR}/${BINCMP_COMPUTE_UNIT}
+          OPTIONAL)
+        install(
+          FILES
+            ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/${op_file}.json
+          DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT}
+          OPTIONAL)
+      endif()
+      add_custom_target(
+        ${BINCMP_TARGET}_${op_file}_${op_index}
+        COMMAND
+          export HI_PYTHON=${ASCEND_PYTHON_EXECUTABLE} && export
+          ASCEND_CUSTOM_OPP_PATH=${MX_DRIVING_PATH}/packages/vendors/${vendor_name}
+          && bash ${CMAKE_SOURCE_DIR}/scripts/retry.sh \"bash ${bin_script}
+          ${BINCMP_OUT_DIR}/src/${op_type}.py
+          ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/${op_file}\"
+        WORKING_DIRECTORY ${BINCMP_OUT_DIR})
+      add_dependencies(${BINCMP_TARGET}_${op_file}_${op_index} ${BINCMP_TARGET}
+                       ${BINCMP_TARGET}_${op_file}_copy)
+      add_dependencies(${BINCMP_TARGET}_gen_ops_config
+                       ${BINCMP_TARGET}_${op_file}_${op_index})
+    endforeach()
+    add_custom_command(
+      TARGET ${BINCMP_TARGET}_gen_ops_config
+      POST_BUILD
+      COMMAND mv ${BINCMP_KERNEL_DIR}/${BINCMP_COMPUTE_UNIT}/*.json
+              ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT})
+    install(
+      FILES
+        ${BINCMP_KERNEL_DIR}/config/${BINCMP_COMPUTE_UNIT}/binary_info_config.json
+      DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT}
+      OPTIONAL)
+  endif()
 endfunction()
 
 function(protobuf_generate)
diff --git a/cmake/stage_0.cmake b/cmake/stage_0.cmake
new file mode 100644
index 00000000..426e8015
--- /dev/null
+++ b/cmake/stage_0.cmake
@@ -0,0 +1,15 @@
+# Stage 0: compile the host sources (ASCEND_HOST_SRC) into one shared library
+# and run CANN's op_build tool on it; op_build receives ASCEND_AUTOGEN_PATH as
+# its second argument (presumably the output directory - confirm).
+add_library(ascend_all_ops SHARED ${ASCEND_HOST_SRC})
+target_compile_options(ascend_all_ops PRIVATE -g -fPIC -std=c++11
+                                              -D_GLIBCXX_USE_CXX11_ABI=0)
+target_include_directories(ascend_all_ops PRIVATE ${CANN_INCLUDE_PATH})
+target_link_libraries(ascend_all_ops PRIVATE intf_pub exe_graph register
+                                             tiling_api)
+# Run op_build on the freshly linked library each time the target is rebuilt.
+add_custom_command(
+  TARGET ascend_all_ops
+  POST_BUILD
+  COMMAND ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build
+          $<TARGET_FILE:ascend_all_ops> ${ASCEND_AUTOGEN_PATH})
diff --git a/cmake/stage_1.cmake b/cmake/stage_1.cmake
new file mode 100644
index 00000000..9a176248
--- /dev/null
+++ b/cmake/stage_1.cmake
@@ -0,0 +1,212 @@
+# ===================Build proto ===================
+add_library(cust_op_proto SHARED ${ASCEND_AUTOGEN_PATH}/op_proto.cc)
+target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB)
+target_compile_options(cust_op_proto PRIVATE -fvisibility=hidden)
+target_link_libraries(
+  cust_op_proto
+  PRIVATE intf_pub
+          exe_graph
+          register
+          tiling_api
+          -Wl,--whole-archive
+          rt2_registry
+          -Wl,--no-whole-archive)
+set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME cust_opsproto_rt2.0)
+install_target(
+  TRG cust_op_proto DST
+  packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR})
+install_file(TRG cust_op_proto SRC ${ASCEND_AUTOGEN_PATH}/op_proto.h DST
+             packages/vendors/${vendor_name}/op_proto/inc)
+
+add_library(cust_optiling SHARED ${ASCEND_HOST_SRC})
+target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB)
+target_compile_options(cust_optiling PRIVATE -fvisibility=hidden)
+target_link_libraries(
+  cust_optiling
+  PRIVATE intf_pub
+          exe_graph
+          register
+          tiling_api
+          -Wl,--whole-archive
+          rt2_registry
+          -Wl,--no-whole-archive)
+set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME cust_opmaster_rt2.0)
+install_target(
+  TRG
+  cust_optiling
+  DST
+  packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}
+)
+# create liboptiling.so link
+add_custom_command(
+  TARGET cust_optiling
+  POST_BUILD
+  COMMAND
+    ${CMAKE_COMMAND} -E chdir
+    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling
+    ${CMAKE_COMMAND} -E create_symlink
+    lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$<TARGET_FILE_NAME:cust_optiling>
+    liboptiling.so)
+install(
+  FILES
+    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/liboptiling.so
+  DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling)
+
+if(${ENABLE_ONNX})
+  if(CANN_PATHS)
+    if(${ARCH} STREQUAL "aarch64")
+      protobuf_generate(
+        PROTO_FILE ${CANN_PATHS}/aarch64-linux/include/proto/ge_onnx.proto
+        OUT_DIR ${ASCEND_AUTOGEN_PATH})
+    else()
+      protobuf_generate(
+        PROTO_FILE ${CANN_PATHS}/x86_64-linux/include/proto/ge_onnx.proto
+        OUT_DIR ${ASCEND_AUTOGEN_PATH})
+    endif()
+  else()
+    protobuf_generate(
+      PROTO_FILE ${ASCEND_CANN_PACKAGE_PATH}/include/proto/ge_onnx.proto
+      OUT_DIR ${ASCEND_AUTOGEN_PATH})
+  endif()
+
+  add_library(cust_onnx_parsers SHARED ${ASCEND_ONNX_SRC})
+  target_compile_options(
+    cust_onnx_parsers
+    PRIVATE -O2 -Werror -Wno-deprecated-declarations -Dgoogle=ascend_private
+            "-fno-common" "-fno-strict-aliasing")
+  target_link_libraries(cust_onnx_parsers PRIVATE intf_pub)
+  target_include_directories(
+    cust_onnx_parsers PRIVATE ${PROJECT_SOURCE_DIR}/include
+                              ${ASCEND_AUTOGEN_PATH})
+
+  install_target(TRG cust_onnx_parsers DST
+                 packages/vendors/${vendor_name}/framework/onnx/)
+endif()
+
+# ===================Build ACLNN===================
+file(GLOB ACLNN_SRC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
+file(GLOB ACLNN_INC_GEN ${ASCEND_AUTOGEN_PATH}/aclnn_*.h)
+set(ACLNN_SRC ${ACLNN_SRC_GEN} ${ACLNN_SRC_CUSTOM})
+set(ACLNN_INC ${ACLNN_INC_GEN} ${ACLNN_INC_CUSTOM})
+add_library(cust_opapi SHARED ${ACLNN_SRC})
+target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase opapi)
+install_target(TRG cust_opapi DST packages/vendors/${vendor_name}/op_api/lib)
+install_file(TRG cust_opapi SRC ${ACLNN_INC} DST
+             packages/vendors/${vendor_name}/op_api/include)
+
+# ===================Build Kernel===================
+# set custom compile options
+if("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
+  add_ops_compile_options(ALL OPTIONS -g -O0)
+endif()
+
+file(COPY ${ASCEND_KERNEL_SRC} DESTINATION ${ASCEND_KERNEL_PATH})
+
+foreach(compute_unit ${ASCEND_COMPUTE_UNIT})
+  if(EXISTS ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini)
+    # generate aic-${compute_unit}-ops-info.json
+    add_ops_info_target(
+      TARGET
+      ops_info_gen_${compute_unit}
+      OUTPUT
+      ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}/aic-${compute_unit}-ops-info.json
+      OPS_INFO
+      ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+      INSTALL_DIR
+      packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit}
+    )
+
+    # generate ascendc impl py once
+    if(NOT TARGET ascendc_impl_gen)
+      add_ops_impl_target(
+        TARGET
+        ascendc_impl_gen
+        OPS_INFO
+        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+        IMPL_DIR
+        ${ASCEND_KERNEL_PATH}
+        OUT_DIR
+        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl
+      )
+      install_file(
+        TRG
+        ascendc_impl_gen
+        SRC
+        ${ASCEND_KERNEL_SRC}
+        DST
+        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
+      )
+    endif()
+
+    # dynamic shape binary compile
+    if(${ENABLE_BINARY_PACKAGE})
+      add_bin_compile_target(
+        TARGET
+        ascendc_bin_${compute_unit}
+        OPS_INFO
+        ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+        IMPL_DIR
+        ${ASCEND_KERNEL_PATH}
+        ADP_DIR
+        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic
+        OUT_DIR
+        ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
+        KERNEL_DIR
+        ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
+        INSTALL_DIR
+        packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel
+        COMPUTE_UNIT
+        ${compute_unit})
+      add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen
+                       cust_optiling)
+    endif()
+  endif()
+endforeach()
+
+# generate npu_supported_ops.json
+add_npu_support_target(
+  TARGET
+  npu_supported_ops
+  OPS_INFO_DIR
+  ${ASCEND_AUTOGEN_PATH}
+  OUT_DIR
+  ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_info_cfg/ai_core
+  INSTALL_DIR
+  packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE})
+
+# ===================Build test===================
+# WARN: WIP
+if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
+  add_subdirectory(testcases)
+endif()
+
+get_system_info(SYSTEM_INFO)
+
+# gen version.info
+add_custom_target(
+  gen_version_info ALL
+  COMMAND
+    bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh
+    ${ASCEND_CANN_PACKAGE_PATH}
+    ${MX_DRIVING_PATH}/packages/vendors/${vendor_name})
+
+install(FILES ${MX_DRIVING_PATH}/packages/vendors/${vendor_name}/version.info
+        DESTINATION packages/vendors/${vendor_name})
+
+if(COMPILE_OPP_PACKAGE)
+  # CPack config
+  set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME})
+  set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION})
+  set(CPACK_PACKAGE_DESCRIPTION "CPack opp project")
+  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project")
+  set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX})
+  set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run")
+  set(CPACK_GENERATOR External)
+  set(CPACK_CMAKE_GENERATOR "Unix Makefiles")
+  set(CPACK_EXTERNAL_ENABLE_STAGING TRUE)
+  set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake)
+  set(CPACK_EXTERNAL_BUILT_PACKAGES
+      ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}
+  )
+  include(CPack)
+endif()
diff --git a/cmake/stage_2.cmake b/cmake/stage_2.cmake
new file mode 100644
index 00000000..2f736ef3
--- /dev/null
+++ b/cmake/stage_2.cmake
@@ -0,0 +1,74 @@
+# Stage 2: build the Python extension module _C from the C++ sources collected
+# in ASCEND_CSRC_SRC, compiling and linking against torch and torch_npu.
+set(Python3_USE_STATIC_LIBS FALSE)
+# The interpreter is mandatory (it is executed below). Development is requested
+# but not required, so environments without an embeddable libpython are not
+# rejected at this point.
+find_package(
+  Python3 REQUIRED
+  COMPONENTS Interpreter
+  OPTIONAL_COMPONENTS Development)
+
+# Ask the interpreter for the directory of the installed torch package.
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} -c
+          "import os; import torch; print(os.path.dirname(torch.__file__))"
+  RESULT_VARIABLE TORCH_PATH_RESULT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE TORCH_PATH)
+if(NOT TORCH_PATH_RESULT EQUAL 0)
+  message(FATAL_ERROR "Cannot locate torch using ${Python3_EXECUTABLE}")
+endif()
+# Resolve torch_npu through the import system (find_spec does not import the
+# package) instead of assuming it lives in the first site-packages directory.
+execute_process(
+  COMMAND
+    ${Python3_EXECUTABLE} -c
+    "import importlib.util, os; print(os.path.dirname(importlib.util.find_spec('torch_npu').origin))"
+  RESULT_VARIABLE TORCH_NPU_PATH_RESULT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+  OUTPUT_VARIABLE TORCH_NPU_PATH)
+if(NOT TORCH_NPU_PATH_RESULT EQUAL 0)
+  message(FATAL_ERROR "Cannot locate torch_npu using ${Python3_EXECUTABLE}")
+endif()
+message("TORCH_PATH is ${TORCH_PATH}")
+message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
+
+# separate_arguments splits the EXT_CXX_FLAGS string on whitespace so that each
+# flag reaches the compiler as its own argument.
+set(EXT_CXX_FLAGS "${EXT_CXX_FLAGS}")
+separate_arguments(EXT_CXX_FLAGS)
+add_library(_C SHARED ${ASCEND_CSRC_SRC})
+# Test the variable by name: an unset COMPILE_WITH_XLA is simply false, whereas
+# ${COMPILE_WITH_XLA} would expand to an empty condition.
+if(COMPILE_WITH_XLA)
+  target_compile_definitions(_C PRIVATE COMPILE_WITH_XLA)
+endif()
+# NOTE(review): -fprofile-arcs/-ftest-coverage (and the gcov link below) add
+# coverage instrumentation to every build of _C; confirm this is intended
+# outside of test builds.
+target_compile_options(
+  _C
+  PRIVATE -fprofile-arcs
+          -ftest-coverage
+          -fPIC
+          -g
+          -O3
+          -fstack-protector-all
+          -DTORCH_API_INCLUDE_EXTENSION_H
+          -DTORCH_EXTENSION_NAME=_C
+          -D_GLIBCXX_USE_CXX11_ABI=0
+          -D__FILENAME__=__FILE__
+          ${EXT_CXX_FLAGS})
+target_link_directories(_C PRIVATE ${TORCH_PATH}/lib ${TORCH_NPU_PATH}/lib)
+target_include_directories(
+  _C
+  PRIVATE ${Python3_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/include
+          ${TORCH_NPU_PATH}/include ${TORCH_PATH}/include
+          ${TORCH_PATH}/include/torch/csrc/api/include)
+target_link_libraries(_C PRIVATE gcov c10 torch torch_python torch_npu)
+# The file name is composed from MX_DRIVING_PATH, the interpreter's SOABI tag,
+# an empty prefix (no "lib") and a ".so" suffix.
+set_target_properties(
+  _C
+  PROPERTIES OUTPUT_NAME "${MX_DRIVING_PATH}/_C.${Python3_SOABI}"
+             PREFIX ""
+             SUFFIX ".so")
diff --git a/cmake/util/ascendc_bin_param_build.py b/cmake/util/ascendc_bin_param_build.py
index 63b15c90..a46d397f 100644
--- a/cmake/util/ascendc_bin_param_build.py
+++ b/cmake/util/ascendc_bin_param_build.py
@@ -110,6 +110,8 @@ class BinParamBuilder(opdesc_parser.OpDesc):
             fd.write(chk)
             chk = const_var.CHK_CMD.format(res_file=bin_file + '.o')
             fd.write(chk)
+            chm = const_var.CHM_CMD
+            fd.write(chm)
             fd.write('echo "[{}] Generating {} Done"\n'.format(hard_soc, bin_file))
 
 
diff --git a/cmake/util/const_var.py b/cmake/util/const_var.py
index 85988e9e..f0d28170 100644
--- a/cmake/util/const_var.py
+++ b/cmake/util/const_var.py
@@ -29,5 +29,6 @@ if ! test -f $2/{res_file} ; then
   exit 1
 fi
 '''
+CHM_CMD = 'chmod -R 755 $2\n'
 ATTR_DEF_VAL = {'str' : '', 'int': 0, 'float': 0.0, 'bool': False, 'list_bool': [],
                 'list_int': [], 'list_float': [], 'list_list_int': [[]]}
diff --git a/docs/api/README.md b/docs/api/README.md
index 3afd2d59..381bb23d 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -871,7 +871,7 @@ voxel_feats, voxel_coors = npu_dynamic_scatter(feats, coors, 'max')
 ## unique_voxel
 ### 接口原型
 ```python
-ads_c.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor
+mx_driving._C.unique_voxel(Tensor voxels) -> int, Tensor, Tensor, Tensor, Tensor
 ```
 ### 功能描述
 对输入的点云数据进行去重处理。
@@ -894,7 +894,7 @@ N的大小受限于内存大小，建议N小于等于2^32。
 import torch
 import torch_npu
 import numpy as np
-from ads_c import unique_voxel
+from mx_driving._C import unique_voxel
 voxels = np.random.randint(0, 1024, (100000,)).astype(np.int32)
 voxels_npu = torch.from_numpy(voxels).npu()
 num_voxels, uni_voxels, uni_indices, argsort_indices, uni_argsort_indices = unique_voxel(voxels_npu)
diff --git a/include/csrc/pybind.h b/include/csrc/pybind.h
index b5eb79e4..49ac2037 100644
--- a/include/csrc/pybind.h
+++ b/include/csrc/pybind.h
@@ -15,7 +15,7 @@
 // limitations under the License.
 #ifndef CSRC_PYBIND_H_
 #define CSRC_PYBIND_H_
-#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
 
 void init_common(pybind11::module& m);
 void init_fused(pybind11::module& m);
diff --git a/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py b/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py
index 48447994..c5c69fe3 100644
--- a/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py
+++ b/model_examples/PanoOcc/projects/mmdet3d_plugin/bevformer/dense_heads/panoseg_occ_head.py
@@ -26,16 +26,16 @@ import mmcv
 import cv2 as cv
 from projects.mmdet3d_plugin.models.utils.visual import save_tensor
 import mx_driving.common
-import ads_c
+import mx_driving._C
 from mmdet.models.builder import build_loss
 
 
 def custom_unique_n3(coors, return_inverse, return_counts, dim):
     # assert dim == 0
 
-    voxels = ads_c.point_to_voxel(coors, [], [], "ZYX")
-    cnt, unq_voxels, unq_ind, argsort_ind, _ = ads_c.unique_voxel(voxels)
-    unq_coors = ads_c.voxel_to_point(unq_voxels, [], [], "ZYX")
+    voxels = mx_driving._C.point_to_voxel(coors, [], [], "ZYX")
+    cnt, unq_voxels, unq_ind, argsort_ind, _ = mx_driving._C.unique_voxel(voxels)
+    unq_coors = mx_driving._C.voxel_to_point(unq_voxels, [], [], "ZYX")
     
     if return_inverse:
         sorted_ind = torch.argsort(argsort_ind.to(torch.float32), dim=dim).to(torch.long)
diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py
index 3b73dec6..3ab0077d 100644
--- a/mx_driving/__init__.py
+++ b/mx_driving/__init__.py
@@ -2,7 +2,7 @@ import os
 
 import torch
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 def _set_env():
@@ -15,7 +15,7 @@ def _set_env():
     os.environ["ASCEND_CUSTOM_OPP_PATH"] = ascend_custom_opp_path
 
     mx_driving_op_api_so_path = os.path.join(mx_driving_opp_path, "op_api", "lib", "libcust_opapi.so")
-    ads_c._init_op_api_so_path(mx_driving_op_api_so_path)
+    mx_driving._C._init_op_api_so_path(mx_driving_op_api_so_path)
 
 
 _set_env()
diff --git a/mx_driving/common/CMakeLists.txt b/mx_driving/common/CMakeLists.txt
index 0a8fa99d..807aa0c6 100644
--- a/mx_driving/common/CMakeLists.txt
+++ b/mx_driving/common/CMakeLists.txt
@@ -2,3 +2,10 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels)
   add_subdirectory(ops/kernels)
 endif()
 
+if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx)
+  add_subdirectory(ops/onnx/plugin)
+endif()
+
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc)
+  add_subdirectory(ops/csrc)
+endif()
diff --git a/mx_driving/common/ops/assign_score_withk.py b/mx_driving/common/ops/assign_score_withk.py
index c375eaf4..f17773ad 100644
--- a/mx_driving/common/ops/assign_score_withk.py
+++ b/mx_driving/common/ops/assign_score_withk.py
@@ -6,24 +6,25 @@ Modification date: 2024-10-06
 Modification Description: 
 Modification 1. Add support for Ascend NPU
 """
+
 import torch
+import torch_npu
 from torch.autograd import Function
 from torch.nn import Module
 
-import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AssignScoreWithkFunction(Function):
     @staticmethod
     def forward(ctx, *args):
         scores, point_features, center_features, knn_idx, aggregate = args
-        agg = {'sum': 0, 'avg': 1, 'max': 2}
+        agg = {"sum": 0, "avg": 1, "max": 2}
         B, N, M, out_dim = point_features.size()
         _, npoint, K, _ = scores.size()
         agg_idx = 0 if aggregate not in agg.keys() else agg[aggregate]
         output = point_features.new_zeros((B, out_dim, npoint, K))
-        ads_c.assign_score_withk(
+        mx_driving._C.assign_score_withk(
             point_features.contiguous(),
             center_features.contiguous(),
             scores.contiguous(),
@@ -38,4 +39,4 @@ class AssignScoreWithkFunction(Function):
             agg_idx
         )
         return output
-assign_score_withk = AssignScoreWithkFunction.apply
\ No newline at end of file
+assign_score_withk = AssignScoreWithkFunction.apply
diff --git a/mx_driving/common/ops/csrc/CMakeLists.txt b/mx_driving/common/ops/csrc/CMakeLists.txt
new file mode 100644
index 00000000..4a75d495
--- /dev/null
+++ b/mx_driving/common/ops/csrc/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Collect this directory's C++ sources and headers and append them to the
+# ASCEND_CSRC_SRC list shared across directories.
+file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+# CACHE INTERNAL makes the accumulated list visible outside this directory scope.
+# NOTE(review): cache entries persist between configure runs, so this appends
+# again on every re-configure unless ASCEND_CSRC_SRC is reset elsewhere; confirm.
+set(ASCEND_CSRC_SRC
+    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
+    CACHE INTERNAL "")
diff --git a/mx_driving/common/ops/hypot.py b/mx_driving/common/ops/hypot.py
index df4315f0..0eb760bd 100644
--- a/mx_driving/common/ops/hypot.py
+++ b/mx_driving/common/ops/hypot.py
@@ -1,26 +1,29 @@
 """
 Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
 """
+
 import torch
+import torch_npu
 from torch.autograd import Function
 
-import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class Hypot(Function):
     @staticmethod
     def forward(ctx, x, y):
         x_broadcasted, y_broadcasted = torch.broadcast_tensors(x, y)
-        out = ads_c.npu_hypot(x_broadcasted.contiguous(), y_broadcasted.contiguous())
-        ctx.save_for_backward(x, y, out);
+        out = mx_driving._C.npu_hypot(x_broadcasted.contiguous(), y_broadcasted.contiguous())
+        ctx.save_for_backward(x, y, out)
         return out
 
     @staticmethod
     def backward(ctx, out_grad):
         x, y, out = ctx.saved_tensors
         x_broadcasted, y_broadcasted = torch.broadcast_tensors(x, y)
-        x_grad, y_grad = ads_c.npu_hypot_grad(x_broadcasted.contiguous(), y_broadcasted.contiguous(), out, out_grad)
+        x_grad, y_grad = mx_driving._C.npu_hypot_grad(
+            x_broadcasted.contiguous(), y_broadcasted.contiguous(), out, out_grad
+        )
 
         # reshape the broadcasted tensors to origin tensors and sum the grad
         for dim, size in enumerate(x.shape):
@@ -32,4 +35,5 @@ class Hypot(Function):
 
         return x_grad, y_grad
 
+
 hypot = Hypot.apply
diff --git a/mx_driving/common/ops/knn.py b/mx_driving/common/ops/knn.py
index 0079669c..bbd9cd5a 100644
--- a/mx_driving/common/ops/knn.py
+++ b/mx_driving/common/ops/knn.py
@@ -12,7 +12,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AdsKnn(Function):
@@ -43,7 +43,7 @@ class AdsKnn(Function):
             print('center_xyz and xyz should be on the same device.')
             return None
 
-        dist2, idx = ads_c.knn(xyz, center_xyz, k, True)
+        dist2, idx = mx_driving._C.knn(xyz, center_xyz, k, True)
         zeros_idx = torch.zeros(xyz.shape[0], center_xyz.shape[1], k, dtype=torch.int32).npu()
         idx.where(dist2 >= 1e10, zeros_idx)
         idx = idx.transpose(2, 1).contiguous() # [B, k, npoint]
@@ -51,4 +51,4 @@ class AdsKnn(Function):
         return idx.int()
 
 
-knn = AdsKnn.apply
\ No newline at end of file
+knn = AdsKnn.apply
diff --git a/mx_driving/common/ops/npu_hypot.py b/mx_driving/common/ops/npu_hypot.py
new file mode 100644
index 00000000..25cbfe8b
--- /dev/null
+++ b/mx_driving/common/ops/npu_hypot.py
@@ -0,0 +1,26 @@
+"""
+Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+"""
+
+import torch
+import torch_npu
+from torch.autograd import Function
+
+import mx_driving._C
+
+
+class Hypot(Function):
+    """Forward-only autograd wrapper around ``mx_driving._C.npu_hypot``."""
+
+    @staticmethod
+    def forward(ctx, x, y):
+        """Broadcast ``x`` and ``y`` together and pass them to ``npu_hypot``.
+
+        No ``backward`` is defined and nothing is saved on ``ctx``.
+        """
+        # Broadcasting yields expanded views; make them contiguous before the native call.
+        x_broadcasted, y_broadcasted = torch.broadcast_tensors(x, y)
+        return mx_driving._C.npu_hypot(x_broadcasted.contiguous(), y_broadcasted.contiguous())
+
+
+npu_hypot = Hypot.apply
diff --git a/mx_driving/common/ops/npu_scatter_mean_grad.py b/mx_driving/common/ops/npu_scatter_mean_grad.py
index 57a597cf..d460f7bf 100644
--- a/mx_driving/common/ops/npu_scatter_mean_grad.py
+++ b/mx_driving/common/ops/npu_scatter_mean_grad.py
@@ -6,12 +6,12 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class ScatterMeanGradFunction(Function):
     @staticmethod
     def forward(ctx, grad_out, index, dim):
-        result = ads_c.npu_scatter_mean_grad(grad_out, index, dim)
+        result = mx_driving._C.npu_scatter_mean_grad(grad_out, index, dim)
         return result
 npu_scatter_mean_grad = ScatterMeanGradFunction.apply
\ No newline at end of file
diff --git a/mx_driving/common/ops/scatter_max.py b/mx_driving/common/ops/scatter_max.py
index 2cba77bb..b30c6139 100644
--- a/mx_driving/common/ops/scatter_max.py
+++ b/mx_driving/common/ops/scatter_max.py
@@ -11,13 +11,13 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class ScatterMaxFunction(Function):
     @staticmethod
     def forward(ctx, updates, indices, out=None):
-        func = ads_c.scatter_max_with_argmax_v2
+        func = mx_driving._C.scatter_max_with_argmax_v2
         out, argmax = func(updates, indices, out)
         ctx.save_for_backward(argmax, updates)
         return out, argmax
@@ -33,7 +33,7 @@ class ScatterMaxFunction(Function):
         grad_updates_indices_uss = grad_updates_indices[..., 0] * grad_updates_indices.shape[1] + grad_updates_indices[..., 1]
         num_segments = torch.tensor(updates.shape[0] * updates.shape[1]).to(device)
 
-        grad = ads_c.npu_scatter_max_backward(grad_output, grad_updates_indices_uss, num_segments)
+        grad = mx_driving._C.npu_scatter_max_backward(grad_output, grad_updates_indices_uss, num_segments)
 
         return grad.reshape(updates.shape), None, None
 
diff --git a/mx_driving/common/ops/scatter_mean.py b/mx_driving/common/ops/scatter_mean.py
index 61d8019d..cd35b8bb 100644
--- a/mx_driving/common/ops/scatter_mean.py
+++ b/mx_driving/common/ops/scatter_mean.py
@@ -3,13 +3,13 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class ScatterMeanFunction(Function):
     @staticmethod
     def forward(ctx, src, index, out=None, dim=0, dim_size=None):
-        func = ads_c.npu_scatter_mean
+        func = mx_driving._C.npu_scatter_mean
         res, count = func(src, index, out, dim, dim_size)
         return res
 
diff --git a/mx_driving/common/ops/sort_pairs.py b/mx_driving/common/ops/sort_pairs.py
index 99577885..e6c6f0b4 100644
--- a/mx_driving/common/ops/sort_pairs.py
+++ b/mx_driving/common/ops/sort_pairs.py
@@ -1,11 +1,13 @@
-import ads_c
 import torch
 
+import mx_driving._C
+
 
 class SortPairs(torch.autograd.Function):
     @staticmethod
     def forward(ctx, keys_in, values_in, dim, descending=False):
-        res = ads_c.npu_sort_pairs(keys_in, values_in, dim, descending)
+        res = mx_driving._C.npu_sort_pairs(keys_in, values_in, dim, descending)
         return res
 
+
 sort_pairs = SortPairs.apply
diff --git a/mx_driving/common/ops/threeNN.py b/mx_driving/common/ops/threeNN.py
index 11047daf..d259e5d0 100644
--- a/mx_driving/common/ops/threeNN.py
+++ b/mx_driving/common/ops/threeNN.py
@@ -12,7 +12,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AdsThreeNN(Function):
@@ -27,11 +27,11 @@ class AdsThreeNN(Function):
             target = target.float()
             source = source.float()
 
-        dist2, idx = ads_c.knn(source, target, 3, False)
+        dist2, idx = mx_driving._C.knn(source, target, 3, False)
         dist2 = torch.sqrt(dist2)
         if dtype_ == torch.float16:
             dist2 = dist2.half()
         return dist2, idx.int()
 
 
-three_nn = AdsThreeNN.apply
\ No newline at end of file
+three_nn = AdsThreeNN.apply
diff --git a/mx_driving/common/ops/three_interpolate.py b/mx_driving/common/ops/three_interpolate.py
index 05526652..ed237c10 100644
--- a/mx_driving/common/ops/three_interpolate.py
+++ b/mx_driving/common/ops/three_interpolate.py
@@ -14,7 +14,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class ThreeInterpolateFunction(Function):
@@ -27,7 +27,7 @@ class ThreeInterpolateFunction(Function):
         n = indices.size(1)
         ctx.three_interpolate_for_backward = (indices, weight, m)
 
-        func = ads_c.npu_three_interpolate
+        func = mx_driving._C.npu_three_interpolate
         out = func(b, c, m, n, features, indices, weight)
 
         return out
@@ -42,7 +42,7 @@ class ThreeInterpolateFunction(Function):
         grad_out_data = grad_out.data.contiguous().to(torch.float)
         weight = weight.to(torch.float)
 
-        grad_features = ads_c.npu_three_interpolate_backward(b, c, n, m, grad_out_data, idx, weight)
+        grad_features = mx_driving._C.npu_three_interpolate_backward(b, c, n, m, grad_out_data, idx, weight)
 
         if grad_out_dtype == torch.half:
             grad_features = grad_features.to(torch.half)
diff --git a/mx_driving/detection/CMakeLists.txt b/mx_driving/detection/CMakeLists.txt
index 3f1ac043..63ebf651 100644
--- a/mx_driving/detection/CMakeLists.txt
+++ b/mx_driving/detection/CMakeLists.txt
@@ -2,6 +2,10 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels)
   add_subdirectory(ops/kernels)
 endif()
 
-if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx)
+if(${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx)
   add_subdirectory(ops/onnx/plugin)
 endif()
+
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc)
+  add_subdirectory(ops/csrc)
+endif()
diff --git a/mx_driving/detection/ops/border_align.py b/mx_driving/detection/ops/border_align.py
index 242b6dbe..92fac131 100644
--- a/mx_driving/detection/ops/border_align.py
+++ b/mx_driving/detection/ops/border_align.py
@@ -1,14 +1,15 @@
 """
 Copyright (c) OpenMMLab. All rights reserved.
 """
+
 from typing import Any, Optional, Tuple, Union
 
 import torch
-import torch_npu
 import torch.nn as nn
+import torch_npu
 from torch.autograd import Function
 
-import ads_c
+import mx_driving._C
 
 
 class BorderAlignFunction(Function):
@@ -17,19 +18,27 @@ class BorderAlignFunction(Function):
         ctx.pooled_size = pooled_size
         ctx.feature_size = feature_map.size()
         batch_size, num_channels, data_height, data_width = feature_map.size()
-        output = torch.zeros([batch_size, data_height * data_width, ctx.pooled_size + 1, num_channels]).to(feature_map.device)
-
-        ads_c.border_align_forward_npu(
-            feature_map,
-            rois,
-            output,
-            ctx.pooled_size)
-        
+        output = torch.zeros([batch_size, data_height * data_width, ctx.pooled_size + 1, num_channels]).to(
+            feature_map.device
+        )
+
+        mx_driving._C.border_align_forward_npu(feature_map, rois, output, ctx.pooled_size)
+
         npu_outputs, index = output.max(dim=-2)
-        npu_outputs = npu_outputs.reshape([batch_size, data_height * data_width, 4, num_channels // 4]).permute([0, 3, 1, 2]).contiguous()
-        index = index.int().reshape([batch_size, data_height * data_width, 4, num_channels // 4]).permute([0, 3, 1, 2]).contiguous()
+        npu_outputs = (
+            npu_outputs.reshape([batch_size, data_height * data_width, 4, num_channels // 4])
+            .permute([0, 3, 1, 2])
+            .contiguous()
+        )
+        index = (
+            index.int()
+            .reshape([batch_size, data_height * data_width, 4, num_channels // 4])
+            .permute([0, 3, 1, 2])
+            .contiguous()
+        )
         ctx.save_for_backward(rois, index)
 
         return npu_outputs
 
-border_align = BorderAlignFunction.apply
\ No newline at end of file
+
+border_align = BorderAlignFunction.apply
diff --git a/mx_driving/detection/ops/box_iou.py b/mx_driving/detection/ops/box_iou.py
index 6c455f52..dd3cb320 100644
--- a/mx_driving/detection/ops/box_iou.py
+++ b/mx_driving/detection/ops/box_iou.py
@@ -1,27 +1,27 @@
-import ads_c
 import torch
 
+import mx_driving._C
+
 
 class BoxIouQuadri(torch.autograd.Function):
     @staticmethod
     def forward(ctx, boxes_a, boxes_b, mode, aligned):
-        mode_dict = {'iou': 0, 'iof': 1}
+        mode_dict = {"iou": 0, "iof": 1}
         mode_flag = mode_dict[mode]
-        
+
         boxes_a = boxes_a.contiguous()
         boxes_b = boxes_b.contiguous()
-        
-        ious = ads_c.npu_box_iou_quadri(
-            boxes_a, boxes_b, mode_flag, aligned)
+
+        ious = mx_driving._C.npu_box_iou_quadri(boxes_a, boxes_b, mode_flag, aligned)
         return ious
 
 
 class BoxIouRotated(torch.autograd.Function):
     @staticmethod
     def forward(ctx, boxes_a, boxes_b, mode, aligned, clockwise):
-        mode_dict = {'iou': 0, 'iof': 1}
+        mode_dict = {"iou": 0, "iof": 1}
         mode_flag = mode_dict[mode]
-        
+
         if not clockwise:
             flip_mat = boxes_a.new_ones(boxes_a.shape[-1])
             flip_mat[-1] = -1
@@ -29,9 +29,8 @@ class BoxIouRotated(torch.autograd.Function):
             boxes_b = boxes_b * flip_mat
         boxes_a = boxes_a.contiguous()
         boxes_b = boxes_b.contiguous()
-        
-        ious = ads_c.npu_box_iou_rotated(
-            boxes_a, boxes_b, mode_flag, aligned)
+
+        ious = mx_driving._C.npu_box_iou_rotated(boxes_a, boxes_b, mode_flag, aligned)
         return ious
 
 
diff --git a/mx_driving/detection/ops/boxes_overlap_bev.py b/mx_driving/detection/ops/boxes_overlap_bev.py
index 6b6d1874..cd92e3ab 100644
--- a/mx_driving/detection/ops/boxes_overlap_bev.py
+++ b/mx_driving/detection/ops/boxes_overlap_bev.py
@@ -1,13 +1,14 @@
 import warnings
 
-import ads_c
 import torch
 
+import mx_driving._C
+
 
 class BoxesOverlapBev(torch.autograd.Function):
     @staticmethod
     def forward(ctx, boxes_a, boxes_b):
-        area_overlap = ads_c.npu_boxes_overlap_bev(boxes_a, boxes_b)
+        area_overlap = mx_driving._C.npu_boxes_overlap_bev(boxes_a, boxes_b)
         return area_overlap
 
 
diff --git a/mx_driving/detection/ops/csrc/CMakeLists.txt b/mx_driving/detection/ops/csrc/CMakeLists.txt
new file mode 100644
index 00000000..4a75d495
--- /dev/null
+++ b/mx_driving/detection/ops/csrc/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Collect this directory's C++ sources and headers and append them to the
+# ASCEND_CSRC_SRC list shared across directories.
+file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+# CACHE INTERNAL makes the accumulated list visible outside this directory scope.
+# NOTE(review): cache entries persist between configure runs, so this appends
+# again on every re-configure unless ASCEND_CSRC_SRC is reset elsewhere; confirm.
+set(ASCEND_CSRC_SRC
+    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
+    CACHE INTERNAL "")
diff --git a/mx_driving/detection/ops/nms3d_normal.py b/mx_driving/detection/ops/nms3d_normal.py
index 6e604f38..c6b297cc 100644
--- a/mx_driving/detection/ops/nms3d_normal.py
+++ b/mx_driving/detection/ops/nms3d_normal.py
@@ -10,7 +10,7 @@ import torch
 import torch_npu
 from torch.autograd import Function
 from torch.nn import Module
-import ads_c
+import mx_driving._C
 
 
 class AdsNms3dNormalFunction(Function):
@@ -21,7 +21,7 @@ class AdsNms3dNormalFunction(Function):
         order = scores.sort(0, descending=True)[1]
         boxes = boxes[order].contiguous()
 
-        keep, num_out = ads_c.nms3d_normal(boxes, iou_threshold)
+        keep, num_out = mx_driving._C.nms3d_normal(boxes, iou_threshold)
         return order[keep[:num_out].long()].contiguous()
 
 npu_nms3d_normal = AdsNms3dNormalFunction.apply
diff --git a/mx_driving/detection/ops/npu_nms3d.py b/mx_driving/detection/ops/npu_nms3d.py
index 744dfe98..0935ee9b 100644
--- a/mx_driving/detection/ops/npu_nms3d.py
+++ b/mx_driving/detection/ops/npu_nms3d.py
@@ -10,7 +10,7 @@ import torch
 from torch.autograd import Function
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class Nms3dFunction(Function):
@@ -21,7 +21,7 @@ class Nms3dFunction(Function):
         order = scores.sort(0, descending=True)[1]
         boxes = boxes[order].contiguous()
 
-        keep, num_out = ads_c.nms3d(boxes, iou_threshold)
+        keep, num_out = mx_driving._C.nms3d(boxes, iou_threshold)
         return order[keep[:num_out].long()].contiguous()
 
 
diff --git a/mx_driving/detection/ops/roi_align_rotated.py b/mx_driving/detection/ops/roi_align_rotated.py
index 7183c78a..3ed1c3ea 100644
--- a/mx_driving/detection/ops/roi_align_rotated.py
+++ b/mx_driving/detection/ops/roi_align_rotated.py
@@ -8,7 +8,7 @@ import torch_npu
 import torch.nn as nn
 from torch.autograd import Function
 
-import ads_c
+import mx_driving._C
 
 
 class RoIAlignRotatedFunction(Function):
@@ -28,7 +28,7 @@ class RoIAlignRotatedFunction(Function):
 
         output = feature_map.new_zeros(num_rois, ctx.pooled_height, ctx.pooled_width, num_channels).to(feature_map.device)
 
-        ads_c.roi_align_rotated_v2_forward_npu(
+        mx_driving._C.roi_align_rotated_v2_forward_npu(
             feature_map,
             rois,
             output,
@@ -46,7 +46,7 @@ class RoIAlignRotatedFunction(Function):
         feature_map, rois = ctx.saved_tensors
         rois_trans = torch.permute(rois, (1, 0)).contiguous()
         grad_output_trans = torch.permute(grad_output, (0, 2, 3, 1)).contiguous()
-        grad_feature_map = ads_c.npu_roi_align_rotated_grad_v2(
+        grad_feature_map = mx_driving._C.npu_roi_align_rotated_grad_v2(
             feature_map, rois_trans, grad_output_trans,
             ctx.pooled_height, ctx.pooled_width, ctx.spatial_scale,
             ctx.sampling_ratio, ctx.aligned, ctx.clockwise)
diff --git a/mx_driving/detection/ops/rotated_iou.py b/mx_driving/detection/ops/rotated_iou.py
index e82c7a09..a09d88c9 100644
--- a/mx_driving/detection/ops/rotated_iou.py
+++ b/mx_driving/detection/ops/rotated_iou.py
@@ -3,6 +3,6 @@ Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
 """
 import torch
 import torch_npu
-import ads_c
+import mx_driving._C
 
-npu_rotated_iou = ads_c.npu_rotated_iou
+npu_rotated_iou = mx_driving._C.npu_rotated_iou
diff --git a/mx_driving/detection/ops/rotated_overlaps.py b/mx_driving/detection/ops/rotated_overlaps.py
index b992a72f..5afab83d 100644
--- a/mx_driving/detection/ops/rotated_overlaps.py
+++ b/mx_driving/detection/ops/rotated_overlaps.py
@@ -3,6 +3,6 @@ Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
 """
 import torch
 import torch_npu
-import ads_c
+import mx_driving._C
 
-npu_rotated_overlaps = ads_c.npu_rotated_overlaps
+npu_rotated_overlaps = mx_driving._C.npu_rotated_overlaps
diff --git a/mx_driving/fused/CMakeLists.txt b/mx_driving/fused/CMakeLists.txt
index 4b3aa985..807aa0c6 100644
--- a/mx_driving/fused/CMakeLists.txt
+++ b/mx_driving/fused/CMakeLists.txt
@@ -4,4 +4,8 @@ endif()
 
 if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx)
   add_subdirectory(ops/onnx/plugin)
-endif()
\ No newline at end of file
+endif()
+
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc)
+  add_subdirectory(ops/csrc)
+endif()
diff --git a/mx_driving/fused/ops/csrc/CMakeLists.txt b/mx_driving/fused/ops/csrc/CMakeLists.txt
new file mode 100644
index 00000000..4a75d495
--- /dev/null
+++ b/mx_driving/fused/ops/csrc/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Collect this directory's C++ sources and headers and append them to the
+# ASCEND_CSRC_SRC list shared across directories.
+file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+# CACHE INTERNAL makes the accumulated list visible outside this directory scope.
+# NOTE(review): cache entries persist between configure runs, so this appends
+# again on every re-configure unless ASCEND_CSRC_SRC is reset elsewhere; confirm.
+set(ASCEND_CSRC_SRC
+    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
+    CACHE INTERNAL "")
diff --git a/mx_driving/fused/ops/deform_conv2d.py b/mx_driving/fused/ops/deform_conv2d.py
index 6f191942..dcddc6ee 100644
--- a/mx_driving/fused/ops/deform_conv2d.py
+++ b/mx_driving/fused/ops/deform_conv2d.py
@@ -13,7 +13,7 @@ from torch.autograd import Function
 from torch.autograd.function import once_differentiable
 from torch.nn.modules.utils import _pair
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class DeformConv2dFunction(Function):
@@ -41,7 +41,7 @@ class DeformConv2dFunction(Function):
         nhwc_offset = offset.permute(0, 2, 3, 1).contiguous()
         nhwc_weight = weight.permute(0, 2, 3, 1).contiguous()
 
-        out, offset_output = ads_c.deformable_conv2d(
+        out, offset_output = mx_driving._C.deformable_conv2d(
             nhwc_x,
             nhwc_offset,
             nhwc_weight,
@@ -61,7 +61,7 @@ class DeformConv2dFunction(Function):
     def backward(ctx, grad_out):
         nhwc_x, nhwc_offset, nhwc_weight, offset_output = ctx.saved_tensors
         nhwc_grad_out = grad_out.permute(0, 2, 3, 1).contiguous()
-        grad_x, grad_weight, grad_offset = ads_c.deformable_conv2d_backward(
+        grad_x, grad_weight, grad_offset = mx_driving._C.deformable_conv2d_backward(
             nhwc_x,
             nhwc_weight,
             nhwc_offset,
diff --git a/mx_driving/fused/ops/fused_bias_leaky_relu.py b/mx_driving/fused/ops/fused_bias_leaky_relu.py
index aa2a7f79..1ed057af 100644
--- a/mx_driving/fused/ops/fused_bias_leaky_relu.py
+++ b/mx_driving/fused/ops/fused_bias_leaky_relu.py
@@ -11,13 +11,13 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class FusedBiasLeakyReluFunction(Function):
     @staticmethod
     def forward(ctx, x, bias, negative_slop=0.2, scale=2**0.5):
-        y = ads_c.fused_bias_leaky_relu(x, bias, negative_slop, scale)
+        y = mx_driving._C.fused_bias_leaky_relu(x, bias, negative_slop, scale)
         return y
 
 npu_fused_bias_leaky_relu = FusedBiasLeakyReluFunction.apply
\ No newline at end of file
diff --git a/mx_driving/fused/ops/modulated_deform_conv2d.py b/mx_driving/fused/ops/modulated_deform_conv2d.py
index b9030cda..8e5c1ead 100644
--- a/mx_driving/fused/ops/modulated_deform_conv2d.py
+++ b/mx_driving/fused/ops/modulated_deform_conv2d.py
@@ -14,7 +14,7 @@ from torch.autograd import Function
 from torch.autograd.function import once_differentiable
 from torch.nn.modules.utils import _pair
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class ModulatedDeformConv2dFunction(Function):
@@ -45,7 +45,7 @@ class ModulatedDeformConv2dFunction(Function):
         nhwc_weight = weight.permute(0, 2, 3, 1).contiguous()
         nhwc_mask = mask.permute(0, 2, 3, 1).contiguous()
 
-        out, offset_output = ads_c.modulated_deformable_conv2d(
+        out, offset_output = mx_driving._C.modulated_deformable_conv2d(
             nhwc_x,
             nhwc_offset,
             nhwc_mask,
@@ -68,7 +68,7 @@ class ModulatedDeformConv2dFunction(Function):
     def backward(ctx, grad_out):
         nhwc_x, nhwc_offset, nhwc_weight, nhwc_mask, offset_output = ctx.saved_tensors
         nhwc_grad_out = grad_out.permute(0, 2, 3, 1).contiguous()
-        grad_x, grad_weight, _, grad_offset, grad_mask = ads_c.modulated_deformable_conv2d_backward(
+        grad_x, grad_weight, _, grad_offset, grad_mask = mx_driving._C.modulated_deformable_conv2d_backward(
             nhwc_x,
             nhwc_offset,
             nhwc_mask,
diff --git a/mx_driving/fused/ops/npu_add_relu.py b/mx_driving/fused/ops/npu_add_relu.py
index e04feba8..aa08f38a 100644
--- a/mx_driving/fused/ops/npu_add_relu.py
+++ b/mx_driving/fused/ops/npu_add_relu.py
@@ -11,20 +11,20 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AddReluFunction(Function):
     @staticmethod
     def forward(ctx, x, y):
-        x = ads_c.npu_add_relu(x, y)
+        x = mx_driving._C.npu_add_relu(x, y)
         ctx.save_for_backward(x)
         return x
 
     @staticmethod
     def backward(ctx, grad_output):
         x, = ctx.saved_tensors
-        result = ads_c.npu_add_relu_grad(x, grad_output)
+        result = mx_driving._C.npu_add_relu_grad(x, grad_output)
         return result, result
 
 npu_add_relu = AddReluFunction.apply
\ No newline at end of file
diff --git a/mx_driving/fused/ops/npu_deformable_aggregation.py b/mx_driving/fused/ops/npu_deformable_aggregation.py
index 417faa49..d6076fb4 100644
--- a/mx_driving/fused/ops/npu_deformable_aggregation.py
+++ b/mx_driving/fused/ops/npu_deformable_aggregation.py
@@ -4,7 +4,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AdsDeformableAggregation(Function):
@@ -26,7 +26,7 @@ class AdsDeformableAggregation(Function):
         sampling_location = sampling_location.contiguous().float()
         weights = weights.contiguous().float()
 
-        output = ads_c.npu_deformable_aggregation(
+        output = mx_driving._C.npu_deformable_aggregation(
             mc_ms_feat,
             spatial_shape,
             scale_start_index,
@@ -60,7 +60,7 @@ class AdsDeformableAggregation(Function):
         grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
         grad_sampling_location = torch.zeros_like(sampling_location)
         grad_weights = torch.zeros_like(weights)
-        grad_mc_ms_feat, grad_sampling_location, grad_weights = ads_c.npu_deformable_aggregation_grad(
+        grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_grad(
             mc_ms_feat,
             spatial_shape,
             scale_start_index,
diff --git a/mx_driving/fused/ops/npu_max_pool2d.py b/mx_driving/fused/ops/npu_max_pool2d.py
index 38b68afb..fa4e72f0 100644
--- a/mx_driving/fused/ops/npu_max_pool2d.py
+++ b/mx_driving/fused/ops/npu_max_pool2d.py
@@ -7,14 +7,14 @@ Modification Description:
 Modification 1. Add support for Ascend NPU
 """
 from torch.autograd import Function
-import ads_c
+import mx_driving._C
 
 
 class MaxPool2d(Function):
     @staticmethod
     # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
     def forward(ctx, x, kernel_size, stride, padding):
-        y = ads_c.npu_max_pool2d(x, kernel_size, stride, padding)
+        y = mx_driving._C.npu_max_pool2d(x, kernel_size, stride, padding)
         return y
 
 npu_max_pool2d = MaxPool2d.apply
diff --git a/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py b/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py
index 9097d080..ffdb66c5 100644
--- a/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py
+++ b/mx_driving/fused/ops/npu_multi_scale_deformable_attn_function.py
@@ -12,21 +12,21 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class MultiScaleDeformableAttnFunction(Function):
     @staticmethod
     # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
     def forward(ctx, value, shape, offset, locations, weight):
-        result = ads_c.npu_multi_scale_deformable_attn_function(value, shape, offset, locations, weight)
+        result = mx_driving._C.npu_multi_scale_deformable_attn_function(value, shape, offset, locations, weight)
         ctx.save_for_backward(value, shape, offset, locations, weight)
         return result
 
     @staticmethod
     def backward(ctx, grad_output):
         value, shape, offset, locations, weight = ctx.saved_tensors
-        grad_value, grad_locations, grad_weight = ads_c.multi_scale_deformable_attn_grad(
+        grad_value, grad_locations, grad_weight = mx_driving._C.multi_scale_deformable_attn_grad(
             value, shape, offset, locations, weight, grad_output
         )
         return grad_value, None, None, grad_locations, grad_weight
diff --git a/mx_driving/point/CMakeLists.txt b/mx_driving/point/CMakeLists.txt
index 621d1fa9..63ebf651 100644
--- a/mx_driving/point/CMakeLists.txt
+++ b/mx_driving/point/CMakeLists.txt
@@ -1,3 +1,11 @@
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels)
   add_subdirectory(ops/kernels)
 endif()
+
+if(ENABLE_ONNX AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) # bare name: an unset variable must not expand to nothing
+  add_subdirectory(ops/onnx/plugin)
+endif()
+
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc)
+  add_subdirectory(ops/csrc)
+endif()
diff --git a/mx_driving/point/ops/bev_pool.py b/mx_driving/point/ops/bev_pool.py
index 245533d7..4747b852 100644
--- a/mx_driving/point/ops/bev_pool.py
+++ b/mx_driving/point/ops/bev_pool.py
@@ -1,6 +1,7 @@
-import ads_c
 import torch
 
+import mx_driving._C
+
 
 class BEVPool(torch.autograd.Function):
     @staticmethod
@@ -14,7 +15,7 @@ class BEVPool(torch.autograd.Function):
         interval_lengths[-1] = feat.shape[0] - interval_starts[-1]
         geom_feat = geom_feat.int()
 
-        out = ads_c.npu_bev_pool(
+        out = mx_driving._C.npu_bev_pool(
             feat,
             geom_feat,
             interval_lengths,
@@ -36,7 +37,7 @@ class BEVPool(torch.autograd.Function):
         B, D, H, W = ctx.saved_shapes
 
         grad_out = grad_out.contiguous()
-        grad_feat = ads_c.npu_bev_pool_backward(
+        grad_feat = mx_driving._C.npu_bev_pool_backward(
             grad_out,
             geom_feat,
             interval_lengths,
diff --git a/mx_driving/point/ops/bev_pool_v2.py b/mx_driving/point/ops/bev_pool_v2.py
index e8895480..c21ba995 100644
--- a/mx_driving/point/ops/bev_pool_v2.py
+++ b/mx_driving/point/ops/bev_pool_v2.py
@@ -13,15 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import ads_c
 import torch
 
+import mx_driving._C
+
 
 class BEVPoolV2(torch.autograd.Function):
     @staticmethod
     # pylint: disable=too-many-arguments,huawei-too-many-arguments
-    def forward(ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev,
-                bev_feat_shape, interval_starts, interval_lengths):
+    def forward(
+        ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths
+    ):
         ranks_bev = ranks_bev.int()
         depth = depth.contiguous().float()
         feat = feat.contiguous().float()
@@ -31,18 +33,8 @@ class BEVPoolV2(torch.autograd.Function):
         interval_starts = interval_starts.contiguous().int()
 
         (B, D, H, W, C) = bev_feat_shape
-        out = ads_c.npu_bev_pool_v2(
-            depth,
-            feat,
-            ranks_depth,
-            ranks_feat,
-            ranks_bev,
-            interval_lengths,
-            interval_starts,
-            B,
-            D,
-            H,
-            W
+        out = mx_driving._C.npu_bev_pool_v2(
+            depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_lengths, interval_starts, B, D, H, W
         )
 
         ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth)
@@ -56,15 +48,12 @@ class BEVPoolV2(torch.autograd.Function):
         B, D, H, W = ctx.saved_shapes
 
         order = ranks_feat.argsort()
-        ranks_feat, ranks_depth, ranks_bev = \
-            ranks_feat[order], ranks_depth[order], ranks_bev[order]
-        kept = torch.ones(
-            ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+        ranks_feat, ranks_depth, ranks_bev = ranks_feat[order], ranks_depth[order], ranks_bev[order]
+        kept = torch.ones(ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
         kept[1:] = ranks_feat[1:] != ranks_feat[:-1]
         interval_starts_bp = torch.where(kept)[0].int()
         interval_lengths_bp = torch.zeros_like(interval_starts_bp)
-        interval_lengths_bp[:-1] = interval_starts_bp[
-                                   1:] - interval_starts_bp[:-1]
+        interval_lengths_bp[:-1] = interval_starts_bp[1:] - interval_starts_bp[:-1]
         interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1]
 
         depth = depth.contiguous()
@@ -76,7 +65,7 @@ class BEVPoolV2(torch.autograd.Function):
         interval_starts_bp = interval_starts_bp.contiguous()
         grad_out = grad_out.contiguous()
 
-        grad_depth, grad_feat = ads_c.npu_bev_pool_v2_backward(
+        grad_depth, grad_feat = mx_driving._C.npu_bev_pool_v2_backward(
             grad_out,
             depth,
             feat,
@@ -88,14 +77,13 @@ class BEVPoolV2(torch.autograd.Function):
             B,
             D,
             H,
-            W
+            W,
         )
         return grad_depth, grad_feat, None, None, None, None, None, None
 
 
 # pylint: disable=too-many-arguments,huawei-too-many-arguments
-def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
-                bev_feat_shape, interval_starts, interval_lengths):
+def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths):
     """
     bev_pool_v2 is a function that performs a pooling operation on the BEV.
     Please refer to the paper `BEVDet: High-performance Multi-camera 3D Object Detection in Bird-Eye-View`
@@ -132,8 +120,7 @@ def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
         >>> loss.backward()
     """
     x = BEVPoolV2.apply(
-        depth, feat, ranks_depth, ranks_feat, ranks_bev,
-        bev_feat_shape, interval_starts, interval_lengths
+        depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths
     )
     x = x.permute(0, 4, 1, 2, 3).contiguous()
     return x
diff --git a/mx_driving/point/ops/csrc/CMakeLists.txt b/mx_driving/point/ops/csrc/CMakeLists.txt
new file mode 100644
index 00000000..4a75d495
--- /dev/null
+++ b/mx_driving/point/ops/csrc/CMakeLists.txt
@@ -0,0 +1,5 @@
+file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+set(ASCEND_CSRC_SRC
+    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
+    CACHE INTERNAL "")
diff --git a/mx_driving/point/ops/furthest_point_sampling.py b/mx_driving/point/ops/furthest_point_sampling.py
index 2b505b62..708251db 100644
--- a/mx_driving/point/ops/furthest_point_sampling.py
+++ b/mx_driving/point/ops/furthest_point_sampling.py
@@ -12,7 +12,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AdsFurthestPointSampling(Function):
@@ -22,7 +22,7 @@ class AdsFurthestPointSampling(Function):
         point_xyz = point_xyz.permute(0, 2, 1).contiguous()
 
         nearest_dist = torch.tensor(np.ones((B, N)) * 1e10, dtype=torch.float32, device='npu').contiguous()
-        output = ads_c.npu_furthest_point_sampling(point_xyz, nearest_dist, num_points)
+        output = mx_driving._C.npu_furthest_point_sampling(point_xyz, nearest_dist, num_points)
 
         return output
 
diff --git a/mx_driving/point/ops/furthest_point_sampling_with_dist.py b/mx_driving/point/ops/furthest_point_sampling_with_dist.py
index ad9b906e..f56f104c 100644
--- a/mx_driving/point/ops/furthest_point_sampling_with_dist.py
+++ b/mx_driving/point/ops/furthest_point_sampling_with_dist.py
@@ -11,7 +11,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AdsFurthestPointSamplingWithDistFunction(Function):
@@ -19,7 +19,7 @@ class AdsFurthestPointSamplingWithDistFunction(Function):
     def forward(ctx, points_dist, num_points): 
         B, N = points_dist.size()[:2]
         nearest_temp = points_dist.new_zeros([B, N]).fill_(1e10)
-        result = ads_c.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points)
+        result = mx_driving._C.furthest_point_sampling_with_dist(points_dist, nearest_temp, num_points)
         return result
 
 furthest_point_sample_with_dist = AdsFurthestPointSamplingWithDistFunction.apply
diff --git a/mx_driving/point/ops/group_points.py b/mx_driving/point/ops/group_points.py
index b6b005f4..523ef73a 100644
--- a/mx_driving/point/ops/group_points.py
+++ b/mx_driving/point/ops/group_points.py
@@ -20,7 +20,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class AdsGroupPoints(Function):
@@ -45,7 +45,7 @@ class AdsGroupPoints(Function):
         B, C, N = features.size()
         _, npoints, nsample = indices.size()
 
-        output = ads_c.group_points(
+        output = mx_driving._C.group_points(
             features,
             indices,
             B,
@@ -70,7 +70,7 @@ class AdsGroupPoints(Function):
         idx, N = ctx.for_backwards
 
         B, C, npoints, nsample = grad_out.size()
-        grad_features = ads_c.group_points_backward(
+        grad_features = mx_driving._C.group_points_backward(
             grad_out,
             idx,
             B,
diff --git a/mx_driving/point/ops/npu_dynamic_scatter.py b/mx_driving/point/ops/npu_dynamic_scatter.py
index f1598c94..81ae7ff6 100644
--- a/mx_driving/point/ops/npu_dynamic_scatter.py
+++ b/mx_driving/point/ops/npu_dynamic_scatter.py
@@ -13,7 +13,7 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class DynamicScatterFunction(Function):
@@ -24,10 +24,10 @@ class DynamicScatterFunction(Function):
         if reduce_type not in ('max', 'sum', 'mean'):
             raise ValueError("reduce_type should be 'max', 'sum' or 'mean', but now is %s." % reduce_type)
 
-        voxel_idx = ads_c.point_to_voxel(coors, [], [], "XYZ")
-        num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = ads_c.unique_voxel(voxel_idx)
-        voxel_coors = ads_c.voxel_to_point(uniqued_voxel_idx, [], [], "XYZ")
-        voxel_feats, compare_mask = ads_c.npu_dynamic_scatter(feats, coors, prefix_sum_point_per_voxel,
+        voxel_idx = mx_driving._C.point_to_voxel(coors, [], [], "XYZ")
+        num_voxels, uniqued_voxel_idx, prefix_sum_point_per_voxel, argsort_coor, _ = mx_driving._C.unique_voxel(voxel_idx)
+        voxel_coors = mx_driving._C.voxel_to_point(uniqued_voxel_idx, [], [], "XYZ")
+        voxel_feats, compare_mask = mx_driving._C.npu_dynamic_scatter(feats, coors, prefix_sum_point_per_voxel,
                                                               argsort_coor, num_voxels, reduce_type)
 
         ctx.reduce_type = reduce_type
@@ -44,7 +44,7 @@ class DynamicScatterFunction(Function):
                  grad_voxel_coors: Optional[torch.Tensor] = None) -> tuple:
         (prefix_sum_point_per_voxel, argsort_coor, compare_mask) = ctx.saved_tensors
         grad_point_feats = torch.zeros(ctx.feats_shape, dtype=grad_voxel_feats.dtype, device=grad_voxel_feats.device)
-        ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel,
+        mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(), prefix_sum_point_per_voxel,
                                        argsort_coor, compare_mask, ctx.reduce_type)
         return grad_point_feats, None, None
 
diff --git a/mx_driving/point/ops/voxel_pooling_train.py b/mx_driving/point/ops/voxel_pooling_train.py
index 898c3930..12c61c9d 100644
--- a/mx_driving/point/ops/voxel_pooling_train.py
+++ b/mx_driving/point/ops/voxel_pooling_train.py
@@ -9,7 +9,7 @@ Modification 1. Add support for Ascend NPU
 import torch
 from torch.autograd import Function
 from torch.nn import Module
-import ads_c
+import mx_driving._C
 
 
 class AdsVoxelPoolingFunction(Function):
@@ -25,7 +25,7 @@ class AdsVoxelPoolingFunction(Function):
         output_features = input_features.new_zeros(batch_size, voxel_num[1], 
                                                    voxel_num[0], num_channels)
         pos_memo = geom_xyz.new_ones(batch_size, num_points, 3) * -1
-        pos, result = ads_c.voxel_pooling_train(
+        pos, result = mx_driving._C.voxel_pooling_train(
             input_features,
             geom_xyz,
             output_features,
@@ -51,7 +51,7 @@ class AdsVoxelPoolingFunction(Function):
         H = grad_output_features.shape[2]
         W = grad_output_features.shape[3]
 
-        result = ads_c.voxel_pool_train_backward(
+        result = mx_driving._C.voxel_pool_train_backward(
             grad_output_features,
             pos_memo,
             batch_size,
diff --git a/mx_driving/point/ops/voxelization.py b/mx_driving/point/ops/voxelization.py
index 22b7e6a0..3710dda8 100644
--- a/mx_driving/point/ops/voxelization.py
+++ b/mx_driving/point/ops/voxelization.py
@@ -6,7 +6,7 @@ from typing import Union, Tuple
 import torch
 from torch.autograd import Function
 from torch.nn import Module
-import ads_c
+import mx_driving._C
 
 
 class _Voxelization(Function):
@@ -17,7 +17,7 @@ class _Voxelization(Function):
     ):
 
         if max_points != -1 and max_voxels != -1:
-            return ads_c.hard_voxelize(points, voxel_size, coors_range, max_points, max_voxels)
+            return mx_driving._C.hard_voxelize(points, voxel_size, coors_range, max_points, max_voxels)
 
         float_espolin = 1e-9
         if voxel_size[0] < float_espolin or voxel_size[1] < float_espolin or voxel_size[2] < float_espolin:
@@ -30,7 +30,7 @@ class _Voxelization(Function):
 
         # create coors
         coors = points.new_zeros(size=(3, points.size(0)), dtype=torch.int)
-        result = ads_c.dynamic_voxelization(
+        result = mx_driving._C.dynamic_voxelization(
             points,
             coors,
             grid_x,
diff --git a/mx_driving/preprocess/CMakeLists.txt b/mx_driving/preprocess/CMakeLists.txt
index 3f1ac043..63ebf651 100644
--- a/mx_driving/preprocess/CMakeLists.txt
+++ b/mx_driving/preprocess/CMakeLists.txt
@@ -2,6 +2,10 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels)
   add_subdirectory(ops/kernels)
 endif()
 
-if (${ENABLE_ONNX} AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx)
+if(ENABLE_ONNX AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) # bare name: an unset variable must not expand to nothing
   add_subdirectory(ops/onnx/plugin)
 endif()
+
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc)
+  add_subdirectory(ops/csrc)
+endif()
diff --git a/mx_driving/preprocess/ops/csrc/CMakeLists.txt b/mx_driving/preprocess/ops/csrc/CMakeLists.txt
new file mode 100644
index 00000000..4a75d495
--- /dev/null
+++ b/mx_driving/preprocess/ops/csrc/CMakeLists.txt
@@ -0,0 +1,5 @@
+file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+set(ASCEND_CSRC_SRC
+    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
+    CACHE INTERNAL "")
diff --git a/mx_driving/preprocess/ops/npu_points_in_box.py b/mx_driving/preprocess/ops/npu_points_in_box.py
index df56e264..056df051 100644
--- a/mx_driving/preprocess/ops/npu_points_in_box.py
+++ b/mx_driving/preprocess/ops/npu_points_in_box.py
@@ -11,13 +11,13 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class PointsInBoxFunction(Function):
     @staticmethod
     def forward(ctx, boxes, pts):
-        result = ads_c.npu_points_in_box(boxes, pts)
+        result = mx_driving._C.npu_points_in_box(boxes, pts)
         ctx.save_for_backward(result)
         return result
 
diff --git a/mx_driving/preprocess/ops/npu_points_in_box_all.py b/mx_driving/preprocess/ops/npu_points_in_box_all.py
index 93120303..8f31e175 100644
--- a/mx_driving/preprocess/ops/npu_points_in_box_all.py
+++ b/mx_driving/preprocess/ops/npu_points_in_box_all.py
@@ -12,13 +12,13 @@ from torch.autograd import Function
 from torch.nn import Module
 
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class PointsInBoxAllFunction(Function):
     @staticmethod
     def forward(ctx, boxes, pts):
-        result = ads_c.npu_points_in_box_all(boxes, pts)
+        result = mx_driving._C.npu_points_in_box_all(boxes, pts)
         ctx.save_for_backward(result)
         return result
 
diff --git a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py b/mx_driving/preprocess/ops/npu_roipoint_pool3d.py
index c5c3d6e5..eae3c110 100644
--- a/mx_driving/preprocess/ops/npu_roipoint_pool3d.py
+++ b/mx_driving/preprocess/ops/npu_roipoint_pool3d.py
@@ -10,7 +10,7 @@ import torch
 from torch.autograd import Function
 from torch.nn import Module
 import torch_npu
-import ads_c
+import mx_driving._C
 
 
 class RoipointPool3dFunction(Function):
@@ -43,7 +43,7 @@ class RoipointPool3dFunction(Function):
         # pooled_features = points.new_zeros((batch_size, boxes_num, num_sampled_points, 3 + feature_len))
         # pooled_empty_flag = points.new_zeros((batch_size, boxes_num), dtype=torch.int)
         pooled_features, pooled_empty_flag = \
-            ads_c.npu_roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d)
+            mx_driving._C.npu_roipoint_pool3d_forward(num_sampled_points, points, point_features, boxes3d)
         return pooled_features, pooled_empty_flag
 
 
diff --git a/mx_driving/spconv/CMakeLists.txt b/mx_driving/spconv/CMakeLists.txt
index 621d1fa9..63ebf651 100644
--- a/mx_driving/spconv/CMakeLists.txt
+++ b/mx_driving/spconv/CMakeLists.txt
@@ -1,3 +1,11 @@
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/kernels)
   add_subdirectory(ops/kernels)
 endif()
+
+if(ENABLE_ONNX AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/onnx) # bare name: an unset variable must not expand to nothing
+  add_subdirectory(ops/onnx/plugin)
+endif()
+
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ops/csrc)
+  add_subdirectory(ops/csrc)
+endif()
diff --git a/mx_driving/spconv/ops/csrc/CMakeLists.txt b/mx_driving/spconv/ops/csrc/CMakeLists.txt
new file mode 100644
index 00000000..4a75d495
--- /dev/null
+++ b/mx_driving/spconv/ops/csrc/CMakeLists.txt
@@ -0,0 +1,5 @@
+file(GLOB CSRC_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
+set(ASCEND_CSRC_SRC
+    ${ASCEND_CSRC_SRC} ${CSRC_SRC}
+    CACHE INTERNAL "")
diff --git a/mx_driving/spconv/ops/sparse_functional.py b/mx_driving/spconv/ops/sparse_functional.py
index eeae65a3..65008c8b 100644
--- a/mx_driving/spconv/ops/sparse_functional.py
+++ b/mx_driving/spconv/ops/sparse_functional.py
@@ -19,7 +19,7 @@ import torch
 import numpy as np
 from torch.autograd import Function
 from torch.autograd.function import once_differentiable
-import ads_c
+import mx_driving._C
 from . import sparse_ops as ops
 
 
@@ -34,7 +34,7 @@ class SparseConvFunction(Function):
 
         device = features.device
         # calculate the index pair
-        outidx_pair, ouidx_offset = ads_c.npu_sparse_conv3d(indices, kernel_size, stride, padding,
+        outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_conv3d(indices, kernel_size, stride, padding,
                                                             out_channels, out_spatial_shape, batch_size)
         # sort and nonezero
         to_insert = torch.tensor(-1).to(device)
@@ -44,7 +44,7 @@ class SparseConvFunction(Function):
         sub_result = new_sorted_idx - new_sorted_idx_2
         unique_indices_offset = torch.nonzero(sub_result != 0)
         # index_put and matmul
-        out_features, outidx = ads_c.multi_to_sparse_v2(features, weight, unique_indices_offset.int(),
+        out_features, outidx = mx_driving._C.multi_to_sparse_v2(features, weight, unique_indices_offset.int(),
                                                   sorted_idx_to_former_indices.int(), outidx_pair.int())
         outidx, outidx_ = torch.chunk(outidx, 2, dim=1)
         if bias is not None:
@@ -57,7 +57,7 @@ class SparseConvFunction(Function):
     # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
     def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple:
         features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors
-        weight_grad, feature_grad = ads_c.npu_sparse_conv3d_grad(unique_indices_offset,
+        weight_grad, feature_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset,
                                                                  sorted_idx_to_former_indices,
                                                                  features, weight, grad_out_features)
 
@@ -75,7 +75,7 @@ class SparseInverseConvFunction(Function):
                 groups, bias) -> torch.Tensor:
         device = features.device
         # calculate the index pair
-        out_features, outidx_pair, ouidx_offset = ads_c.npu_sparse_inverse_conv3d(features, indices, weight,
+        out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d(features, indices, weight,
                                         kernel_size, stride, padding, dilation, output_padding,
                                         out_channels, out_spatial_shape, batch_size)
         # sort and nonezero
@@ -86,7 +86,7 @@ class SparseInverseConvFunction(Function):
         sub_result = new_sorted_idx - new_sorted_idx_2
         unique_indices_offset = torch.nonzero(sub_result != 0)
         # matmul
-        out_features, outidx = ads_c.multi_to_sparse(out_features, unique_indices_offset.int(),
+        out_features, outidx = mx_driving._C.multi_to_sparse(out_features, unique_indices_offset.int(),
                                                      sorted_idx_to_former_indices.int(), outidx_pair.int())
         outidx, outidx_ = torch.chunk(outidx, 2, dim=1)
         if bias is not None:
@@ -99,7 +99,7 @@ class SparseInverseConvFunction(Function):
     # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
     def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple:
         features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors
-        weight_grad, feature_grad = ads_c.npu_sparse_conv3d_grad(unique_indices_offset,
+        weight_grad, feature_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset,
                                                                 sorted_idx_to_former_indices,
                                                                 features, weight, grad_out_features)
         return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None
@@ -119,7 +119,7 @@ class SubMConvFunction(Function):
         # calculate the index pair
         hh = indices[:, 0] * out_spatial_shape[0] * out_spatial_shape[1] * out_spatial_shape[2] + \
             indices[:, 1] * out_spatial_shape[1] * out_spatial_shape[2] + indices[:, 2] * out_spatial_shape[2] + indices[:, 3]
-        temp, hh2 = ads_c.npu_prepare_subm_conv3d(hh, out_spatial_shape, batch_size)
+        temp, hh2 = mx_driving._C.npu_prepare_subm_conv3d(hh, out_spatial_shape, batch_size)
         temp[hh] = hh2
         # pad the feature and weight become align
         feature_align = features.shape[1] % 8
@@ -128,7 +128,7 @@ class SubMConvFunction(Function):
             zero_tensor = torch.zeros((kernel_size[0], kernel_size[0], kernel_size[0], 8 - feature_align, out_channels)).to(device)
             weight_pad = torch.cat((weight, zero_tensor), 3)
         # calculate the out_feature
-        out_features, outidx_pair, ouidx_offset = ads_c.npu_subm_sparse_conv3d(features, indices, weight_pad,
+        out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_subm_sparse_conv3d(features, indices, weight_pad,
                                                                                 kernel_size, out_channels,
                                                                                 out_spatial_shape, batch_size, temp)
 
@@ -148,7 +148,7 @@ class SubMConvFunction(Function):
     # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
     def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx = None) -> tuple:
         features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors
-        weight_grad, feature_grad = ads_c.npu_sparse_conv3d_grad(unique_indices_offset,
+        weight_grad, feature_grad = mx_driving._C.npu_sparse_conv3d_grad(unique_indices_offset,
                                                                 sorted_idx_to_former_indices,
                                                                 features, weight, grad_out_features)
         return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None
diff --git a/setup.py b/setup.py
index 7850aae5..dd3f6ebd 100644
--- a/setup.py
+++ b/setup.py
@@ -1,43 +1,179 @@
-import glob
+import multiprocessing
 import os
+import platform
+import stat
 import subprocess
+import sys
 from pathlib import Path
 from typing import Union
 
 import torch
-from setuptools import find_packages, setup
-from torch.utils.cpp_extension import BuildExtension 
-
-from utils import extension
+from setuptools import Extension, find_packages, setup
+from setuptools._distutils.version import LooseVersion
+from setuptools.command.build_clib import build_clib
+from setuptools.command.build_ext import build_ext
+from setuptools.command.develop import develop
 
 BASE_DIR = os.path.dirname(os.path.realpath(__file__))
-VERSION = torch.__version__
-full_components = ["common", "preprocess", "fused", "point", "detection", "spconv"]
-source_file = glob.glob(os.path.join("./bind/", "*.cpp"))
-include_dirs = [os.path.join(BASE_DIR, "include")]
-for part in full_components:
-    source_file += glob.glob(os.path.join(f"./mx_driving/{part}/ops/csrc/", "*.cpp"))
-
-exts = []
-ext1 = extension.NpuExtension(
-    name="ads_c",
-    sources=source_file,
-    include_dirs=include_dirs,
-    extra_compile_args=[
-        '-D__FILENAME__="$$(notdir $$(abspath $$<))"',
-        "-fprofile-arcs",
-        "-ftest-coverage",
-        "-fPIC",
-        "-fstack-protector-all",
-    ],
-    extra_link_args=[
-        "-Wl,-z,relro",
-        "-Wl,-z,now",
-        "-s"
-    ],
-    libraries=["gcov"],
-)
-exts.append(ext1)
+VERSION = "1.0.0"
+
+
+def which(thefile):
+    """Locate an executable on the search path.
+
+    Args:
+        thefile: Name of the program to look for, e.g. "cmake".
+
+    Returns:
+        Path of the first match on PATH that is executable and not a directory, or None.
+    """
+    import shutil  # function-scope import; shutil is not among the file's top-level imports
+
+    return shutil.which(thefile)
+
+
+def get_cmake_command():
+    """Return "cmake3" or "cmake", whichever is first found at >= 3.19.0; raise RuntimeError if neither is."""
+    def _get_version(cmd):  # parse the version out of the `<cmd> --version` output
+        for line in subprocess.check_output([cmd, "--version"]).decode("utf-8").split("\n"):
+            if "version" in line:
+                return LooseVersion(line.strip().split(" ")[2])  # third space-separated token of that line
+        raise RuntimeError("no version found")
+
+    cmake_command = "cmake"
+    if platform.system() == "Windows":  # no PATH lookup or version check is done on Windows
+        return cmake_command
+    cmake3 = which("cmake3")
+    cmake = which("cmake")
+    if cmake3 is not None and _get_version(cmake3) >= LooseVersion("3.19.0"):  # cmake3 is tried first
+        cmake_command = "cmake3"
+        return cmake_command
+    elif cmake is not None and _get_version(cmake) >= LooseVersion("3.19.0"):
+        return cmake_command
+    else:
+        raise RuntimeError("no cmake or cmake3 with version >= 3.19.0 found")
+
+
+def get_build_type():
+    """Pick the CMake build type from the DEBUG / REL_WITH_DEB_INFO environment flags (default "Release")."""
+    truthy = ["ON", "1", "YES", "TRUE", "Y"]  # compared against the upper-cased variable value
+    build_type = "Release"
+    if os.getenv("DEBUG", default="0").upper() in truthy:
+        build_type = "Debug"
+    if os.getenv("REL_WITH_DEB_INFO", default="0").upper() in truthy:  # checked last, so it overrides DEBUG
+        build_type = "RelWithDebInfo"
+    return build_type
+
+
+class CPPLibBuild(build_clib):
+    """build_clib command that configures and builds CMake stages 0 and 1 instead of compiling sources."""
+    def initialize_options(self) -> None:
+        super().initialize_options()
+        self.kernel_name = None  # optional single-kernel filter; when unset, "*" is passed as KERNEL_NAME
+
+    def run(self) -> None:
+        cmake = get_cmake_command()
+        if not cmake:
+            raise RuntimeError("CMake must be installed to build the libraries")
+        self.cmake = cmake
+
+        build_py = self.get_finalized_command("build_py")
+        mx_driving_dir = os.path.join(BASE_DIR, build_py.build_lib, build_py.get_package_dir("mx_driving"))
+        os.makedirs(mx_driving_dir, exist_ok=True)  # no check-then-create race
+
+        cmake_args = [
+            "--preset=default",
+            f"-DCMAKE_BUILD_TYPE={get_build_type()}",  # "Release" unless DEBUG / REL_WITH_DEB_INFO is set
+            "-B",
+            self.build_temp,
+            f"-DMX_DRIVING_PATH={mx_driving_dir}",
+            f"-DKERNEL_NAME={self.kernel_name if self.kernel_name else '*'}",
+        ]
+        build_args = ["--build", self.build_temp, f"-j{multiprocessing.cpu_count()}"]
+
+        for stage in range(2):  # configure, then build, once per stage (BUILD_STAGE=0 and 1)
+            subprocess.check_call(
+                [self.cmake, BASE_DIR] + cmake_args + ["-DBUILD_STAGE=" + str(stage)],
+                cwd=BASE_DIR,
+                env=os.environ,
+            )
+            subprocess.check_call(
+                [self.cmake] + build_args,
+                cwd=BASE_DIR,
+                env=os.environ,
+            )
+
+
+class ExtBuild(build_ext):
+    """build_ext command that configures and builds CMake stage 2 instead of compiling Extension sources."""
+    def run(self) -> None:
+        cmake = get_cmake_command()
+        if not cmake:
+            raise RuntimeError("CMake must be installed to build the libraries")
+        self.cmake = cmake
+
+        build_py = self.get_finalized_command("build_py")
+        mx_driving_dir = os.path.join(BASE_DIR, build_py.build_lib, build_py.get_package_dir("mx_driving"))
+        os.makedirs(mx_driving_dir, exist_ok=True)  # no check-then-create race
+
+        ext_cxx_flags = ["-std=c++17"]
+        for name in ["COMPILER_TYPE", "STDLIB", "BUILD_ABI"]:
+            val = getattr(torch._C, f"_PYBIND11_{name}", None)  # a missing attribute is skipped like an empty one
+            if val:
+                ext_cxx_flags.append(f"-D_PYBIND11_{name}={val}")
+
+        cmake_args = [
+            "--preset=default",
+            f"-DCMAKE_BUILD_TYPE={get_build_type()}",  # "Release" unless DEBUG / REL_WITH_DEB_INFO is set
+            "-B",
+            self.build_temp,
+            f"-DMX_DRIVING_PATH={mx_driving_dir}",
+            f"-DEXT_CXX_FLAGS={' '.join(ext_cxx_flags)}",
+            f"-DPython3_EXECUTABLE={sys.executable}",
+        ]
+        if LooseVersion(torch.__version__) < LooseVersion("2.1.0"):  # extra switch only for torch older than 2.1.0
+            cmake_args.append("-DCOMPILE_WITH_XLA:BOOL=ON")
+        build_args = ["--build", self.build_temp, f"-j{multiprocessing.cpu_count()}"]
+
+        subprocess.check_call(
+            [self.cmake, BASE_DIR] + cmake_args + ["-DBUILD_STAGE=2"],
+            cwd=BASE_DIR,
+            env=os.environ,
+        )
+        subprocess.check_call(
+            [self.cmake] + build_args,
+            cwd=BASE_DIR,
+            env=os.environ,
+        )
+
+
+class DevelopBuild(develop):  # develop command that runs the CMake-backed build_clib / build_ext commands
+    user_options = develop.user_options + [("kernel-name=", None, "Build the single kernel with the specified name")]
+
+    def initialize_options(self) -> None:
+        super().initialize_options()
+        self.kernel_name = None  # value of --kernel-name; forwarded to build_clib below
+
+    def install_for_development(self) -> None:
+        self.reinitialize_command("build_py", build_lib="")  # empty build_lib: build commands join BASE_DIR with ""
+        self.reinitialize_command("build_clib", kernel_name=self.kernel_name)
+
+        if self.kernel_name:  # single-kernel mode: run build_clib only and return before the remaining steps
+            self.run_command("build_clib")
+            return
+
+        self.run_command("egg_info")
+        self.run_command("build_clib")
+        self.run_command("build_ext")
+
+        if not self.dry_run:  # the egg-link file is opened with owner read/write permission bits only
+            with os.fdopen(
+                os.open(self.egg_link, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, stat.S_IWUSR | stat.S_IRUSR),
+                "w",
+                encoding="utf-8",
+            ) as f:
+                f.write(self.egg_path + "\n" + self.setup_path)
+        self.process_distribution(None, self.dist, not self.no_deps)
 
 
 def get_sha(pytorch_root: Union[str, Path]) -> str:
@@ -49,9 +185,7 @@ def get_sha(pytorch_root: Union[str, Path]) -> str:
         return "Unknown"
 
 
-VERSION = "1.0.0"
-ads_root = Path(__file__).parent
-sha = get_sha(ads_root)
+sha = get_sha(BASE_DIR)
 if not os.getenv("BUILD_WITHOUT_SHA"):
     VERSION += "+git" + sha[:7]
 
@@ -60,9 +194,14 @@ setup(
     version=VERSION,
     description="A Library of acceleration for autonomous driving systems on Ascend-NPU.",
     keywords="mx_driving",
-    ext_modules=exts,
+    ext_modules=[Extension("mx_driving._C", sources=[])],
     author="Ascend Contributors",
-    cmdclass={"build_ext": BuildExtension},
+    libraries=[("mx_driving", {"sources": []})],
+    cmdclass={
+        "build_clib": CPPLibBuild,
+        "build_ext": ExtBuild,
+        "develop": DevelopBuild,
+    },
     packages=find_packages(),
     include_package_data=True,
 )
diff --git a/tests/torch/test_bev_pool_v2.py b/tests/torch/test_bev_pool_v2.py
index 9870e4ff..ca27c9f5 100644
--- a/tests/torch/test_bev_pool_v2.py
+++ b/tests/torch/test_bev_pool_v2.py
@@ -3,9 +3,9 @@ import unittest
 import numpy as np
 import torch
 import torch_npu
-from ads_c import npu_bev_pool_v2_backward
 from torch_npu.testing.testcase import TestCase, run_tests
 
+from mx_driving._C import npu_bev_pool_v2_backward
 from mx_driving.point import bev_pool_v2
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@@ -13,7 +13,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
 # pylint: disable=too-many-arguments,huawei-too-many-arguments
 def golden_bev_pool_v2(
-        depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c
+    depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c
 ):
     output = np.zeros((b, d, h, w, c), dtype=np.float32)
     depth = depth.flatten()
@@ -29,7 +29,7 @@ def golden_bev_pool_v2(
 
 # pylint: disable=too-many-arguments,huawei-too-many-arguments
 def golden_bev_pool_v2_grad(
-        grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c
+    grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, b, d, h, w, c
 ):
     grad_depth = np.zeros_like(depth).flatten()
     grad_feat = np.zeros_like(feat).reshape((-1, c))
@@ -63,10 +63,18 @@ class TestBEVPoolV2(TestCase):
 
     @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `bev_pool` is only supported on 910B, skip this ut!")
     def test_bev_pool_v2(self):
-        shapes = [[1, 1, 1, 1, 1, 1], [3, 3, 3, 3, 3, 3], [3, 3, 15, 15, 17, 33], [1, 5, 128, 128, 31, 777], [32, 4, 128, 128, 64, 9999]]
+        shapes = [
+            [1, 1, 1, 1, 1, 1],
+            [3, 3, 3, 3, 3, 3],
+            [3, 3, 15, 15, 17, 33],
+            [1, 5, 128, 128, 31, 777],
+            [32, 4, 128, 128, 64, 9999],
+        ]
         for shape in shapes:
             B, D, H, W, C, N_RANKS = shape
-            feat, depth, grad_out, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape = generate_bev_pool_data(B, D, H, W, C, N_RANKS)
+            feat, depth, grad_out, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape = generate_bev_pool_data(
+                B, D, H, W, C, N_RANKS
+            )
             kept = np.ones(ranks_bev.shape[0], dtype=bool)
             kept[1:] = ranks_feat[1:] != ranks_feat[:-1]
             interval_starts = np.where(kept)[0].astype(np.int32)
@@ -111,7 +119,19 @@ class TestBEVPoolV2(TestCase):
                 W,
             )
             grad_feat = golden_bev_pool_v2_grad(
-                grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, B, D, H, W, C
+                grad_out,
+                depth,
+                feat,
+                ranks_depth,
+                ranks_feat,
+                ranks_bev,
+                interval_starts,
+                interval_lengths,
+                B,
+                D,
+                H,
+                W,
+                C,
             )
             self.assertRtolEqual(bev_feat.detach().cpu().numpy(), bev_feat_cpu)
             self.assertRtolEqual(grad_feat_npu.cpu().numpy(), grad_feat)
diff --git a/tests/torch/test_furthest_point_sampling.py b/tests/torch/test_furthest_point_sampling.py
index a637d8eb..5bb718e1 100644
--- a/tests/torch/test_furthest_point_sampling.py
+++ b/tests/torch/test_furthest_point_sampling.py
@@ -155,8 +155,7 @@ class TestFurthestPointSample(TestCase):
         self.compare_res(test1)
         self.compare_res(test2)
         self.compare_res(test3)
-        self.compare_res(test4)
 
 
 if __name__ == "__main__":
-    run_tests()
\ No newline at end of file
+    run_tests()
diff --git a/tests/torch/test_group_points_grad.py b/tests/torch/test_group_points_grad.py
index 2dd7d3b9..8bf61d3e 100644
--- a/tests/torch/test_group_points_grad.py
+++ b/tests/torch/test_group_points_grad.py
@@ -4,8 +4,7 @@ import numpy as np
 
 import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
-import ads_c
-import mx_driving
+import mx_driving._C
 
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@@ -50,7 +49,7 @@ class TestGroupPointsGrad(TestCase):
 
                             golden_grad_features = self.golden_group_points_grad(
                                 np_grad_out, np_indices, np_grad_features, B, npoints, nsample)
-                            npu_grad_features = ads_c.group_points_backward(torch_grad_out, torch_indices, B, C, N,
+                            npu_grad_features = mx_driving._C.group_points_backward(torch_grad_out, torch_indices, B, C, N,
                                                                             npoints, nsample)
 
                             self.assertRtolEqual(golden_grad_features, npu_grad_features.cpu().numpy())
diff --git a/tests/torch/test_hard_voxelize.py b/tests/torch/test_hard_voxelize.py
index de10230e..b58dc41b 100644
--- a/tests/torch/test_hard_voxelize.py
+++ b/tests/torch/test_hard_voxelize.py
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
-import ads_c
+import mx_driving._C
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
@@ -22,7 +22,7 @@ class TestHardVoxelize(TestCase):
 
     def npu_hard_voxelize(self, points):
         points_npu = torch.from_numpy(points.astype(np.float32)).npu()
-        cnt, pts, voxs, num_per_vox = ads_c.hard_voxelize(
+        cnt, pts, voxs, num_per_vox = mx_driving._C.hard_voxelize(
             points_npu, [0.075, 0.075, 0.2], [-54, -54, -5, 54, 54, 5], 10, 1000
         )
         return cnt, voxs.cpu().numpy()
diff --git a/tests/torch/test_npu_dynamic_scatter.py b/tests/torch/test_npu_dynamic_scatter.py
index bca00da3..eb6a15d1 100644
--- a/tests/torch/test_npu_dynamic_scatter.py
+++ b/tests/torch/test_npu_dynamic_scatter.py
@@ -6,7 +6,7 @@ import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
 from torch_npu.testing.common_utils import create_common_tensor
 
-import ads_c
+import mx_driving._C
 import mx_driving.point
 
 
@@ -119,7 +119,7 @@ class TestDynamicScatter(TestCase):
 
         self.grad_cpu_op_exec([golden_result, grad_voxel_feats.contiguous().cpu(), prefix_sum_point_per_voxel.cpu(),
                               argsort_coor.cpu(), compare_mask.cpu()], reduce_type)
-        ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(),
+        mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(),
                                        prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type)
         self.assertRtolEqual(golden_result.cpu().numpy(), grad_point_feats.cpu().numpy())
 
@@ -141,7 +141,7 @@ class TestDynamicScatter(TestCase):
 
         self.grad_cpu_op_exec([golden_result, grad_voxel_feats.contiguous().cpu(), prefix_sum_point_per_voxel.cpu(),
                               argsort_coor.cpu(), compare_mask.cpu()], reduce_type)
-        ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(),
+        mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(),
                                        prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type)
         self.assertRtolEqual(golden_result.cpu().numpy(), grad_point_feats.cpu().numpy())
 
@@ -163,7 +163,7 @@ class TestDynamicScatter(TestCase):
 
         self.grad_cpu_op_exec([golden_result, grad_voxel_feats.contiguous().cpu(), prefix_sum_point_per_voxel.cpu(),
                               argsort_coor.cpu(), compare_mask.cpu()], reduce_type)
-        ads_c.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(),
+        mx_driving._C.npu_dynamic_scatter_grad(grad_point_feats, grad_voxel_feats.contiguous(),
                                        prefix_sum_point_per_voxel, argsort_coor, compare_mask, reduce_type)
         self.assertRtolEqual(golden_result.cpu().numpy(), grad_point_feats.cpu().numpy())
 
diff --git a/tests/torch/test_point_to_voxel.py b/tests/torch/test_point_to_voxel.py
index c01d134d..ce8a0578 100644
--- a/tests/torch/test_point_to_voxel.py
+++ b/tests/torch/test_point_to_voxel.py
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
-import ads_c
+import mx_driving._C
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
@@ -32,7 +32,7 @@ class TestPointToVoxel(TestCase):
 
     def npu_encode(self, coords):
         coords_npu = torch.from_numpy(coords.view(np.float32)).npu()
-        return ads_c.point_to_voxel(coords_npu, [], [], "XYZ").cpu().numpy().view(np.int32)
+        return mx_driving._C.point_to_voxel(coords_npu, [], [], "XYZ").cpu().numpy().view(np.int32)
 
     @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `PointToVoxel` is only supported on 910B, skip this ut!")
     def test_point_to_voxel(self):
diff --git a/tests/torch/test_unique_voxel.py b/tests/torch/test_unique_voxel.py
index d49cbe68..ba0e36b5 100644
--- a/tests/torch/test_unique_voxel.py
+++ b/tests/torch/test_unique_voxel.py
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
-import ads_c
+import mx_driving._C
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
@@ -24,7 +24,7 @@ class TestUniqueVoxel(TestCase):
 
     def npu_unique(self, voxels):
         voxels_npu = torch.from_numpy(voxels).npu()
-        cnt, uni_vox, _, _, _ = ads_c.unique_voxel(voxels_npu)
+        cnt, uni_vox, _, _, _ = mx_driving._C.unique_voxel(voxels_npu)
         return cnt, uni_vox.cpu().numpy()
 
     def gen_integration(self, point_num):
@@ -55,9 +55,9 @@ class TestUniqueVoxel(TestCase):
 
     def npu_integration(self, coords):
         coords_npu = torch.from_numpy(coords.view(np.float32)).npu()
-        voxels_npu = ads_c.point_to_voxel(coords_npu, [], [], "XYZ")
-        cnt, uni_vox, _, _, _ = ads_c.unique_voxel(voxels_npu)
-        dec = ads_c.voxel_to_point(uni_vox, [], [], "XYZ")
+        voxels_npu = mx_driving._C.point_to_voxel(coords_npu, [], [], "XYZ")
+        cnt, uni_vox, _, _, _ = mx_driving._C.unique_voxel(voxels_npu)
+        dec = mx_driving._C.voxel_to_point(uni_vox, [], [], "XYZ")
         return cnt, dec.cpu().numpy()
 
     @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `PointToVoxel` is only supported on 910B, skip this ut!")
diff --git a/tests/torch/test_vec_pool_backward.py b/tests/torch/test_vec_pool_backward.py
index 2125ba11..ce128a21 100644
--- a/tests/torch/test_vec_pool_backward.py
+++ b/tests/torch/test_vec_pool_backward.py
@@ -4,7 +4,7 @@ import numpy as np
 
 import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
-import ads_c
+import mx_driving._C
 
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@@ -60,7 +60,7 @@ class TestVecPoolGrad(TestCase):
 
             golden_grad_support_features = self.golden_vec_pool_backward(
                 np_grad_new_features, np_point_cnt_of_grid, np_grouped_idxs, np_grad_support_features)
-            real_grad_support_features = ads_c.vec_pool_backward(
+            real_grad_support_features = mx_driving._C.vec_pool_backward(
                 torch_grad_new_features, torch_point_cnt_of_grid, torch_grouped_idxs, n, c_in)
 
             self.assertRtolEqual(golden_grad_support_features, real_grad_support_features.cpu().numpy())
diff --git a/tests/torch/test_voxel_pooling_train.py b/tests/torch/test_voxel_pooling_train.py
index ac372802..e713523d 100644
--- a/tests/torch/test_voxel_pooling_train.py
+++ b/tests/torch/test_voxel_pooling_train.py
@@ -1,25 +1,26 @@
-import unittest
 import copy
+import unittest
+
+import numpy as np
 import torch
 import torch_npu
-
 from torch_npu.testing.testcase import TestCase, run_tests
-import ads_c
-import numpy as np
+
 import mx_driving.point
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
 
 # pylint: disable=too-many-arguments,huawei-too-many-arguments
-def voxel_pooling_train_cpu_forward(batch_size, num_points, num_channels, num_voxel_x,
-                            num_voxel_y, num_voxel_z, geom_xyz, input_features):
+def voxel_pooling_train_cpu_forward(
+    batch_size, num_points, num_channels, num_voxel_x, num_voxel_y, num_voxel_z, geom_xyz, input_features
+):
     dtype = input_features.dtype
     pos_memo = torch.zeros((batch_size, num_points, 3), dtype=torch.int32) * -1
     output_features = torch.zeros((batch_size, num_voxel_y, num_voxel_x, num_channels), dtype=dtype)
     for i in range(batch_size):
         for j in range(num_points):
-            
+
             sample_x = geom_xyz[i][j][0]
             sample_y = geom_xyz[i][j][1]
             sample_z = geom_xyz[i][j][2]
@@ -29,7 +30,7 @@ def voxel_pooling_train_cpu_forward(batch_size, num_points, num_channels, num_vo
             if sample_y < 0 or sample_y >= num_voxel_y:
                 continue
             if sample_z < 0 or sample_z >= num_voxel_z:
-                continue  
+                continue
 
             pos_memo[i][j][0] = i
             pos_memo[i][j][1] = geom_xyz[i][j][1]
@@ -44,14 +45,11 @@ def voxel_pooling_train_cpu_backward(pos, result_cpu, grad_features):
     features_shape = grad_features.shape
     mask = (pos != -1)[..., 0]
 
-    grad_features = grad_features.reshape(
-        grad_features.shape[0], -1, grad_features.shape[-1])
+    grad_features = grad_features.reshape(grad_features.shape[0], -1, grad_features.shape[-1])
 
-    grad_features[mask] = result_cpu[pos[mask][..., 0].long(
-    ), :, pos[mask][..., 1].long(), pos[mask][..., 2].long()]
+    grad_features[mask] = result_cpu[pos[mask][..., 0].long(), :, pos[mask][..., 1].long(), pos[mask][..., 2].long()]
 
-    grad_features = grad_features.reshape(
-        features_shape)
+    grad_features = grad_features.reshape(features_shape)
     return grad_features
 
 
@@ -60,24 +58,23 @@ class TestVoxelPoolingTrain(TestCase):
         batch_size = input_features.shape[0]
         num_points = input_features.shape[1]
         num_channels = input_features.shape[2]
-        pos, result = voxel_pooling_train_cpu_forward(batch_size, num_points, num_channels, voxel_num[0],
-                                                      voxel_num[1], voxel_num[2], geom_xyz, input_features)
+        pos, result = voxel_pooling_train_cpu_forward(
+            batch_size, num_points, num_channels, voxel_num[0], voxel_num[1], voxel_num[2], geom_xyz, input_features
+        )
 
         pos_memo = pos
         grad_features_cpu = torch.zeros_like(input_features)
-        grad_features_cpu = voxel_pooling_train_cpu_backward(
-            pos_memo, result, grad_features_cpu)
+        grad_features_cpu = voxel_pooling_train_cpu_backward(pos_memo, result, grad_features_cpu)
 
         return pos, result, grad_features_cpu
 
     def npu_to_exec(self, geom_xyz, input_features, voxel_num):
-        result = mx_driving.point.npu_voxel_pooling_train(
-            geom_xyz, input_features, voxel_num)
+        result = mx_driving.point.npu_voxel_pooling_train(geom_xyz, input_features, voxel_num)
 
         result.backward(result)
         grad_features_npu = input_features.grad
         return result, grad_features_npu
-    
+
     def gen_data(self, geom_shape, feature_shape, coeff, batch_size, num_channels, dtype):
         geom_xyz = torch.rand(geom_shape) * coeff
         geom_xyz = geom_xyz.reshape(batch_size, -1, 3)
@@ -89,18 +86,16 @@ class TestVoxelPoolingTrain(TestCase):
         features_npu = features_cpu.npu()
         features_npu.requires_grad = True
         return geom_xyz_cpu, features_cpu, geom_xyz_npu, features_npu
-    
-    @unittest.skipIf(DEVICE_NAME != 'Ascend910B', "OP `VoxelPoolingTrain` is only supported on 910B, skip this ut!")
+
+    @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `VoxelPoolingTrain` is only supported on 910B, skip this ut!")
     def test_voxel_pooling_train(self):
-        torch.npu.set_device('npu:0')
-        types = [torch.float32, ]
+        torch.npu.set_device("npu:0")
+        types = [
+            torch.float32,
+        ]
         batch_size_list = [1, 2]
         num_channels_list = [32, 80]
-        shape_list = [
-            [30, 25],
-            [25, 12, 40],
-            [20]
-        ]
+        shape_list = [[30, 25], [25, 12, 40], [20]]
         coeff = 90
         voxel_num = [128, 128, 1]
         # test
@@ -114,18 +109,19 @@ class TestVoxelPoolingTrain(TestCase):
                         feature_shape.append(num_channel)
                         geom_shape.append(3)
                         geom_cpu, feature_cpu, geom_npu, feature_npu = self.gen_data(
-                            geom_shape, feature_shape, coeff, batch_size, num_channel, dtype)
+                            geom_shape, feature_shape, coeff, batch_size, num_channel, dtype
+                        )
                         pos, cpu_result, cpu_grad_features = self.cpu_to_exec(geom_cpu, feature_cpu, voxel_num)
                         npu_result, npu_grad_features = self.npu_to_exec(geom_npu, feature_npu, voxel_num)
-                        
+
                         cpu_result = cpu_result.numpy()
                         npu_result = npu_result.detach().cpu().numpy()
                         self.assertRtolEqual(cpu_result, npu_result)
-                        
+
                         cpu_grad_features = cpu_grad_features.numpy()
                         npu_grad_features = npu_grad_features.cpu().numpy()
                         self.assertRtolEqual(cpu_grad_features, npu_grad_features)
 
 
 if __name__ == "__main__":
-    run_tests()
\ No newline at end of file
+    run_tests()
diff --git a/tests/torch/test_voxel_to_point.py b/tests/torch/test_voxel_to_point.py
index 9dc1d0c6..35b21bff 100644
--- a/tests/torch/test_voxel_to_point.py
+++ b/tests/torch/test_voxel_to_point.py
@@ -4,7 +4,7 @@ import numpy as np
 import torch
 import torch_npu
 from torch_npu.testing.testcase import TestCase, run_tests
-import ads_c
+import mx_driving._C
 
 DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
@@ -29,7 +29,7 @@ class TestVoxelToPoint(TestCase):
 
     def npu_decode(self, voxels):
         voxels_npu = torch.from_numpy(voxels.view(np.int32)).npu()
-        return ads_c.voxel_to_point(voxels_npu, [], [], "XYZ").cpu().numpy()
+        return mx_driving._C.voxel_to_point(voxels_npu, [], [], "XYZ").cpu().numpy()
 
     @unittest.skipIf(DEVICE_NAME != "Ascend910B", "OP `PointToVoxel` is only supported on 910B, skip this ut!")
     def test_point_to_voxel(self):
diff --git a/utils/extension.py b/utils/extension.py
deleted file mode 100644
index 57908424..00000000
--- a/utils/extension.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2022 Huawei Technologies Co., Ltd
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import site
-from pkg_resources import parse_version
-
-import setuptools
-import torch
-import torch.utils.cpp_extension as TorchExtension
-
-try:
-    import torch_npu
-    PYTORCH_NPU_INSTALL_PATH = os.path.dirname(os.path.realpath(torch_npu.__file__))
-except:
-    site_packages_path = site.getsitepackages()
-    PYTORCH_NPU_INSTALL_PATH = site_packages_path[0] + "/torch_npu/"
-
-
-def NpuExtension(name, sources, *args, **kwargs):
-    r'''
-    Creates a :class:`setuptools.Extension` for C++.
-
-    Convenience method that creates a :class:`setuptools.Extension` with the
-    bare minimum (but often sufficient) arguments to build a C++ extension.
-
-    All arguments are forwarded to the :class:`setuptools.Extension`
-    constructor.
-
-    Example:
-        >>> from setuptools import setup
-        >>> from torch_npu.utils.cpp_extension import NpuExtension
-        >>> setup(
-                name='extension',
-                ext_modules=[
-                    NpuExtension(
-                        name='extension',
-                        sources=['extension.cpp'],
-                        extra_compile_args=['-g']),
-                ],
-                cmdclass={
-                    'build_ext': BuildExtension
-                })
-    '''
-
-    torch_npu_dir = PYTORCH_NPU_INSTALL_PATH
-    include_dirs = kwargs.get('include_dirs', [])
-    include_dirs.append(os.path.join(torch_npu_dir, 'include'))
-    include_dirs += TorchExtension.include_paths()
-    kwargs['include_dirs'] = include_dirs
-
-    library_dirs = kwargs.get('library_dirs', [])
-    library_dirs.append(os.path.join(torch_npu_dir, 'lib'))
-    library_dirs += TorchExtension.library_paths()
-    kwargs['library_dirs'] = library_dirs
-
-    libraries = kwargs.get('libraries', [])
-    libraries.append('c10')
-    libraries.append('torch')
-    libraries.append('torch_cpu')
-    libraries.append('torch_python')
-    libraries.append('torch_npu')
-    kwargs['libraries'] = libraries
-
-    kwargs['language'] = 'c++'
-
-    define_macros = []
-    if parse_version(torch.__version__) < parse_version('2.1.0'):
-        define_macros += [('COMPILE_WITH_XLA', None)]
-    kwargs['define_macros'] = define_macros
-
-    return setuptools.Extension(name, sources, *args, **kwargs)
-- 
Gitee