From eccf8dda7147b187b56675ee0bb29e279dc3ece4 Mon Sep 17 00:00:00 2001
From: ZYF-Annarine <zhangyifan115@huawei.com>
Date: Sat, 1 Mar 2025 16:09:23 +0800
Subject: [PATCH] delete SparseInverConv3d

---
 docs/api/README.md                            |   1 -
 docs/api/context/SparseInverseConv3d[beta].md |  60 ----
 include/csrc/functions.h                      |   5 -
 kernels/op_host/sparse_inverse_conv3d.cpp     | 218 -------------
 .../op_host/sparse_inverse_conv3d_tiling.h    |  80 -----
 kernels/op_kernel/sparse_inverse_conv3d.cpp   | 304 ------------------
 mx_driving/_C/__init__.pyi                    |  13 -
 mx_driving/__init__.py                        |   3 +-
 mx_driving/csrc/SparseInverseConv3d.cpp       |  54 ----
 mx_driving/csrc/pybind.cpp                    |   3 -
 mx_driving/modules/sparse_conv.py             |  57 +---
 mx_driving/ops/sparse_functional.py           |  74 +----
 mx_driving/spconv.py                          |   2 +-
 tests/torch/test_sparse_inverse_conv3d.py     | 123 -------
 14 files changed, 9 insertions(+), 988 deletions(-)
 delete mode 100644 docs/api/context/SparseInverseConv3d[beta].md
 delete mode 100644 kernels/op_host/sparse_inverse_conv3d.cpp
 delete mode 100644 kernels/op_host/sparse_inverse_conv3d_tiling.h
 delete mode 100644 kernels/op_kernel/sparse_inverse_conv3d.cpp
 delete mode 100644 mx_driving/csrc/SparseInverseConv3d.cpp
 delete mode 100644 tests/torch/test_sparse_inverse_conv3d.py

diff --git a/docs/api/README.md b/docs/api/README.md
index 16679c8e..3d1fc814 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -41,7 +41,6 @@
 - 稀疏
     - [SparseConv3d](./context/SparseConv3d.md)
     - [SubmSparseConv3d](./context/SubMConv3d.md)
-    - [SparseInverseConv3d[beta]](./context/SparseInverseConv3d[beta].md)
 - 融合
     - [multi_scale_deformable_attn](./context/multi_scale_deformable_attn.md)
     - [deformable_aggregation](./context/deformable_aggregation.md)
diff --git a/docs/api/context/SparseInverseConv3d[beta].md b/docs/api/context/SparseInverseConv3d[beta].md
deleted file mode 100644
index e32c7e6b..00000000
--- a/docs/api/context/SparseInverseConv3d[beta].md
+++ /dev/null
@@ -1,60 +0,0 @@
-## SparseInverseConv3d(beta)
-### 接口原型
-```python
-mx_driving.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0,bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
-```
-兼容
-```python
-mx_driving.spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0,bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor
-```
-### 功能描述
-稀疏逆卷积
-### 参数说明
-- `in_channels(int)`：输入数据的通道数
-- `out_channels(int)`：输出通道数
-- `kernel_size(List(int)/Tuple(int)/int)`：卷积神经网络中卷积核的大小
-- `stride(List(int)/Tuple(int)/int)`：卷积核在输入数据上滑动时的步长
-- `dilation(List(int)/Tuple(int)/int)`：空洞卷积大小
-- `groups(int)`：分组卷积
-- `bias(bool)`：偏置项
-- `indice_key(str)`：该输入用于复用之前计算的索引信息
-- `mode(str)`：区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积
-### 返回值
-- `SparseConvTensor`：存储了输出的特征值`out_feature`，对应索引位置`out_indices`和对应的spatital_shape。
-### 支持的型号
-- Atlas A2 训练系列产品
-### 约束说明
-- `kernel_size`当前支持数据类型为三维List/Tuple或Int，值域为`[1, 3]`
-- `stride`当前支持数据类型为三维List/Tuple或Int
-- `dilation`，`groups`当前仅支持值为1
-- 对于反向也是同样的约束。
-### 调用示例
-```python
-import torch,torch_npu
-import numpy as np
-from mx_driving import SparseInverseConv3d, SparseConvTensor
-
-def generate_indice(batch, height, width, depth, actual_num):
-    base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num]
-    base_indices = np.sort(base_indices)
-    b_indice = base_indices // (height * width * depth)
-    base_indices = base_indices % (height * width * depth)
-    h_indice = base_indices // (width * depth)
-    base_indices = base_indices % (width * depth)
-    w_indice = base_indices // depth
-    d_indice = base_indices % depth
-    indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num)
-    return indices
-
-actual_num = 20
-batch = 4
-spatial_shape = [9, 9, 9]
-indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu()
-feature = tensor_uniform = torch.rand(actual_num, 16).npu()
-feature.requires_grad = True
-x = SparseConvTensor(feature, indices, spatial_shape, batch)
-net = SparseInverseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu()
-out = net(x)
-dout = torch.ones_like(out.features).float().npu()
-out.features.backward(dout)
-```
\ No newline at end of file
diff --git a/include/csrc/functions.h b/include/csrc/functions.h
index 5c8e1843..4963c8b4 100644
--- a/include/csrc/functions.h
+++ b/include/csrc/functions.h
@@ -177,11 +177,6 @@ std::tuple<at::Tensor, at::Tensor> multi_to_sparse_v2(const at::Tensor& features
 std::tuple<at::Tensor, at::Tensor> npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size,
     at::IntArrayRef stride, at::IntArrayRef padding, int out_channel, at::IntArrayRef outSpatialShape, int batch_size);
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor> npu_sparse_inverse_conv3d(const at::Tensor& feature,
-    const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride,
-    at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel,
-    at::IntArrayRef outSpatialShape, int batch_size);
-
 std::tuple<at::Tensor, at::Tensor> npu_sparse_conv3d_grad(const at::Tensor& indices_offset,
     const at::Tensor& former_sorted_indices, const at::Tensor& feature, const at::Tensor& weight,
     const at::Tensor& grad);
diff --git a/kernels/op_host/sparse_inverse_conv3d.cpp b/kernels/op_host/sparse_inverse_conv3d.cpp
deleted file mode 100644
index d7be9391..00000000
--- a/kernels/op_host/sparse_inverse_conv3d.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-
-#include "sparse_inverse_conv3d_tiling.h"
-#include "register/op_def_registry.h"
-#include "tiling/tiling_api.h"
-#include "tiling/platform/platform_ascendc.h"
-using namespace ge;
-
-namespace optiling {
-constexpr uint32_t BYTE_BLOCK = 32;
-constexpr uint32_t DTYPE_FP32_BLOCK = 8;
-constexpr uint32_t RESERVED_UB_SIZE = 16 * 1024;
-constexpr uint32_t OTHER_UB_NUMBER = 256;
-
-void SparseInverseConv3dTiling::CalUsedCoreNumAndCoreTask()
-{
-    coreTask = ((actualNum + coreNum - 1) / coreNum / BYTE_BLOCK) * BYTE_BLOCK;
-    usedCoreNum = (actualNum + coreTask - 1) / coreTask;
-    lastCoreTask = (actualNum % coreTask == 0) ? coreTask : actualNum % coreTask;
-}
-
-void SparseInverseConv3dTiling::CalAvailableUbTiling()
-{
-    // featureUb [moveLen, icAlignUp], indicesUb [moveLen, 4], outidxUb [moveLen, 27] outidxPairUb [moveLen, kernelSize, 4]
-    // otherUB The max Value of ic and oc is 256 -> mulUb + sumUb + tmpUb + workUb = 256 * 4 * sizeof(int32 or float32)
-    // weight shape is [..., oc, ic] ->  weightUb = oc * ic
-    uint64_t availableUbSize = ubSizePlatForm - RESERVED_UB_SIZE;
-    uint64_t ubAvailableNumber = availableUbSize / sizeof(float);
-    ubAvailableNumber -= OTHER_UB_NUMBER * 4;
-    ubAvailableNumber -= kernelIC * (kernelIC - DTYPE_FP32_BLOCK + 1) / DTYPE_FP32_BLOCK ;
-    uint32_t partNum = (kernelIC - DTYPE_FP32_BLOCK + 1) / DTYPE_FP32_BLOCK + 4 + 5 * kernelSize;
-    moveLen = ubAvailableNumber / partNum;
-    moveLen = moveLen / BYTE_BLOCK * BYTE_BLOCK;
-}
-
-void SparseInverseConv3dTiling::GetIntArrayList()
-{
-    auto attrsPtr = tilingContext->GetAttrs();
-    auto outSpatialShapePtr = attrsPtr->GetAttrPointer<gert::ContinuousVector>(0);
-    auto stridePtr = attrsPtr->GetAttrPointer<gert::ContinuousVector>(1);
-    auto paddingPtr = attrsPtr->GetAttrPointer<gert::ContinuousVector>(2);
-    auto dilationPtr = attrsPtr->GetAttrPointer<gert::ContinuousVector>(3);
-    auto outputPaddingPtr = attrsPtr->GetAttrPointer<gert::ContinuousVector>(4);
-    auto outSpatialShapeData = reinterpret_cast<const int64_t*>(outSpatialShapePtr->GetData());
-    auto strideData = reinterpret_cast<const int64_t*>(stridePtr->GetData());
-    auto paddingData = reinterpret_cast<const int64_t*>(paddingPtr->GetData());
-    auto dilationData = reinterpret_cast<const int64_t*>(paddingPtr->GetData());
-    auto outputPaddingData = reinterpret_cast<const int64_t*>(paddingPtr->GetData());
-    tilingData.set_outfeatureB(outSpatialShapeData[0]);
-    tilingData.set_outputDepth(outSpatialShapeData[1]);
-    tilingData.set_outputHeight(outSpatialShapeData[2]);
-    tilingData.set_outputWidth(outSpatialShapeData[3]);
-    tilingData.set_strideDepth(strideData[0]);
-    tilingData.set_strideHeight(strideData[1]);
-    tilingData.set_strideWidth(strideData[2]);
-    tilingData.set_paddingDepth(paddingData[0]);
-    tilingData.set_paddingHeight(paddingData[1]);
-    tilingData.set_paddingWidth(paddingData[2]);
-    tilingData.set_dilationDepth(dilationData[0]);
-    tilingData.set_dilationHeight(dilationData[1]);
-    tilingData.set_dilationWidth(dilationData[2]);
-    tilingData.set_outputPaddingDepth(outputPaddingData[0]);
-    tilingData.set_outputPaddingHeight(outputPaddingData[1]);
-    tilingData.set_outputPaddingWidth(outputPaddingData[2]);
-}
-
-ge::graphStatus SparseInverseConv3dTiling::Init()
-{
-    auto platformInfo = tilingContext->GetPlatformInfo();
-    auto attrsPtr = tilingContext->GetAttrs();
-    if (platformInfo == nullptr || attrsPtr == nullptr) {
-        return ge::GRAPH_FAILED;
-    }
-    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
-    coreNum = ascendcPlatform.GetCoreNumAiv();
-    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSizePlatForm);
-    if (coreNum == 0) {
-        return ge::GRAPH_FAILED;
-    }
-    if (tilingContext->GetInputShape(0) == nullptr ||  tilingContext->GetInputShape(2) == nullptr) {
-        return ge::GRAPH_FAILED;
-    }
-    auto feature_shape = tilingContext->GetInputShape(0)->GetStorageShape();
-    auto weight_shape = tilingContext->GetInputShape(2)->GetStorageShape();
-    actualNum = feature_shape.GetDim(0);
-    CalUsedCoreNumAndCoreTask();
-    kernelD = weight_shape.GetDim(0);
-    kernelH = weight_shape.GetDim(1);
-    kernelW = weight_shape.GetDim(2);
-    kernelOC = weight_shape.GetDim(3);
-    kernelIC = weight_shape.GetDim(4);
-    kernelSize = kernelD * kernelH * kernelW;
-    CalAvailableUbTiling();
-    GetIntArrayList();
-}
-
-ge::graphStatus SparseInverseConv3dTiling::RunKernelTiling()
-{
-    tilingContext->SetBlockDim(usedCoreNum);
-    tilingData.set_usedCoreNum(usedCoreNum);
-    tilingData.set_coreTask(coreTask);
-    tilingData.set_lastCoreTask(lastCoreTask);
-    tilingData.set_moveLen(moveLen);
-    tilingData.set_repeatTimes(repeatTimes);
-    tilingData.set_moveTail(moveTail);
-    tilingData.set_lastRepeatTimes(lastRepeatTimes);
-    tilingData.set_lastMoveTail(lastMoveTail);
-    tilingData.set_kernelD(kernelD);
-    tilingData.set_kernelH(kernelH);
-    tilingData.set_kernelW(kernelW);
-    tilingData.set_kernelIC(kernelIC);
-    tilingData.set_kernelOC(kernelOC);
-    tilingData.set_kernelSize(kernelSize);
-    if (tilingContext->GetRawTilingData() == nullptr) {
-        return ge::GRAPH_FAILED;
-    }
-    auto ascendcPlatform = platform_ascendc::PlatformAscendC(tilingContext->GetPlatformInfo());
-    tilingData.SaveToBuffer(tilingContext->GetRawTilingData()->GetData(), tilingContext->GetRawTilingData()->GetCapacity());
-    tilingContext->GetRawTilingData()->SetDataSize(tilingData.GetDataSize());
-    uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize();
-    size_t* currentWorkspace = tilingContext->GetWorkspaceSizes(1);
-    currentWorkspace[0] = sysWorkspaceSize;
-    return ge::GRAPH_SUCCESS;
-}
-
-ge::graphStatus TilingForSparseInverseConv3d(gert::TilingContext* context)
-{
-    if (context == nullptr) {
-        return ge::GRAPH_FAILED;
-    }
-    SparseInverseConv3dTiling tilingObject(context);
-    tilingObject.Init();
-    return tilingObject.RunKernelTiling();
-}
-} // namespace optiling
-
-namespace ge {
-static ge::graphStatus InferShape(gert::InferShapeContext* context)
-{
-    const gert::Shape* featureShape = context->GetInputShape(0);
-    const gert::Shape* indicesShape = context->GetInputShape(1);
-    const gert::Shape* weightShape = context->GetInputShape(2);
-    if (featureShape == nullptr || indicesShape == nullptr || weightShape == nullptr) {
-        return ge::GRAPH_FAILED;
-    }
-    gert::Shape* outShape = context->GetOutputShape(0);
-    gert::Shape* indicesOutShape = context->GetOutputShape(1);
-    gert::Shape* indicesPairShape = context->GetOutputShape(2);
-    if (outShape == nullptr || indicesOutShape == nullptr || indicesPairShape == nullptr) {
-        return ge::GRAPH_FAILED;
-    }
-    uint64_t kernelSize = 1;
-    for (size_t i = 0; i < weightShape->GetDimNum() - 2; i++) {
-        kernelSize *= weightShape->GetDim(i);
-    }
-    uint64_t kernelOC = weightShape->GetDim(3);
-    uint64_t indicesSecondSize = indicesShape->GetDim(1);
-    *outShape = {kernelSize, kernelOC};
-    *indicesOutShape = {kernelSize};
-    *indicesPairShape = {kernelSize, indicesSecondSize};
-    return GRAPH_SUCCESS;
-}
-}
-
-namespace ops {
-class SparseInverseConv3d : public OpDef {
-public:
-    explicit SparseInverseConv3d(const char* name) : OpDef(name)
-    {
-        this->Input("features")
-            .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
-            .AutoContiguous();
-        this->Input("indices")
-            .ParamType(REQUIRED)
-            .DataType({ge::DT_INT32})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
-            .AutoContiguous();
-        this->Input("weight")
-            .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
-            .AutoContiguous();
-
-        this->Output("feature_out")
-            .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
-        this->Output("indices_out")
-            .ParamType(REQUIRED)
-            .DataType({ge::DT_INT32})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
-        this->Output("indices_pair")
-            .ParamType(REQUIRED)
-            .DataType({ge::DT_INT32})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
-
-        this->Attr("out_spatial_shape").ListInt();
-        this->Attr("stride").ListInt();
-        this->Attr("padding").ListInt();
-        this->Attr("dilation").ListInt();
-        this->Attr("output_padding").ListInt();
-
-        this->SetInferShape(ge::InferShape);
-        this->AICore().SetTiling(optiling::TilingForSparseInverseConv3d);
-        this->AICore().AddConfig("ascend910b");
-        this->AICore().AddConfig("ascend910_93");
-    }
-};
-
-OP_ADD(SparseInverseConv3d);
-}
\ No newline at end of file
diff --git a/kernels/op_host/sparse_inverse_conv3d_tiling.h b/kernels/op_host/sparse_inverse_conv3d_tiling.h
deleted file mode 100644
index c6165235..00000000
--- a/kernels/op_host/sparse_inverse_conv3d_tiling.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
- */
-#ifndef SPARSE_INVERSE_CONV3D_TILING_H
-#define SPARSE_INVERSE_CONV3D_TILING_H
-
-#include "register/op_def_registry.h"
-#include "register/tilingdata_base.h"
-#include "tiling/tiling_api.h"
-
-namespace optiling {
-BEGIN_TILING_DATA_DEF(SparseInverseConv3dTilingData)
-    TILING_DATA_FIELD_DEF(uint32_t, usedCoreNum)
-    TILING_DATA_FIELD_DEF(uint32_t, coreTask)
-    TILING_DATA_FIELD_DEF(uint32_t, lastCoreTask)
-    TILING_DATA_FIELD_DEF(uint32_t, moveLen)
-    TILING_DATA_FIELD_DEF(uint32_t, repeatTimes)
-    TILING_DATA_FIELD_DEF(uint32_t, moveTail)
-    TILING_DATA_FIELD_DEF(uint32_t, lastRepeatTimes)
-    TILING_DATA_FIELD_DEF(uint32_t, lastMoveTail)
-    TILING_DATA_FIELD_DEF(uint32_t, kernelD)
-    TILING_DATA_FIELD_DEF(uint32_t, kernelH)
-    TILING_DATA_FIELD_DEF(uint32_t, kernelW)
-    TILING_DATA_FIELD_DEF(uint32_t, kernelIC)
-    TILING_DATA_FIELD_DEF(uint32_t, kernelOC)
-    TILING_DATA_FIELD_DEF(uint32_t, kernelSize)
-    TILING_DATA_FIELD_DEF(uint32_t, outfeatureB)
-    TILING_DATA_FIELD_DEF(uint32_t, outputDepth)
-    TILING_DATA_FIELD_DEF(uint32_t, outputHeight)
-    TILING_DATA_FIELD_DEF(uint32_t, outputWidth)
-    TILING_DATA_FIELD_DEF(uint32_t, strideDepth)
-    TILING_DATA_FIELD_DEF(uint32_t, strideHeight)
-    TILING_DATA_FIELD_DEF(uint32_t, strideWidth)
-    TILING_DATA_FIELD_DEF(uint32_t, paddingDepth)
-    TILING_DATA_FIELD_DEF(uint32_t, paddingHeight)
-    TILING_DATA_FIELD_DEF(uint32_t, paddingWidth)
-    TILING_DATA_FIELD_DEF(uint32_t, dilationDepth)
-    TILING_DATA_FIELD_DEF(uint32_t, dilationHeight)
-    TILING_DATA_FIELD_DEF(uint32_t, dilationWidth)
-    TILING_DATA_FIELD_DEF(uint32_t, outputPaddingDepth)
-    TILING_DATA_FIELD_DEF(uint32_t, outputPaddingHeight)
-    TILING_DATA_FIELD_DEF(uint32_t, outputPaddingWidth)
-END_TILING_DATA_DEF;
-
-REGISTER_TILING_DATA_CLASS(SparseInverseConv3d, SparseInverseConv3dTilingData)
-
-class SparseInverseConv3dTiling {
-public:
-    explicit SparseInverseConv3dTiling(gert::TilingContext* context) : tilingContext(context) {};
-    ge::graphStatus Init();
-    ge::graphStatus RunKernelTiling();
-
-private:
-    void CalUsedCoreNumAndCoreTask();
-    void CalAvailableUbTiling();
-    void GetIntArrayList();
-
-private:
-    SparseInverseConv3dTilingData tilingData;
-    gert::TilingContext* tilingContext = nullptr;
-    uint32_t coreNum;
-    uint32_t usedCoreNum;
-    uint32_t coreTask;
-    uint32_t lastCoreTask;
-    uint32_t actualNum;
-    uint32_t kernelD;
-    uint32_t kernelH;
-    uint32_t kernelW;
-    uint32_t kernelOC;
-    uint32_t kernelIC;
-    uint32_t kernelSize;
-    uint64_t ubSizePlatForm;
-    uint32_t moveLen;
-    uint32_t repeatTimes;
-    uint32_t moveTail;
-    uint32_t lastRepeatTimes;
-    uint32_t lastMoveTail;
-};
-} // namespace optiling
-#endif // SPARSE_INVERSE_CONV3D_TILING_H
diff --git a/kernels/op_kernel/sparse_inverse_conv3d.cpp b/kernels/op_kernel/sparse_inverse_conv3d.cpp
deleted file mode 100644
index 07d1e681..00000000
--- a/kernels/op_kernel/sparse_inverse_conv3d.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
- */
-
-#include "kernel_operator.h"
-using namespace AscendC;
-
-namespace {
-constexpr static int32_t BUFFER_NUM = 1;
-};
-
-class KernelSparseInverseConv3d {
-public:
-    __aicore__ inline KernelSparseInverseConv3d() {}
-    __aicore__ inline void Init(GM_ADDR features, GM_ADDR indices, GM_ADDR weight, GM_ADDR feature_out, GM_ADDR indices_out, GM_ADDR indices_pair, GM_ADDR workspace, SparseInverseConv3dTilingData *tiling_data, TPipe *tmpPipe)
-    {
-        pipe = tmpPipe;
-        curBlockIdx = GetBlockIdx();
-        // features dtype must be same with weight
-        dataAlign = blockBytes / sizeof(DTYPE_FEATURES);
-        maskAlign = dataAlign * 8;
-        initTilingData(tiling_data);
-        calculateReduceSum();
-        kernelICAlign = AlignUp(kernelIC, dataAlign);
-        uint64_t beginOffset = curBlockIdx * coreTask;
-
-        uint32_t valueBlockNum = blockBytes / sizeof(DTYPE_WEIGHT);
-        uint32_t idxBlockNum = blockBytes / sizeof(DTYPE_INDICES);
-
-        if (curBlockIdx < usedCoreNum - 1) {
-            taskNum = coreTask;
-            coreRepeatTimes = repeatTimes;
-            coreMoveTail = moveTail;
-        } else {
-            taskNum = lastCoreTask;
-            coreRepeatTimes = lastRepeatTimes;
-            coreMoveTail = lastMoveTail;
-        }
-
-        featuresGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_FEATURES *>(features) + beginOffset * kernelIC);
-        indicesGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES *>(indices) + beginOffset * 4);
-        weightGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_WEIGHT *>(weight));
-
-        outputFeatureGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_FEATURES *>(feature_out) + beginOffset * kernelSize * kernelOC);
-        outputIndicesGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES *>(indices_out) + beginOffset * kernelSize);
-        outputIndicesPairGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES *>(indices_pair) + beginOffset * kernelSize * 4);
-
-        pipe->InitBuffer(featuresQueue, BUFFER_NUM, AlignUp(kernelIC, valueBlockNum) * moveLen * sizeof(DTYPE_FEATURES));
-        pipe->InitBuffer(indicesQueue, BUFFER_NUM, moveLen * 4 * sizeof(DTYPE_INDICES));
-        pipe->InitBuffer(weightQueue, BUFFER_NUM, kernelOC * AlignUp(kernelIC, valueBlockNum) * sizeof(DTYPE_WEIGHT));
-
-        pipe->InitBuffer(mulTmpUB, AlignUp(kernelIC, valueBlockNum) * sizeof(DTYPE_FEATURE_OUT));
-        pipe->InitBuffer(sumTmpUB, AlignUp(kernelIC, valueBlockNum) * sizeof(DTYPE_FEATURE_OUT));
-        pipe->InitBuffer(outTmpUB, AlignUp(kernelOC, valueBlockNum) * sizeof(DTYPE_FEATURE_OUT));
-
-        pipe->InitBuffer(outIndicesUB, moveLen * kernelSize * sizeof(DTYPE_INDICES));
-        pipe->InitBuffer(outIndicesPairUB, moveLen * kernelSize * 4 * sizeof(DTYPE_INDICES));
-        pipe->InitBuffer(workUB, workSize * sizeof(DTYPE_FEATURE_OUT));
-    }
-
-    __aicore__ inline void Process()
-    {
-        for (uint32_t i = 0; i < coreRepeatTimes; i++) {
-            Compute(i);
-            pipe_barrier(PIPE_ALL);
-        }
-    }
-
-private:
-
-    __aicore__ inline void initTilingData(SparseInverseConv3dTilingData *tiling_data)
-    {
-        usedCoreNum = tiling_data->usedCoreNum;
-        coreTask = tiling_data->coreTask;
-        lastCoreTask = tiling_data->lastCoreTask;
-
-        moveLen = tiling_data->moveLen;
-
-        repeatTimes = tiling_data->repeatTimes;
-        moveTail = tiling_data->moveTail;
-        lastRepeatTimes = tiling_data->lastRepeatTimes;
-        lastMoveTail = tiling_data->lastMoveTail;
-
-        kernelIC = tiling_data->kernelIC;
-        kernelOC = tiling_data->kernelOC;
-        kernelD = tiling_data-> kernelD;
-        kernelH = tiling_data->kernelH;
-        kernelW = tiling_data->kernelW;
-
-        outfeatureB = tiling_data->outfeatureB;
-        outputDepth = tiling_data->outputDepth;
-        outputHeight = tiling_data->outputHeight;
-        outputWidth = tiling_data->outputWidth;
-        kernelSize = tiling_data->kernelSize;
-
-        strideDepth = tiling_data->strideDepth;
-        strideHeight = tiling_data->strideHeight;
-        strideWidth = tiling_data->strideWidth;
-
-        paddingDepth = tiling_data->paddingDepth;
-        paddingHeight = tiling_data->paddingHeight;
-        paddingWidth = tiling_data->paddingWidth;
-
-        dilationDepth = tiling_data->dilationDepth;
-        dilationHeight = tiling_data->dilationHeight;
-        dilationWidth = tiling_data->dilationWidth;
-
-        outputPaddingDepth = tiling_data->outputPaddingDepth;
-        outputPaddingHeight = tiling_data->outputPaddingHeight;
-        outputPaddingWidth = tiling_data->outputPaddingWidth;
-    }
-
-    __aicore__ inline void Compute(uint32_t query)
-    {
-        uint32_t taskOffset = query * moveLen;
-        uint32_t forMoveLen = moveLen;
-        if (query == coreRepeatTimes - 1) {
-            forMoveLen = coreMoveTail;
-        }
-
-        DataCopyExtParams featureCopyParams {(uint16_t)forMoveLen, (uint32_t)(kernelIC * sizeof(DTYPE_WEIGHT)), 0, 0, 0};
-        DataCopyExtParams weightCopyParams {(uint16_t)kernelOC, (uint32_t)(kernelIC * sizeof(DTYPE_WEIGHT)), 0, 0, 0};
-        DataCopyExtParams indicesCopyParams {1, (uint32_t)(forMoveLen * 4 * sizeof(DTYPE_INDICES)), 0, 0, 0};
-
-        DataCopyExtParams outCopyParams {1, (uint32_t)(kernelOC * sizeof(DTYPE_WEIGHT)), 0, 0, 0};
-        DataCopyExtParams outIndicesCopyParams {1, (uint32_t)(forMoveLen * kernelSize * sizeof(DTYPE_INDICES)), 0, 0, 0};
-        DataCopyExtParams outPairCopyParams {1, (uint32_t)(forMoveLen * kernelSize * 4 * sizeof(DTYPE_INDICES)), 0, 0, 0};
-
-        DataCopyPadExtParams<DTYPE_WEIGHT> featurePadParams{true, 0, 0, 0};
-        DataCopyPadExtParams<DTYPE_WEIGHT> weightPadParams{true, 0, 0, 0};
-        DataCopyPadExtParams<DTYPE_INDICES> indicesPadParams{true, 0, 0, 0};
-
-        LocalTensor<DTYPE_FEATURES> featuresLocal = featuresQueue.AllocTensor<DTYPE_FEATURES>();
-        LocalTensor<DTYPE_INDICES> indicesLocal = indicesQueue.AllocTensor<DTYPE_INDICES>();
-        LocalTensor<DTYPE_WEIGHT> weightLocal = weightQueue.AllocTensor<DTYPE_WEIGHT>();
-
-        LocalTensor<DTYPE_FEATURE_OUT> mulTemp = mulTmpUB.Get<DTYPE_FEATURE_OUT>();
-        LocalTensor<DTYPE_FEATURE_OUT> sumTemp = sumTmpUB.Get<DTYPE_FEATURE_OUT>();
-        LocalTensor<DTYPE_FEATURE_OUT> workLocal = workUB.Get<DTYPE_FEATURE_OUT>();
-        LocalTensor<DTYPE_FEATURE_OUT> outTmpLocal = outTmpUB.Get<DTYPE_FEATURE_OUT>();
-
-        LocalTensor<DTYPE_INDICES> outIndicesTemp = outIndicesUB.Get<DTYPE_INDICES>();
-        LocalTensor<DTYPE_INDICES> outIndicesPairTemp = outIndicesPairUB.Get<DTYPE_INDICES>();
-
-        DTYPE_INDICES onesVal = -1;
-        Duplicate<DTYPE_INDICES>(outIndicesTemp, onesVal, moveLen * kernelSize);
-
-        event_t eventIDSToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::S_MTE2));
-        event_t eventIDSToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3));
-        event_t eventIDMTE2ToV = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V));
-        event_t eventIDMTE2ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3));
-        event_t eventIDMTE2ToS = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S));
-        event_t eventIDVToS = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_S));
-        event_t eventIDMTE3ToMTE2 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2));
-
-        SetFlag<HardEvent::S_MTE2>(eventIDSToMTE2);
-        WaitFlag<HardEvent::S_MTE2>(eventIDSToMTE2);
-        DataCopyPad(indicesLocal, indicesGm[taskOffset * 4], indicesCopyParams, indicesPadParams);
-        pipe_barrier(PIPE_MTE2);
-
-        for (uint32_t i = 0; i < forMoveLen; i++) {
-            // GetValue feature's locations
-            int32_t idxOffset = i * 4;
-            int32_t featureB = indicesLocal.GetValue(idxOffset);
-            int32_t featureD = indicesLocal.GetValue(idxOffset + 1) * strideDepth - 2 * paddingDepth + outputPaddingDepth;
-            int32_t featureH = indicesLocal.GetValue(idxOffset + 2) * strideHeight - 2 * paddingHeight + outputPaddingHeight;
-            int32_t featureW = indicesLocal.GetValue(idxOffset + 3) * strideWidth - 2 * paddingWidth + outputPaddingWidth;
-            int32_t beginOutputOffset = featureB * outputDepth * outputHeight * outputWidth + featureD * outputHeight * outputWidth + featureH * outputWidth + featureW;
-            SetFlag<HardEvent::S_MTE2>(eventIDSToMTE2);
-            WaitFlag<HardEvent::S_MTE2>(eventIDSToMTE2);
-            DataCopyPad(featuresLocal, featuresGm[(taskOffset + i) * kernelIC], featureCopyParams, featurePadParams);
-
-            for (int32_t ix = 0; ix < kernelD; ix++) {
-                uint32_t xOffset = (uint32_t)ix * dilationDepth * outputHeight * outputWidth;
-                for (int32_t iy = 0; iy < kernelH; iy++) {
-                    uint32_t yOffset = (uint32_t)iy * dilationHeight * outputWidth;
-                    for (int32_t iz = 0; iz < kernelW; iz++) {
-                        uint32_t zOffset = (uint32_t)iy * dilationWidth;
-                        uint32_t gmOutValueOffset = beginOutputOffset + xOffset + yOffset + zOffset;
-                        uint32_t convOffset = kernelSize - 1 - (ix * kernelH * kernelW + iy * kernelW + iz);
-                        uint32_t weightOffset = convOffset * kernelIC * kernelOC;
-                        SetFlag<HardEvent::S_MTE2>(eventIDSToMTE2);
-                        WaitFlag<HardEvent::S_MTE2>(eventIDSToMTE2);
-                        DataCopyPad(weightLocal, weightGm[weightOffset], weightCopyParams, weightPadParams);
-                        SetFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
-                        WaitFlag<HardEvent::MTE2_V>(eventIDMTE2ToV);
-                        for (int32_t oc = 0; oc < kernelOC; oc++) {
-                            Mul(mulTemp, featuresLocal, weightLocal[oc * kernelICAlign], kernelICAlign);
-                            pipe_barrier(PIPE_V);
-                            ReduceSum<DTYPE_FEATURE_OUT>(sumTemp, mulTemp, workLocal, kernelICAlign);
-                            SetFlag<HardEvent::V_S>(eventIDVToS);
-                            WaitFlag<HardEvent::V_S>(eventIDVToS);
-                            outTmpLocal.SetValue(oc, sumTemp.GetValue(0));
-                        }
-                        int64_t outFeatureOffset = ((taskOffset + i) * kernelSize + convOffset) * kernelOC;
-                        int64_t outInidcesOffset = i * kernelSize + convOffset;
-                        int64_t outInidcesPairOffset = (i * kernelSize + convOffset) * 4;
-                        SetFlag<HardEvent::S_MTE3>(eventIDSToMTE3);
-                        WaitFlag<HardEvent::S_MTE3>(eventIDSToMTE3);
-                        DataCopyPad(outputFeatureGm[outFeatureOffset], outTmpLocal, outCopyParams);
-                        outIndicesTemp.SetValue(outInidcesOffset, gmOutValueOffset);
-                        outIndicesPairTemp.SetValue(outInidcesPairOffset, featureB);
-                        outIndicesPairTemp.SetValue(outInidcesPairOffset + 1, featureD + ix * dilationDepth);
-                        outIndicesPairTemp.SetValue(outInidcesPairOffset + 2, featureH + iy * dilationHeight);
-                        outIndicesPairTemp.SetValue(outInidcesPairOffset + 3, featureW + iy * dilationWidth);
-                    }
-                }
-            }
-            pipe_barrier(PIPE_ALL);
-        }
-        DataCopyPad(outputIndicesGm[taskOffset * kernelSize], outIndicesTemp, outIndicesCopyParams);
-        DataCopyPad(outputIndicesPairGm[taskOffset * kernelSize * 4], outIndicesPairTemp, outPairCopyParams);
-        featuresQueue.FreeTensor(featuresLocal);
-        indicesQueue.FreeTensor(indicesLocal);
-        weightQueue.FreeTensor(weightLocal);
-    }
-    __aicore__ inline void calculateReduceSum()
-    {
-        mulmask = maskAlign;
-        if (mulmask > kernelIC) {
-            mulmask = kernelIC;
-        }
-        mulRepeatTimes = AlignUp(kernelIC, mulmask);
-        workSize = AlignUp(mulRepeatTimes, dataAlign);
-    }
-    __aicore__ inline uint32_t Max(int32_t a, int32_t b)
-    {
-        if (a > b)  return a;
-        return b;
-    }
-    __aicore__ inline uint32_t Min(int32_t a, int32_t b)
-    {
-        if (a > b)  return b;
-        return a;
-    }
-
-private:
-// Private Member
-    TPipe *pipe;
-    GlobalTensor<DTYPE_FEATURES> featuresGm, weightGm, outputFeatureGm;
-    GlobalTensor<DTYPE_INDICES> indicesGm, outputIndicesGm, outputIndicesPairGm;
-
-    TQue<QuePosition::VECIN, 1> featuresQueue, indicesQueue, weightQueue;
-    TBuf<TPosition::VECCALC> mulTmpUB, sumTmpUB, workUB, outTmpUB, outIndicesUB, outIndicesPairUB;
-
-    uint32_t usedCoreNum;
-    uint32_t coreTask;
-    uint32_t lastCoreTask;
-
-    uint32_t moveLen;
-
-    uint32_t repeatTimes;
-    uint32_t moveTail;
-    uint32_t lastRepeatTimes;
-    uint32_t lastMoveTail;
-
-    uint32_t kernelOC;
-    uint32_t kernelIC;
-    uint32_t kernelD;
-    uint32_t kernelH;
-    uint32_t kernelW;
-    uint32_t kernelSize;
-
-    uint32_t outfeatureB;
-    uint32_t outputDepth;
-    uint32_t outputHeight;
-    uint32_t outputWidth;
-
-    uint32_t strideDepth;
-    uint32_t strideHeight;
-    uint32_t strideWidth;
-
-    uint32_t paddingDepth;
-    uint32_t paddingHeight;
-    uint32_t paddingWidth;
-
-    uint32_t dilationDepth;
-    uint32_t dilationHeight;
-    uint32_t dilationWidth;
-
-    uint32_t outputPaddingDepth;
-    uint32_t outputPaddingHeight;
-    uint32_t outputPaddingWidth;
-
-    uint32_t blockBytes{32};
-    uint32_t curBlockIdx;
-    uint32_t dataAlign;
-    uint32_t taskNum;
-    uint32_t coreRepeatTimes;
-    uint32_t coreMoveTail;
-    uint32_t kernelICAlign;
-    uint32_t maskAlign;
-    uint32_t mulmask;
-    uint32_t mulRepeatTimes;
-    uint32_t workSize;
-};
-extern "C" __global__ __aicore__ void sparse_inverse_conv3d(GM_ADDR features, GM_ADDR indices, GM_ADDR weight, GM_ADDR feature_out, GM_ADDR indices_out, GM_ADDR indices_pair, GM_ADDR workspace, GM_ADDR tiling) {
-    SetSysWorkspace(workspace);
-    GET_TILING_DATA(tiling_data, tiling);
-    TPipe pipe;
-    KernelSparseInverseConv3d op;
-    op.Init(features, indices, weight, feature_out, indices_out, indices_pair, workspace, &tiling_data, &pipe);
-    op.Process();
-}
diff --git a/mx_driving/_C/__init__.pyi b/mx_driving/_C/__init__.pyi
index 7876d7be..e27127bb 100644
--- a/mx_driving/_C/__init__.pyi
+++ b/mx_driving/_C/__init__.pyi
@@ -196,19 +196,6 @@ def npu_sparse_conv3d(
     outSpatialShape: Tuple[int, int, int],
     batch_size: int,
 ) -> Tuple[torch.Tensor, torch.Tensor]: ...
-def npu_sparse_inverse_conv3d(
-    feature: torch.Tensor,
-    indices: torch.Tensor,
-    weight: torch.Tensor,
-    kernel_size: Tuple[int, int, int],
-    stride: Tuple[int, int, int],
-    padding: Tuple[int, int, int],
-    dilation: Tuple[int, int, int],
-    output_padding: Tuple[int, int, int],
-    out_channel: int,
-    outSpatialShape: Tuple[int, int, int],
-    batch_size: int,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ...
 def npu_sparse_conv3d_grad(
     indices_offset: torch.Tensor,
     former_sorted_indices: torch.Tensor,
diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py
index 213f811f..2030cce9 100644
--- a/mx_driving/__init__.py
+++ b/mx_driving/__init__.py
@@ -1,7 +1,6 @@
 __all__ = [
     "RoIPointPool3d",
     "SparseConv3d",
-    "SparseInverseConv3d",
     "SubMConv3d",
     "SparseConvTensor",
     "SparseModule",
@@ -66,7 +65,7 @@ import os
 import mx_driving._C
 
 from .modules.roi_point_pool_3d import RoIPointPool3d
-from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d
+from .modules.sparse_conv import SparseConv3d, SubMConv3d
 from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential
 from .modules.voxelization import Voxelization
 from .ops.assign_score_withk import assign_score_withk
diff --git a/mx_driving/csrc/SparseInverseConv3d.cpp b/mx_driving/csrc/SparseInverseConv3d.cpp
deleted file mode 100644
index e4ad5023..00000000
--- a/mx_driving/csrc/SparseInverseConv3d.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2024 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "csrc/OpApiCommon.h"
-#include "csrc/functions.h"
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> npu_sparse_inverse_conv3d(const at::Tensor& feature,
-    const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride,
-    at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel,
-    at::IntArrayRef outSpatialShape, int batch_size)
-{
-    // check Tensor Device is NPU
-    TORCH_CHECK_NPU(feature);
-    TORCH_CHECK_NPU(indices);
-    TORCH_CHECK_NPU(weight);
-
-    // Calculate kernelSize
-    int64_t kernelsum = 1;
-    for (int32_t i = 0; i < kernel_size.size(); i++) {
-        kernelsum *= kernel_size[i];
-    }
-    // to create memory of teh output
-    auto indices_size = indices.sizes();
-    int64_t outputsum = indices_size[0] * kernelsum;
-    c10::SmallVector<int64_t, 8> output_size = {outputsum, out_channel};
-    c10::SmallVector<int64_t, 8> indices_out_size = {outputsum};
-    c10::SmallVector<int64_t, 8> indices_pairs_size = {outputsum, indices_size[1]};
-
-    at::Tensor out = at::empty(output_size, feature.options());
-    at::Tensor indices_out = at::empty(indices_out_size, indices.options()).fill_(-1);
-    at::Tensor indices_pairs = at::empty(indices_pairs_size, indices.options()).fill_(-1);
-
-    c10::SmallVector<int64_t, 8> spatial_size = {
-        batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel};
-    at::IntArrayRef outputShape = at::IntArrayRef(spatial_size);
-    // weight [,,,in_channels, out_channels] -> [,,,out_channels, in_channels]
-    at::Tensor weight_trans = weight.transpose(-1, -2).contiguous();
-    EXEC_NPU_CMD(aclnnSparseInverseConv3d, feature, indices, weight_trans, outputShape, stride, padding, dilation,
-        output_padding, out, indices_out, indices_pairs);
-    return std::tie(out, indices_pairs, indices_out);
-}
diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp
index 04bb3285..dc4f5b2e 100644
--- a/mx_driving/csrc/pybind.cpp
+++ b/mx_driving/csrc/pybind.cpp
@@ -184,9 +184,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     // npu_sparse_conv3d
     m.def("npu_sparse_conv3d", &npu_sparse_conv3d);
 
-    // npu_sparse_inverse_conv3d
-    m.def("npu_sparse_inverse_conv3d", &npu_sparse_inverse_conv3d);
-
     // multi_to_sparse
     m.def("multi_to_sparse", &multi_to_sparse);
 
diff --git a/mx_driving/modules/sparse_conv.py b/mx_driving/modules/sparse_conv.py
index 62ee6b24..420ed789 100644
--- a/mx_driving/modules/sparse_conv.py
+++ b/mx_driving/modules/sparse_conv.py
@@ -186,29 +186,7 @@ class SparseConvolution(SparseModule):
     def forward(self, input_):
         if not isinstance(input_, SparseConvTensor):
             raise RuntimeError("input_ is not SparseConvTensor")
-        if self.inverse:
-            out_spatial_shape = get_inverse_conv_output_size(
-                input_.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding
-            )
-            out_spatial_shape = [int(i) for i in out_spatial_shape]
-            if not isinstance(out_spatial_shape, list):
-                out_spatial_shape = out_spatial_shape.tolist()
-            out_features, outidx = Fsp.indice_inverse_conv(
-                input_.features,
-                input_.indices,
-                self.weight,
-                out_spatial_shape,
-                self.out_channels,
-                input_.batch_size,
-                self.kernel_size,
-                self.stride,
-                self.padding,
-                self.dilation,
-                self.output_padding,
-                self.groups,
-                self.bias,
-            )
-        elif not self.subm:
+        if not self.subm:
             out_spatial_shape = get_conv_output_size(
                 input_.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation
             )
@@ -230,7 +208,7 @@ class SparseConvolution(SparseModule):
                 self.bias,
             )
         else:
-            
+
             out_spatial_shape = input_.spatial_shape
             out_spatial_shape = [int(i) for i in out_spatial_shape]
             if not isinstance(out_spatial_shape, list):
@@ -335,34 +313,3 @@ class SubMConv3d(SparseConvolution):
             mode=mode,
         )
 
-
-class SparseInverseConv3d(SparseConvolution):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        stride=1,
-        padding=0,
-        dilation=1,
-        groups=1,
-        bias=True,
-        inverse=True,
-        indice_key=None,
-        mode="mmcv",
-    ):
-        super().__init__(
-            3,
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            groups,
-            bias,
-            subm=False,
-            inverse=True,
-            indice_key=indice_key,
-            mode=mode,
-        )
diff --git a/mx_driving/ops/sparse_functional.py b/mx_driving/ops/sparse_functional.py
index b7018ed8..5a1906e5 100644
--- a/mx_driving/ops/sparse_functional.py
+++ b/mx_driving/ops/sparse_functional.py
@@ -76,69 +76,6 @@ class SparseConvFunction(Function):
         return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None
 
 
-class SparseInverseConvFunction(Function):
-
-    @staticmethod
-    # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
-    def forward(
-        ctx: Any,
-        features,
-        indices,
-        weight,
-        out_spatial_shape,
-        out_channels,
-        batch_size,
-        kernel_size,
-        stride,
-        padding,
-        dilation,
-        output_padding,
-        groups,
-        bias,
-    ) -> torch.Tensor:
-        device = features.device
-        weight = weight.data
-        # calculate the index pair
-        out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d(
-            features,
-            indices,
-            weight,
-            kernel_size,
-            stride,
-            padding,
-            dilation,
-            output_padding,
-            out_channels,
-            out_spatial_shape,
-            batch_size,
-        )
-        # sort and nonezero
-        to_insert = torch.tensor(-1).to(device)
-        sorted_idx, sorted_idx_to_former_indices = torch.sort(ouidx_offset.view(torch.float32))
-        new_sorted_idx = torch.cat((to_insert.view(1), sorted_idx.view(torch.int32)), 0)
-        new_sorted_idx_2 = torch.cat((sorted_idx.view(torch.int32), to_insert.view(1)), 0)
-        sub_result = new_sorted_idx - new_sorted_idx_2
-        unique_indices_offset = torch.nonzero(sub_result != 0)
-        # matmul
-        out_features, outidx = mx_driving._C.multi_to_sparse(
-            out_features, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int()
-        )
-        outidx, outidx_ = torch.chunk(outidx, 2, dim=1)
-
-        ctx.save_for_backward(features, weight, sorted_idx_to_former_indices.int(), unique_indices_offset.int())
-        return out_features, outidx
-
-    @staticmethod
-    @once_differentiable
-    # pylint: disable=too-many-return-values
-    def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple:
-        features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors
-        feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad(
-            unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features
-        )
-        return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None
-
-
 class SubMConvFunction(Function):
 
     @staticmethod
@@ -199,8 +136,8 @@ class SubMConvFunction(Function):
         weight = weight.permute(0, 1, 2, 4, 3).contiguous()
         weight_permute = weight.view(kernel_num*weight_shape[4], weight_shape[3])
         feature_grad = grad_out_features_iml2col @ weight_permute
-        
-        
+
+
         return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None
 
 
@@ -254,11 +191,10 @@ class SubMConvWithKeyFunction(Function):
         weight = weight.permute(0, 1, 2, 4, 3).contiguous()
         weight_permute = weight.view(kernel_num*weight_shape[4], weight_shape[3])
         feature_grad = grad_out_features_iml2col @ weight_permute
-        
-        
+
+
         return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None
 
 indice_conv = SparseConvFunction.apply
-indice_inverse_conv = SparseInverseConvFunction.apply
 indice_subm_conv = SubMConvFunction.apply
-indice_subm_conv_with_key = SubMConvWithKeyFunction.apply
\ No newline at end of file
+indice_subm_conv_with_key = SubMConvWithKeyFunction.apply
diff --git a/mx_driving/spconv.py b/mx_driving/spconv.py
index c9e9ba98..15df8265 100644
--- a/mx_driving/spconv.py
+++ b/mx_driving/spconv.py
@@ -1,6 +1,6 @@
 import warnings
 
-from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d
+from .modules.sparse_conv import SparseConv3d, SubMConv3d
 from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential
 
 warnings.warn(
diff --git a/tests/torch/test_sparse_inverse_conv3d.py b/tests/torch/test_sparse_inverse_conv3d.py
deleted file mode 100644
index 95bcaca0..00000000
--- a/tests/torch/test_sparse_inverse_conv3d.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2024, Huawei Technologies.All rights reserved.
-
-"""Compare results between different algos:
-CPU: simple gather-mm-scatter
-Native: Fused gather-mm-scatter
-ImplicitGemm: implicit gemm
-"""
-
-import time
-from pathlib import Path
-
-import numpy as np
-import torch
-import torch_npu
-from torch import nn
-from mx_driving.spconv import SparseSequential, SparseConvTensor, SparseInverseConv3d
-
-
-def generate_sparse_data(shape,
-                        num_points,
-                        num_channels,
-                        integer=False,
-                        data_range=(-1, 1),
-                        with_dense=True,
-                        dtype=np.float32,
-                        shape_scale=1):
-    dense_shape = shape
-    ndim = len(dense_shape)
-    # num_points = np.random.randint(10, 100, size=[batch_size, ndim])
-    num_points = np.array(num_points)
-    # num_points = np.array([3, 2])
-    batch_size = len(num_points)
-    batch_indices = []
-    coors_total = np.stack(np.meshgrid(*[np.arange(0, s // shape_scale) for s in shape]),
-                           axis=-1)
-    coors_total = coors_total.reshape(-1, ndim) * shape_scale
-    for i in range(batch_size):
-        np.random.shuffle(coors_total)
-        inds_total = coors_total[:num_points[i]]
-        inds_total = np.pad(inds_total, ((0, 0), (0, 1)),
-                            mode="constant",
-                            constant_values=i)
-        batch_indices.append(inds_total)
-    if integer:
-        sparse_data = np.random.randint(data_range[0],
-                                        data_range[1],
-                                        size=[num_points.sum(),
-                                              num_channels]).astype(dtype)
-    else:
-        sparse_data = np.random.uniform(data_range[0],
-                                        data_range[1],
-                                        size=[num_points.sum(),
-                                              num_channels]).astype(dtype)
-
-    # sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
-
-    res = {
-        "features": sparse_data.astype(dtype),
-    }
-    if with_dense:
-        dense_data = np.zeros([batch_size, num_channels, *dense_shape],
-                              dtype=sparse_data.dtype)
-        start = 0
-        for i, inds in enumerate(batch_indices):
-            for j, ind in enumerate(inds):
-                dense_slice = (i, slice(None), *ind[:-1])
-                dense_data[dense_slice] = sparse_data[start + j]
-            start += len(inds)
-        res["features_dense"] = dense_data.astype(dtype)
-    batch_indices = np.concatenate(batch_indices, axis=0)
-    res["indices"] = batch_indices.astype(np.int32)
-    return res
-
-
-class Net(nn.Module):
-    def __init__(self, shape):
-        super().__init__()
-        self.net = SparseSequential(
-            SparseInverseConv3d(16, 32, 3)
-        )
-        max_batch_size = 1
-        self.shape = shape
-
-    def forward(self, features, coors, batch_size):
-        x = SparseConvTensor(features,
-                            coors,
-                            self.shape,
-                            batch_size)
-        return self.net(x)
-
-
-def _test_multi_impl(spatial_shape, feature_num, dtype: torch.dtype):
-
-    np.random.seed(50051)
-
-    spatial_shape = [4, 4, 4]
-    sparse_dict = generate_sparse_data(spatial_shape, [feature_num] * 1, 16)
-
-    voxels = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-    coors = np.ascontiguousarray(
-        sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
-    device = torch.device("npu:0")
-
-    voxels_th_npu = torch.from_numpy(voxels).to(device).to(dtype)
-
-    coors_th_npu = torch.from_numpy(coors).to(device)
-    net_cls = Net
-    # npu
-    torch.manual_seed(50051)
-    net_native_npu = net_cls(spatial_shape).to(device).to(dtype)
-
-    out = net_native_npu(voxels_th_npu, coors_th_npu, 1)
-
-
-def test_multi_impl():
-    _test_multi_impl([4, 4, 4], 3, torch.float32)
-    _test_multi_impl([7, 7, 7], 9, torch.float32)
-    _test_multi_impl([12, 13, 14], 100, torch.float32)
-    _test_multi_impl([25, 25, 25], 400, torch.float32)
-
-
-# if __name__ == "__main__":
-#     test_multi_impl()
\ No newline at end of file
-- 
Gitee