diff --git a/kernels/op_kernel/sparse_conv3d_grad_v2.cpp b/kernels/op_kernel/sparse_conv3d_grad_v2.cpp
index 7cf0d03f6a3ddd1ecc8027a3180c8fc53c9ba5ca..5011cab915cf2933a80e5ec8dc0e910ff61b5373 100644
--- a/kernels/op_kernel/sparse_conv3d_grad_v2.cpp
+++ b/kernels/op_kernel/sparse_conv3d_grad_v2.cpp
@@ -21,7 +21,10 @@ public:
         kernelSizeAlign = AlignUp(kernelSize, idxBlockNum);
         kernelICAlign = AlignUp(kernelIC, valueBlockNum);
         uint64_t beginOffset = curBlockIdx * vectorCoreTask;
-        uint64_t initLen = Ceiling(featureCubeTilingData.M, usedVectorCoreNum);
+        if (usedVectorCoreNum <= 0) {
+            return ;
+        }
+        uint64_t initLen = featureCubeTilingData.M / usedVectorCoreNum;
 
         indicesOffsetGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES_OFFSET *>(indices_offset) + beginOffset);
         formerSortedIndicesGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES_OFFSET *>(former_sorted_indices));
@@ -43,7 +46,13 @@ public:
         // workspace Init
         featureInitGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_FEATURE *>(workspace) + initLen * curBlockIdx * kernelSize * kernelOC);
         const float zeros = 0.0;
-        InitGlobalMemory(featureInitGm, initLen * kernelSize * kernelOC, zeros);
+        if (curBlockIdx < usedVectorCoreNum) {
+            if (usedVectorCoreNum - 1 != curBlockIdx) {
+                InitGlobalMemory(featureInitGm, initLen * kernelSize * kernelOC, zeros);
+            } else {
+                InitGlobalMemory(featureInitGm, (featureCubeTilingData.M - initLen * curBlockIdx) * kernelSize * kernelOC, zeros);
+            }
+        }
         SyncAll();
 
         uint64_t featureSingleCoreM, featureSingleCoreN, featureSingleCoreK;
diff --git a/tests/torch/test_sparse_conv_grad.py b/tests/torch/test_sparse_conv_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b664759ea66a6c250d4d411f96ab449e2a705a8
--- /dev/null
+++ b/tests/torch/test_sparse_conv_grad.py
@@ -0,0 +1,135 @@
+from typing import Tuple, Optional
+import torch
+import torch_npu
+from data_cache import golden_data_cache
+
+import mx_driving
+
+
+# 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+@golden_data_cache(__file__)
+def generate_input_data(
+    seed: int = 0,
+    device: str = "npu",
+    min_val: int = 1485,
+    max_val: int = 2481,
+    num_samples: int = 425,
+    features_shape: Tuple[int, int] = (827, 128),
+    weight_shape: Tuple[int, int, int, int, int] = (3, 1, 1, 128, 128),
+    grad_out_shape: Tuple[int, int] = (424, 128)
+) -> Tuple[torch.Tensor, ...]:
+    """
+    生成可重复的随机输入数据
+    
+    参数:
+    - seed: 随机种子 (默认0)
+    - device: 目标设备 (默认'npu')
+    - min_val/max_val: 索引范围 (包含端点)
+    - num_samples: 采样数量
+    - features_shape: 特征矩阵形状
+    - weight_shape: 权重张量形状
+    - grad_out_shape: 梯度输出形状
+    
+    返回:
+    Tuple containing:
+    - unique_indices_offset
+    - sorted_idx_to_former_indices
+    - features
+    - weight
+    - grad_out_features
+    """
+    # 参数校验
+    assert max_val > min_val, "max_val 必须大于 min_val"
+    total_numbers = max_val - min_val + 1
+    assert num_samples <= total_numbers, f"采样数量不能超过 {total_numbers}"
+    
+    # 固定随机种子
+    torch.manual_seed(seed)
+    
+    # 生成设备映射函数
+    def to_device(tensor):
+        return tensor.to(device) if device else tensor
+    
+    # 生成唯一随机索引
+    random_indices = torch.sort(
+        torch.randperm(total_numbers, generator=torch.Generator().manual_seed(seed))[:num_samples]
+    )[0]
+    
+    # 映射到目标范围
+    unique_indices_offset = to_device(
+        (random_indices + min_val).to(torch.int32).reshape(-1, 1)
+    )
+    
+    # 生成排序后的随机索引
+    sorted_idx_to_former_indices = to_device(
+        torch.sort(torch.randint(
+            low=1,
+            high=max_val,
+            size=(max_val,),
+            generator=torch.Generator().manual_seed(seed),
+            dtype=torch.int32
+        ))[0]
+    )
+    
+    # 生成特征矩阵
+    features = to_device(
+        torch.rand(*features_shape, generator=torch.Generator().manual_seed(seed)) * 10 - 5
+    )
+    
+    # 生成权重张量
+    weight = to_device(
+        torch.rand(*weight_shape, generator=torch.Generator().manual_seed(seed)) * 2 - 1
+    )
+    
+    # 生成梯度输出
+    grad_out_features = to_device(
+        torch.rand(*grad_out_shape, generator=torch.Generator().manual_seed(seed)) * 2 - 1
+    )
+    
+    return (
+        unique_indices_offset,
+        sorted_idx_to_former_indices,
+        features,
+        weight,
+        grad_out_features
+    )
+
+
+if __name__ == "__main__":
+    # 定义测试用例列表
+    test_cases = [
+        # 第一个测试用例（原自定义参数）
+        {
+            "seed": 42,
+            "device": "npu",
+            "min_val": 1485,
+            "max_val": 2481,
+            "num_samples": 425,
+            "features_shape": (827, 128),
+            "grad_out_shape": (424, 128),
+            "weight_shape": (3, 1, 1, 128, 128)
+        },
+        # 第二个测试用例（大数值参数）
+        {
+            "seed": 42,
+            "device": "npu",
+            "min_val": 4508310,
+            "max_val": 4516128,
+            "num_samples": 1831,
+            "features_shape": (167264, 16),
+            "grad_out_shape": (1830, 32),
+            "weight_shape": (3, 3, 3, 16, 32)
+        }
+    ]
+
+    # 遍历执行所有测试用例
+    for _, case_params in enumerate(test_cases, 1):
+        
+        # 生成输入数据
+        inputs = generate_input_data(**case_params)
+        unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features = inputs
+        
+        # 调用目标函数
+        feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad(
+            unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features
+        )
\ No newline at end of file