From eccf8dda7147b187b56675ee0bb29e279dc3ece4 Mon Sep 17 00:00:00 2001 From: ZYF-Annarine Date: Sat, 1 Mar 2025 16:09:23 +0800 Subject: [PATCH] delete SparseInverConv3d --- docs/api/README.md | 1 - docs/api/context/SparseInverseConv3d[beta].md | 60 ---- include/csrc/functions.h | 5 - kernels/op_host/sparse_inverse_conv3d.cpp | 218 ------------- .../op_host/sparse_inverse_conv3d_tiling.h | 80 ----- kernels/op_kernel/sparse_inverse_conv3d.cpp | 304 ------------------ mx_driving/_C/__init__.pyi | 13 - mx_driving/__init__.py | 3 +- mx_driving/csrc/SparseInverseConv3d.cpp | 54 ---- mx_driving/csrc/pybind.cpp | 3 - mx_driving/modules/sparse_conv.py | 57 +--- mx_driving/ops/sparse_functional.py | 74 +---- mx_driving/spconv.py | 2 +- tests/torch/test_sparse_inverse_conv3d.py | 123 ------- 14 files changed, 9 insertions(+), 988 deletions(-) delete mode 100644 docs/api/context/SparseInverseConv3d[beta].md delete mode 100644 kernels/op_host/sparse_inverse_conv3d.cpp delete mode 100644 kernels/op_host/sparse_inverse_conv3d_tiling.h delete mode 100644 kernels/op_kernel/sparse_inverse_conv3d.cpp delete mode 100644 mx_driving/csrc/SparseInverseConv3d.cpp delete mode 100644 tests/torch/test_sparse_inverse_conv3d.py diff --git a/docs/api/README.md b/docs/api/README.md index 16679c8e..3d1fc814 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -41,7 +41,6 @@ - 稀疏 - [SparseConv3d](./context/SparseConv3d.md) - [SubmSparseConv3d](./context/SubMConv3d.md) - - [SparseInverseConv3d[beta]](./context/SparseInverseConv3d[beta].md) - 融合 - [multi_scale_deformable_attn](./context/multi_scale_deformable_attn.md) - [deformable_aggregation](./context/deformable_aggregation.md) diff --git a/docs/api/context/SparseInverseConv3d[beta].md b/docs/api/context/SparseInverseConv3d[beta].md deleted file mode 100644 index e32c7e6b..00000000 --- a/docs/api/context/SparseInverseConv3d[beta].md +++ /dev/null @@ -1,60 +0,0 @@ -## SparseInverseConv3d(beta) -### 接口原型 -```python -mx_driving.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0,bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -兼容 -```python -mx_driving.spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, output_padding=0,bias=True, indice_key=None, mode='mmcv') -> SparseConvTensor -``` -### 功能描述 -稀疏逆卷积 -### 参数说明 -- `in_channels(int)`:输入数据的通道数 -- `out_channels(int)`:输出通道数 -- `kernel_size(List(int)/Tuple(int)/int)`:卷积神经网络中卷积核的大小 -- `stride(List(int)/Tuple(int)/int)`:卷积核在输入数据上滑动时的步长 -- `dilation(List(int)/Tuple(int)/int)`:空洞卷积大小 -- `groups(int)`:分组卷积 -- `bias(bool)`:偏置项 -- `indice_key(str)`:该输入用于复用之前计算的索引信息 -- `mode(str)`:区分了`mmcv`和`spconv`两种不同框架下的稀疏卷积 -### 返回值 -- `SparseConvTensor`:存储了输出的特征值`out_feature`,对应索引位置`out_indices`和对应的spatital_shape。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 约束说明 -- `kernel_size`当前支持数据类型为三维List/Tuple或Int,值域为`[1, 3]` -- `stride`当前支持数据类型为三维List/Tuple或Int -- `dilation`,`groups`当前仅支持值为1 -- 对于反向也是同样的约束。 -### 调用示例 -```python -import torch,torch_npu -import numpy as np -from mx_driving import SparseInverseConv3d, SparseConvTensor - -def generate_indice(batch, height, width, depth, actual_num): - base_indices = np.random.permutation(np.arange(batch * height * width * depth))[:actual_num] - base_indices = np.sort(base_indices) - b_indice = base_indices // (height * width * depth) - base_indices = base_indices % (height * width * depth) - h_indice = base_indices // (width * depth) - base_indices = base_indices % (width * depth) - w_indice = base_indices // depth - d_indice = base_indices % depth - indices = np.concatenate((b_indice, h_indice, w_indice, d_indice)).reshape(4, actual_num) - return indices - -actual_num = 20 -batch = 4 -spatial_shape = [9, 9, 9] -indices = torch.from_numpy(generate_indice(batch, spatial_shape[0], spatial_shape[1], spatial_shape[2], actual_num)).int().transpose(0, 1).contiguous().npu() -feature = tensor_uniform = torch.rand(actual_num, 16).npu() -feature.requires_grad = True -x = SparseConvTensor(feature, indices, spatial_shape, batch) -net = SparseInverseConv3d(in_channels=16, out_channels=32, kernel_size=3).npu() -out = net(x) -dout = torch.ones_like(out.features).float().npu() -out.features.backward(dout) -``` \ No newline at end of file diff --git a/include/csrc/functions.h b/include/csrc/functions.h index 5c8e1843..4963c8b4 100644 --- a/include/csrc/functions.h +++ b/include/csrc/functions.h @@ -177,11 +177,6 @@ std::tuple multi_to_sparse_v2(const at::Tensor& features std::tuple npu_sparse_conv3d(const at::Tensor& indices, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, int out_channel, at::IntArrayRef outSpatialShape, int batch_size); -std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, - const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, - at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel, - at::IntArrayRef outSpatialShape, int batch_size); - std::tuple npu_sparse_conv3d_grad(const at::Tensor& indices_offset, const at::Tensor& former_sorted_indices, const at::Tensor& feature, const at::Tensor& weight, const at::Tensor& grad); diff --git a/kernels/op_host/sparse_inverse_conv3d.cpp b/kernels/op_host/sparse_inverse_conv3d.cpp deleted file mode 100644 index d7be9391..00000000 --- a/kernels/op_host/sparse_inverse_conv3d.cpp +++ /dev/null @@ -1,218 +0,0 @@ - -#include "sparse_inverse_conv3d_tiling.h" -#include "register/op_def_registry.h" -#include "tiling/tiling_api.h" -#include "tiling/platform/platform_ascendc.h" -using namespace ge; - -namespace optiling { -constexpr uint32_t BYTE_BLOCK = 32; -constexpr uint32_t DTYPE_FP32_BLOCK = 8; -constexpr uint32_t RESERVED_UB_SIZE = 16 * 1024; -constexpr uint32_t OTHER_UB_NUMBER = 256; - -void SparseInverseConv3dTiling::CalUsedCoreNumAndCoreTask() -{ - coreTask = ((actualNum + coreNum - 1) / coreNum / BYTE_BLOCK) * BYTE_BLOCK; - usedCoreNum = (actualNum + coreTask - 1) / coreTask; - lastCoreTask = (actualNum % coreTask == 0) ? coreTask : actualNum % coreTask; -} - -void SparseInverseConv3dTiling::CalAvailableUbTiling() -{ - // featureUb [moveLen, icAlignUp], indicesUb [moveLen, 4], outidxUb [moveLen, 27] outidxPairUb [moveLen, kernelSize, 4] - // otherUB The max Value of ic and oc is 256 -> mulUb + sumUb + tmpUb + workUb = 256 * 4 * sizeof(int32 or float32) - // weight shape is [..., oc, ic] -> weightUb = oc * ic - uint64_t availableUbSize = ubSizePlatForm - RESERVED_UB_SIZE; - uint64_t ubAvailableNumber = availableUbSize / sizeof(float); - ubAvailableNumber -= OTHER_UB_NUMBER * 4; - ubAvailableNumber -= kernelIC * (kernelIC - DTYPE_FP32_BLOCK + 1) / DTYPE_FP32_BLOCK ; - uint32_t partNum = (kernelIC - DTYPE_FP32_BLOCK + 1) / DTYPE_FP32_BLOCK + 4 + 5 * kernelSize; - moveLen = ubAvailableNumber / partNum; - moveLen = moveLen / BYTE_BLOCK * BYTE_BLOCK; -} - -void SparseInverseConv3dTiling::GetIntArrayList() -{ - auto attrsPtr = tilingContext->GetAttrs(); - auto outSpatialShapePtr = attrsPtr->GetAttrPointer(0); - auto stridePtr = attrsPtr->GetAttrPointer(1); - auto paddingPtr = attrsPtr->GetAttrPointer(2); - auto dilationPtr = attrsPtr->GetAttrPointer(3); - auto outputPaddingPtr = attrsPtr->GetAttrPointer(4); - auto outSpatialShapeData = reinterpret_cast(outSpatialShapePtr->GetData()); - auto strideData = reinterpret_cast(stridePtr->GetData()); - auto paddingData = reinterpret_cast(paddingPtr->GetData()); - auto dilationData = reinterpret_cast(paddingPtr->GetData()); - auto outputPaddingData = reinterpret_cast(paddingPtr->GetData()); - tilingData.set_outfeatureB(outSpatialShapeData[0]); - tilingData.set_outputDepth(outSpatialShapeData[1]); - tilingData.set_outputHeight(outSpatialShapeData[2]); - tilingData.set_outputWidth(outSpatialShapeData[3]); - tilingData.set_strideDepth(strideData[0]); - tilingData.set_strideHeight(strideData[1]); - tilingData.set_strideWidth(strideData[2]); - tilingData.set_paddingDepth(paddingData[0]); - tilingData.set_paddingHeight(paddingData[1]); - tilingData.set_paddingWidth(paddingData[2]); - tilingData.set_dilationDepth(dilationData[0]); - tilingData.set_dilationHeight(dilationData[1]); - tilingData.set_dilationWidth(dilationData[2]); - tilingData.set_outputPaddingDepth(outputPaddingData[0]); - tilingData.set_outputPaddingHeight(outputPaddingData[1]); - tilingData.set_outputPaddingWidth(outputPaddingData[2]); -} - -ge::graphStatus SparseInverseConv3dTiling::Init() -{ - auto platformInfo = tilingContext->GetPlatformInfo(); - auto attrsPtr = tilingContext->GetAttrs(); - if (platformInfo == nullptr || attrsPtr == nullptr) { - return ge::GRAPH_FAILED; - } - auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo); - coreNum = ascendcPlatform.GetCoreNumAiv(); - ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSizePlatForm); - if (coreNum == 0) { - return ge::GRAPH_FAILED; - } - if (tilingContext->GetInputShape(0) == nullptr || tilingContext->GetInputShape(2) == nullptr) { - return ge::GRAPH_FAILED; - } - auto feature_shape = tilingContext->GetInputShape(0)->GetStorageShape(); - auto weight_shape = tilingContext->GetInputShape(2)->GetStorageShape(); - actualNum = feature_shape.GetDim(0); - CalUsedCoreNumAndCoreTask(); - kernelD = weight_shape.GetDim(0); - kernelH = weight_shape.GetDim(1); - kernelW = weight_shape.GetDim(2); - kernelOC = weight_shape.GetDim(3); - kernelIC = weight_shape.GetDim(4); - kernelSize = kernelD * kernelH * kernelW; - CalAvailableUbTiling(); - GetIntArrayList(); -} - -ge::graphStatus SparseInverseConv3dTiling::RunKernelTiling() -{ - tilingContext->SetBlockDim(usedCoreNum); - tilingData.set_usedCoreNum(usedCoreNum); - tilingData.set_coreTask(coreTask); - tilingData.set_lastCoreTask(lastCoreTask); - tilingData.set_moveLen(moveLen); - tilingData.set_repeatTimes(repeatTimes); - tilingData.set_moveTail(moveTail); - tilingData.set_lastRepeatTimes(lastRepeatTimes); - tilingData.set_lastMoveTail(lastMoveTail); - tilingData.set_kernelD(kernelD); - tilingData.set_kernelH(kernelH); - tilingData.set_kernelW(kernelW); - tilingData.set_kernelIC(kernelIC); - tilingData.set_kernelOC(kernelOC); - tilingData.set_kernelSize(kernelSize); - if (tilingContext->GetRawTilingData() == nullptr) { - return ge::GRAPH_FAILED; - } - auto ascendcPlatform = platform_ascendc::PlatformAscendC(tilingContext->GetPlatformInfo()); - tilingData.SaveToBuffer(tilingContext->GetRawTilingData()->GetData(), tilingContext->GetRawTilingData()->GetCapacity()); - tilingContext->GetRawTilingData()->SetDataSize(tilingData.GetDataSize()); - uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); - size_t* currentWorkspace = tilingContext->GetWorkspaceSizes(1); - currentWorkspace[0] = sysWorkspaceSize; - return ge::GRAPH_SUCCESS; -} - -ge::graphStatus TilingForSparseInverseConv3d(gert::TilingContext* context) -{ - if (context == nullptr) { - return ge::GRAPH_FAILED; - } - SparseInverseConv3dTiling tilingObject(context); - tilingObject.Init(); - return tilingObject.RunKernelTiling(); -} -} // namespace optiling - -namespace ge { -static ge::graphStatus InferShape(gert::InferShapeContext* context) -{ - const gert::Shape* featureShape = context->GetInputShape(0); - const gert::Shape* indicesShape = context->GetInputShape(1); - const gert::Shape* weightShape = context->GetInputShape(2); - if (featureShape == nullptr || indicesShape == nullptr || weightShape == nullptr) { - return ge::GRAPH_FAILED; - } - gert::Shape* outShape = context->GetOutputShape(0); - gert::Shape* indicesOutShape = context->GetOutputShape(1); - gert::Shape* indicesPairShape = context->GetOutputShape(2); - if (outShape == nullptr || indicesOutShape == nullptr || indicesPairShape == nullptr) { - return ge::GRAPH_FAILED; - } - uint64_t kernelSize = 1; - for (size_t i = 0; i < weightShape->GetDimNum() - 2; i++) { - kernelSize *= weightShape->GetDim(i); - } - uint64_t kernelOC = weightShape->GetDim(3); - uint64_t indicesSecondSize = indicesShape->GetDim(1); - *outShape = {kernelSize, kernelOC}; - *indicesOutShape = {kernelSize}; - *indicesPairShape = {kernelSize, indicesSecondSize}; - return GRAPH_SUCCESS; -} -} - -namespace ops { -class SparseInverseConv3d : public OpDef { -public: - explicit SparseInverseConv3d(const char* name) : OpDef(name) - { - this->Input("features") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) - .AutoContiguous(); - this->Input("indices") - .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) - .AutoContiguous(); - this->Input("weight") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) - .AutoContiguous(); - - this->Output("feature_out") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Output("indices_out") - .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Output("indices_pair") - .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - - this->Attr("out_spatial_shape").ListInt(); - this->Attr("stride").ListInt(); - this->Attr("padding").ListInt(); - this->Attr("dilation").ListInt(); - this->Attr("output_padding").ListInt(); - - this->SetInferShape(ge::InferShape); - this->AICore().SetTiling(optiling::TilingForSparseInverseConv3d); - this->AICore().AddConfig("ascend910b"); - this->AICore().AddConfig("ascend910_93"); - } -}; - -OP_ADD(SparseInverseConv3d); -} \ No newline at end of file diff --git a/kernels/op_host/sparse_inverse_conv3d_tiling.h b/kernels/op_host/sparse_inverse_conv3d_tiling.h deleted file mode 100644 index c6165235..00000000 --- a/kernels/op_host/sparse_inverse_conv3d_tiling.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - */ -#ifndef SPARSE_INVERSE_CONV3D_TILING_H -#define SPARSE_INVERSE_CONV3D_TILING_H - -#include "register/op_def_registry.h" -#include "register/tilingdata_base.h" -#include "tiling/tiling_api.h" - -namespace optiling { -BEGIN_TILING_DATA_DEF(SparseInverseConv3dTilingData) - TILING_DATA_FIELD_DEF(uint32_t, usedCoreNum) - TILING_DATA_FIELD_DEF(uint32_t, coreTask) - TILING_DATA_FIELD_DEF(uint32_t, lastCoreTask) - TILING_DATA_FIELD_DEF(uint32_t, moveLen) - TILING_DATA_FIELD_DEF(uint32_t, repeatTimes) - TILING_DATA_FIELD_DEF(uint32_t, moveTail) - TILING_DATA_FIELD_DEF(uint32_t, lastRepeatTimes) - TILING_DATA_FIELD_DEF(uint32_t, lastMoveTail) - TILING_DATA_FIELD_DEF(uint32_t, kernelD) - TILING_DATA_FIELD_DEF(uint32_t, kernelH) - TILING_DATA_FIELD_DEF(uint32_t, kernelW) - TILING_DATA_FIELD_DEF(uint32_t, kernelIC) - TILING_DATA_FIELD_DEF(uint32_t, kernelOC) - TILING_DATA_FIELD_DEF(uint32_t, kernelSize) - TILING_DATA_FIELD_DEF(uint32_t, outfeatureB) - TILING_DATA_FIELD_DEF(uint32_t, outputDepth) - TILING_DATA_FIELD_DEF(uint32_t, outputHeight) - TILING_DATA_FIELD_DEF(uint32_t, outputWidth) - TILING_DATA_FIELD_DEF(uint32_t, strideDepth) - TILING_DATA_FIELD_DEF(uint32_t, strideHeight) - TILING_DATA_FIELD_DEF(uint32_t, strideWidth) - TILING_DATA_FIELD_DEF(uint32_t, paddingDepth) - TILING_DATA_FIELD_DEF(uint32_t, paddingHeight) - TILING_DATA_FIELD_DEF(uint32_t, paddingWidth) - TILING_DATA_FIELD_DEF(uint32_t, dilationDepth) - TILING_DATA_FIELD_DEF(uint32_t, dilationHeight) - TILING_DATA_FIELD_DEF(uint32_t, dilationWidth) - TILING_DATA_FIELD_DEF(uint32_t, outputPaddingDepth) - TILING_DATA_FIELD_DEF(uint32_t, outputPaddingHeight) - TILING_DATA_FIELD_DEF(uint32_t, outputPaddingWidth) -END_TILING_DATA_DEF; - -REGISTER_TILING_DATA_CLASS(SparseInverseConv3d, SparseInverseConv3dTilingData) - -class SparseInverseConv3dTiling { -public: - explicit SparseInverseConv3dTiling(gert::TilingContext* context) : tilingContext(context) {}; - ge::graphStatus Init(); - ge::graphStatus RunKernelTiling(); - -private: - void CalUsedCoreNumAndCoreTask(); - void CalAvailableUbTiling(); - void GetIntArrayList(); - -private: - SparseInverseConv3dTilingData tilingData; - gert::TilingContext* tilingContext = nullptr; - uint32_t coreNum; - uint32_t usedCoreNum; - uint32_t coreTask; - uint32_t lastCoreTask; - uint32_t actualNum; - uint32_t kernelD; - uint32_t kernelH; - uint32_t kernelW; - uint32_t kernelOC; - uint32_t kernelIC; - uint32_t kernelSize; - uint64_t ubSizePlatForm; - uint32_t moveLen; - uint32_t repeatTimes; - uint32_t moveTail; - uint32_t lastRepeatTimes; - uint32_t lastMoveTail; -}; -} // namespace optiling -#endif // SPARSE_INVERSE_CONV3D_TILING_H diff --git a/kernels/op_kernel/sparse_inverse_conv3d.cpp b/kernels/op_kernel/sparse_inverse_conv3d.cpp deleted file mode 100644 index 07d1e681..00000000 --- a/kernels/op_kernel/sparse_inverse_conv3d.cpp +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. - */ - -#include "kernel_operator.h" -using namespace AscendC; - -namespace { -constexpr static int32_t BUFFER_NUM = 1; -}; - -class KernelSparseInverseConv3d { -public: - __aicore__ inline KernelSparseInverseConv3d() {} - __aicore__ inline void Init(GM_ADDR features, GM_ADDR indices, GM_ADDR weight, GM_ADDR feature_out, GM_ADDR indices_out, GM_ADDR indices_pair, GM_ADDR workspace, SparseInverseConv3dTilingData *tiling_data, TPipe *tmpPipe) - { - pipe = tmpPipe; - curBlockIdx = GetBlockIdx(); - // features dtype must be same with weight - dataAlign = blockBytes / sizeof(DTYPE_FEATURES); - maskAlign = dataAlign * 8; - initTilingData(tiling_data); - calculateReduceSum(); - kernelICAlign = AlignUp(kernelIC, dataAlign); - uint64_t beginOffset = curBlockIdx * coreTask; - - uint32_t valueBlockNum = blockBytes / sizeof(DTYPE_WEIGHT); - uint32_t idxBlockNum = blockBytes / sizeof(DTYPE_INDICES); - - if (curBlockIdx < usedCoreNum - 1) { - taskNum = coreTask; - coreRepeatTimes = repeatTimes; - coreMoveTail = moveTail; - } else { - taskNum = lastCoreTask; - coreRepeatTimes = lastRepeatTimes; - coreMoveTail = lastMoveTail; - } - - featuresGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_FEATURES *>(features) + beginOffset * kernelIC); - indicesGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES *>(indices) + beginOffset * 4); - weightGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_WEIGHT *>(weight)); - - outputFeatureGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_FEATURES *>(feature_out) + beginOffset * kernelSize * kernelOC); - outputIndicesGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES *>(indices_out) + beginOffset * kernelSize); - outputIndicesPairGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_INDICES *>(indices_pair) + beginOffset * kernelSize * 4); - - pipe->InitBuffer(featuresQueue, BUFFER_NUM, AlignUp(kernelIC, valueBlockNum) * moveLen * sizeof(DTYPE_FEATURES)); - pipe->InitBuffer(indicesQueue, BUFFER_NUM, moveLen * 4 * sizeof(DTYPE_INDICES)); - pipe->InitBuffer(weightQueue, BUFFER_NUM, kernelOC * AlignUp(kernelIC, valueBlockNum) * sizeof(DTYPE_WEIGHT)); - - pipe->InitBuffer(mulTmpUB, AlignUp(kernelIC, valueBlockNum) * sizeof(DTYPE_FEATURE_OUT)); - pipe->InitBuffer(sumTmpUB, AlignUp(kernelIC, valueBlockNum) * sizeof(DTYPE_FEATURE_OUT)); - pipe->InitBuffer(outTmpUB, AlignUp(kernelOC, valueBlockNum) * sizeof(DTYPE_FEATURE_OUT)); - - pipe->InitBuffer(outIndicesUB, moveLen * kernelSize * sizeof(DTYPE_INDICES)); - pipe->InitBuffer(outIndicesPairUB, moveLen * kernelSize * 4 * sizeof(DTYPE_INDICES)); - pipe->InitBuffer(workUB, workSize * sizeof(DTYPE_FEATURE_OUT)); - } - - __aicore__ inline void Process() - { - for (uint32_t i = 0; i < coreRepeatTimes; i++) { - Compute(i); - pipe_barrier(PIPE_ALL); - } - } - -private: - - __aicore__ inline void initTilingData(SparseInverseConv3dTilingData *tiling_data) - { - usedCoreNum = tiling_data->usedCoreNum; - coreTask = tiling_data->coreTask; - lastCoreTask = tiling_data->lastCoreTask; - - moveLen = tiling_data->moveLen; - - repeatTimes = tiling_data->repeatTimes; - moveTail = tiling_data->moveTail; - lastRepeatTimes = tiling_data->lastRepeatTimes; - lastMoveTail = tiling_data->lastMoveTail; - - kernelIC = tiling_data->kernelIC; - kernelOC = tiling_data->kernelOC; - kernelD = tiling_data-> kernelD; - kernelH = tiling_data->kernelH; - kernelW = tiling_data->kernelW; - - outfeatureB = tiling_data->outfeatureB; - outputDepth = tiling_data->outputDepth; - outputHeight = tiling_data->outputHeight; - outputWidth = tiling_data->outputWidth; - kernelSize = tiling_data->kernelSize; - - strideDepth = tiling_data->strideDepth; - strideHeight = tiling_data->strideHeight; - strideWidth = tiling_data->strideWidth; - - paddingDepth = tiling_data->paddingDepth; - paddingHeight = tiling_data->paddingHeight; - paddingWidth = tiling_data->paddingWidth; - - dilationDepth = tiling_data->dilationDepth; - dilationHeight = tiling_data->dilationHeight; - dilationWidth = tiling_data->dilationWidth; - - outputPaddingDepth = tiling_data->outputPaddingDepth; - outputPaddingHeight = tiling_data->outputPaddingHeight; - outputPaddingWidth = tiling_data->outputPaddingWidth; - } - - __aicore__ inline void Compute(uint32_t query) - { - uint32_t taskOffset = query * moveLen; - uint32_t forMoveLen = moveLen; - if (query == coreRepeatTimes - 1) { - forMoveLen = coreMoveTail; - } - - DataCopyExtParams featureCopyParams {(uint16_t)forMoveLen, (uint32_t)(kernelIC * sizeof(DTYPE_WEIGHT)), 0, 0, 0}; - DataCopyExtParams weightCopyParams {(uint16_t)kernelOC, (uint32_t)(kernelIC * sizeof(DTYPE_WEIGHT)), 0, 0, 0}; - DataCopyExtParams indicesCopyParams {1, (uint32_t)(forMoveLen * 4 * sizeof(DTYPE_INDICES)), 0, 0, 0}; - - DataCopyExtParams outCopyParams {1, (uint32_t)(kernelOC * sizeof(DTYPE_WEIGHT)), 0, 0, 0}; - DataCopyExtParams outIndicesCopyParams {1, (uint32_t)(forMoveLen * kernelSize * sizeof(DTYPE_INDICES)), 0, 0, 0}; - DataCopyExtParams outPairCopyParams {1, (uint32_t)(forMoveLen * kernelSize * 4 * sizeof(DTYPE_INDICES)), 0, 0, 0}; - - DataCopyPadExtParams featurePadParams{true, 0, 0, 0}; - DataCopyPadExtParams weightPadParams{true, 0, 0, 0}; - DataCopyPadExtParams indicesPadParams{true, 0, 0, 0}; - - LocalTensor featuresLocal = featuresQueue.AllocTensor(); - LocalTensor indicesLocal = indicesQueue.AllocTensor(); - LocalTensor weightLocal = weightQueue.AllocTensor(); - - LocalTensor mulTemp = mulTmpUB.Get(); - LocalTensor sumTemp = sumTmpUB.Get(); - LocalTensor workLocal = workUB.Get(); - LocalTensor outTmpLocal = outTmpUB.Get(); - - LocalTensor outIndicesTemp = outIndicesUB.Get(); - LocalTensor outIndicesPairTemp = outIndicesPairUB.Get(); - - DTYPE_INDICES onesVal = -1; - Duplicate(outIndicesTemp, onesVal, moveLen * kernelSize); - - event_t eventIDSToMTE2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE2)); - event_t eventIDSToMTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3)); - event_t eventIDMTE2ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_V)); - event_t eventIDMTE2ToMTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_MTE3)); - event_t eventIDMTE2ToS = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S)); - event_t eventIDVToS = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_S)); - event_t eventIDMTE3ToMTE2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); - - SetFlag(eventIDSToMTE2); - WaitFlag(eventIDSToMTE2); - DataCopyPad(indicesLocal, indicesGm[taskOffset * 4], indicesCopyParams, indicesPadParams); - pipe_barrier(PIPE_MTE2); - - for (uint32_t i = 0; i < forMoveLen; i++) { - // GetValue feature's locations - int32_t idxOffset = i * 4; - int32_t featureB = indicesLocal.GetValue(idxOffset); - int32_t featureD = indicesLocal.GetValue(idxOffset + 1) * strideDepth - 2 * paddingDepth + outputPaddingDepth; - int32_t featureH = indicesLocal.GetValue(idxOffset + 2) * strideHeight - 2 * paddingHeight + outputPaddingHeight; - int32_t featureW = indicesLocal.GetValue(idxOffset + 3) * strideWidth - 2 * paddingWidth + outputPaddingWidth; - int32_t beginOutputOffset = featureB * outputDepth * outputHeight * outputWidth + featureD * outputHeight * outputWidth + featureH * outputWidth + featureW; - SetFlag(eventIDSToMTE2); - WaitFlag(eventIDSToMTE2); - DataCopyPad(featuresLocal, featuresGm[(taskOffset + i) * kernelIC], featureCopyParams, featurePadParams); - - for (int32_t ix = 0; ix < kernelD; ix++) { - uint32_t xOffset = (uint32_t)ix * dilationDepth * outputHeight * outputWidth; - for (int32_t iy = 0; iy < kernelH; iy++) { - uint32_t yOffset = (uint32_t)iy * dilationHeight * outputWidth; - for (int32_t iz = 0; iz < kernelW; iz++) { - uint32_t zOffset = (uint32_t)iy * dilationWidth; - uint32_t gmOutValueOffset = beginOutputOffset + xOffset + yOffset + zOffset; - uint32_t convOffset = kernelSize - 1 - (ix * kernelH * kernelW + iy * kernelW + iz); - uint32_t weightOffset = convOffset * kernelIC * kernelOC; - SetFlag(eventIDSToMTE2); - WaitFlag(eventIDSToMTE2); - DataCopyPad(weightLocal, weightGm[weightOffset], weightCopyParams, weightPadParams); - SetFlag(eventIDMTE2ToV); - WaitFlag(eventIDMTE2ToV); - for (int32_t oc = 0; oc < kernelOC; oc++) { - Mul(mulTemp, featuresLocal, weightLocal[oc * kernelICAlign], kernelICAlign); - pipe_barrier(PIPE_V); - ReduceSum(sumTemp, mulTemp, workLocal, kernelICAlign); - SetFlag(eventIDVToS); - WaitFlag(eventIDVToS); - outTmpLocal.SetValue(oc, sumTemp.GetValue(0)); - } - int64_t outFeatureOffset = ((taskOffset + i) * kernelSize + convOffset) * kernelOC; - int64_t outInidcesOffset = i * kernelSize + convOffset; - int64_t outInidcesPairOffset = (i * kernelSize + convOffset) * 4; - SetFlag(eventIDSToMTE3); - WaitFlag(eventIDSToMTE3); - DataCopyPad(outputFeatureGm[outFeatureOffset], outTmpLocal, outCopyParams); - outIndicesTemp.SetValue(outInidcesOffset, gmOutValueOffset); - outIndicesPairTemp.SetValue(outInidcesPairOffset, featureB); - outIndicesPairTemp.SetValue(outInidcesPairOffset + 1, featureD + ix * dilationDepth); - outIndicesPairTemp.SetValue(outInidcesPairOffset + 2, featureH + iy * dilationHeight); - outIndicesPairTemp.SetValue(outInidcesPairOffset + 3, featureW + iy * dilationWidth); - } - } - } - pipe_barrier(PIPE_ALL); - } - DataCopyPad(outputIndicesGm[taskOffset * kernelSize], outIndicesTemp, outIndicesCopyParams); - DataCopyPad(outputIndicesPairGm[taskOffset * kernelSize * 4], outIndicesPairTemp, outPairCopyParams); - featuresQueue.FreeTensor(featuresLocal); - indicesQueue.FreeTensor(indicesLocal); - weightQueue.FreeTensor(weightLocal); - } - __aicore__ inline void calculateReduceSum() - { - mulmask = maskAlign; - if (mulmask > kernelIC) { - mulmask = kernelIC; - } - mulRepeatTimes = AlignUp(kernelIC, mulmask); - workSize = AlignUp(mulRepeatTimes, dataAlign); - } - __aicore__ inline uint32_t Max(int32_t a, int32_t b) - { - if (a > b) return a; - return b; - } - __aicore__ inline uint32_t Min(int32_t a, int32_t b) - { - if (a > b) return b; - return a; - } - -private: -// Private Member - TPipe *pipe; - GlobalTensor featuresGm, weightGm, outputFeatureGm; - GlobalTensor indicesGm, outputIndicesGm, outputIndicesPairGm; - - TQue featuresQueue, indicesQueue, weightQueue; - TBuf mulTmpUB, sumTmpUB, workUB, outTmpUB, outIndicesUB, outIndicesPairUB; - - uint32_t usedCoreNum; - uint32_t coreTask; - uint32_t lastCoreTask; - - uint32_t moveLen; - - uint32_t repeatTimes; - uint32_t moveTail; - uint32_t lastRepeatTimes; - uint32_t lastMoveTail; - - uint32_t kernelOC; - uint32_t kernelIC; - uint32_t kernelD; - uint32_t kernelH; - uint32_t kernelW; - uint32_t kernelSize; - - uint32_t outfeatureB; - uint32_t outputDepth; - uint32_t outputHeight; - uint32_t outputWidth; - - uint32_t strideDepth; - uint32_t strideHeight; - uint32_t strideWidth; - - uint32_t paddingDepth; - uint32_t paddingHeight; - uint32_t paddingWidth; - - uint32_t dilationDepth; - uint32_t dilationHeight; - uint32_t dilationWidth; - - uint32_t outputPaddingDepth; - uint32_t outputPaddingHeight; - uint32_t outputPaddingWidth; - - uint32_t blockBytes{32}; - uint32_t curBlockIdx; - uint32_t dataAlign; - uint32_t taskNum; - uint32_t coreRepeatTimes; - uint32_t coreMoveTail; - uint32_t kernelICAlign; - uint32_t maskAlign; - uint32_t mulmask; - uint32_t mulRepeatTimes; - uint32_t workSize; -}; -extern "C" __global__ __aicore__ void sparse_inverse_conv3d(GM_ADDR features, GM_ADDR indices, GM_ADDR weight, GM_ADDR feature_out, GM_ADDR indices_out, GM_ADDR indices_pair, GM_ADDR workspace, GM_ADDR tiling) { - SetSysWorkspace(workspace); - GET_TILING_DATA(tiling_data, tiling); - TPipe pipe; - KernelSparseInverseConv3d op; - op.Init(features, indices, weight, feature_out, indices_out, indices_pair, workspace, &tiling_data, &pipe); - op.Process(); -} diff --git a/mx_driving/_C/__init__.pyi b/mx_driving/_C/__init__.pyi index 7876d7be..e27127bb 100644 --- a/mx_driving/_C/__init__.pyi +++ b/mx_driving/_C/__init__.pyi @@ -196,19 +196,6 @@ def npu_sparse_conv3d( outSpatialShape: Tuple[int, int, int], batch_size: int, ) -> Tuple[torch.Tensor, torch.Tensor]: ... -def npu_sparse_inverse_conv3d( - feature: torch.Tensor, - indices: torch.Tensor, - weight: torch.Tensor, - kernel_size: Tuple[int, int, int], - stride: Tuple[int, int, int], - padding: Tuple[int, int, int], - dilation: Tuple[int, int, int], - output_padding: Tuple[int, int, int], - out_channel: int, - outSpatialShape: Tuple[int, int, int], - batch_size: int, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ... def npu_sparse_conv3d_grad( indices_offset: torch.Tensor, former_sorted_indices: torch.Tensor, diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py index 213f811f..2030cce9 100644 --- a/mx_driving/__init__.py +++ b/mx_driving/__init__.py @@ -1,7 +1,6 @@ __all__ = [ "RoIPointPool3d", "SparseConv3d", - "SparseInverseConv3d", "SubMConv3d", "SparseConvTensor", "SparseModule", @@ -66,7 +65,7 @@ import os import mx_driving._C from .modules.roi_point_pool_3d import RoIPointPool3d -from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d +from .modules.sparse_conv import SparseConv3d, SubMConv3d from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential from .modules.voxelization import Voxelization from .ops.assign_score_withk import assign_score_withk diff --git a/mx_driving/csrc/SparseInverseConv3d.cpp b/mx_driving/csrc/SparseInverseConv3d.cpp deleted file mode 100644 index e4ad5023..00000000 --- a/mx_driving/csrc/SparseInverseConv3d.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2024 Huawei Technologies Co., Ltd -// Copyright (c) 2019, Facebook CORPORATION. -// All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "csrc/OpApiCommon.h" -#include "csrc/functions.h" - -std::tuple npu_sparse_inverse_conv3d(const at::Tensor& feature, - const at::Tensor& indices, const at::Tensor& weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, - at::IntArrayRef padding, at::IntArrayRef dilation, at::IntArrayRef output_padding, int out_channel, - at::IntArrayRef outSpatialShape, int batch_size) -{ - // check Tensor Device is NPU - TORCH_CHECK_NPU(feature); - TORCH_CHECK_NPU(indices); - TORCH_CHECK_NPU(weight); - - // Calculate kernelSize - int64_t kernelsum = 1; - for (int32_t i = 0; i < kernel_size.size(); i++) { - kernelsum *= kernel_size[i]; - } - // to create memory of teh output - auto indices_size = indices.sizes(); - int64_t outputsum = indices_size[0] * kernelsum; - c10::SmallVector output_size = {outputsum, out_channel}; - c10::SmallVector indices_out_size = {outputsum}; - c10::SmallVector indices_pairs_size = {outputsum, indices_size[1]}; - - at::Tensor out = at::empty(output_size, feature.options()); - at::Tensor indices_out = at::empty(indices_out_size, indices.options()).fill_(-1); - at::Tensor indices_pairs = at::empty(indices_pairs_size, indices.options()).fill_(-1); - - c10::SmallVector spatial_size = { - batch_size, outSpatialShape[0], outSpatialShape[1], outSpatialShape[2], out_channel}; - at::IntArrayRef outputShape = at::IntArrayRef(spatial_size); - // weight [,,,in_channels, out_channels] -> [,,,out_channels, in_channels] - at::Tensor weight_trans = weight.transpose(-1, -2).contiguous(); - EXEC_NPU_CMD(aclnnSparseInverseConv3d, feature, indices, weight_trans, outputShape, stride, padding, dilation, - output_padding, out, indices_out, indices_pairs); - return std::tie(out, indices_pairs, indices_out); -} diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp index 04bb3285..dc4f5b2e 100644 --- a/mx_driving/csrc/pybind.cpp +++ b/mx_driving/csrc/pybind.cpp @@ -184,9 +184,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // npu_sparse_conv3d m.def("npu_sparse_conv3d", &npu_sparse_conv3d); - // npu_sparse_inverse_conv3d - m.def("npu_sparse_inverse_conv3d", &npu_sparse_inverse_conv3d); - // multi_to_sparse m.def("multi_to_sparse", &multi_to_sparse); diff --git a/mx_driving/modules/sparse_conv.py b/mx_driving/modules/sparse_conv.py index 62ee6b24..420ed789 100644 --- a/mx_driving/modules/sparse_conv.py +++ b/mx_driving/modules/sparse_conv.py @@ -186,29 +186,7 @@ class SparseConvolution(SparseModule): def forward(self, input_): if not isinstance(input_, SparseConvTensor): raise RuntimeError("input_ is not SparseConvTensor") - if self.inverse: - out_spatial_shape = get_inverse_conv_output_size( - input_.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding - ) - out_spatial_shape = [int(i) for i in out_spatial_shape] - if not isinstance(out_spatial_shape, list): - out_spatial_shape = out_spatial_shape.tolist() - out_features, outidx = Fsp.indice_inverse_conv( - input_.features, - input_.indices, - self.weight, - out_spatial_shape, - self.out_channels, - input_.batch_size, - self.kernel_size, - self.stride, - self.padding, - self.dilation, - self.output_padding, - self.groups, - self.bias, - ) - elif not self.subm: + if not self.subm: out_spatial_shape = get_conv_output_size( input_.spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation ) @@ -230,7 +208,7 @@ class SparseConvolution(SparseModule): self.bias, ) else: - + out_spatial_shape = input_.spatial_shape out_spatial_shape = [int(i) for i in out_spatial_shape] if not isinstance(out_spatial_shape, list): @@ -335,34 +313,3 @@ class SubMConv3d(SparseConvolution): mode=mode, ) - -class SparseInverseConv3d(SparseConvolution): - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - inverse=True, - indice_key=None, - mode="mmcv", - ): - super().__init__( - 3, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - bias, - subm=False, - inverse=True, - indice_key=indice_key, - mode=mode, - ) diff --git a/mx_driving/ops/sparse_functional.py b/mx_driving/ops/sparse_functional.py index b7018ed8..5a1906e5 100644 --- a/mx_driving/ops/sparse_functional.py +++ b/mx_driving/ops/sparse_functional.py @@ -76,69 +76,6 @@ class SparseConvFunction(Function): return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None -class SparseInverseConvFunction(Function): - - @staticmethod - # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward( - ctx: Any, - features, - indices, - weight, - out_spatial_shape, - out_channels, - batch_size, - kernel_size, - stride, - padding, - dilation, - output_padding, - groups, - bias, - ) -> torch.Tensor: - device = features.device - weight = weight.data - # calculate the index pair - out_features, outidx_pair, ouidx_offset = mx_driving._C.npu_sparse_inverse_conv3d( - features, - indices, - weight, - kernel_size, - stride, - padding, - dilation, - output_padding, - out_channels, - out_spatial_shape, - batch_size, - ) - # sort and nonezero - to_insert = torch.tensor(-1).to(device) - sorted_idx, sorted_idx_to_former_indices = torch.sort(ouidx_offset.view(torch.float32)) - new_sorted_idx = torch.cat((to_insert.view(1), sorted_idx.view(torch.int32)), 0) - new_sorted_idx_2 = torch.cat((sorted_idx.view(torch.int32), to_insert.view(1)), 0) - sub_result = new_sorted_idx - new_sorted_idx_2 - unique_indices_offset = torch.nonzero(sub_result != 0) - # matmul - out_features, outidx = mx_driving._C.multi_to_sparse( - out_features, unique_indices_offset.int(), sorted_idx_to_former_indices.int(), outidx_pair.int() - ) - outidx, outidx_ = torch.chunk(outidx, 2, dim=1) - - ctx.save_for_backward(features, weight, sorted_idx_to_former_indices.int(), unique_indices_offset.int()) - return out_features, outidx - - @staticmethod - @once_differentiable - # pylint: disable=too-many-return-values - def backward(ctx: Any, grad_out_features: torch.Tensor, grad_outidx=None) -> tuple: - features, weight, sorted_idx_to_former_indices, unique_indices_offset = ctx.saved_tensors - feature_grad, weight_grad = mx_driving._C.npu_sparse_conv3d_grad( - unique_indices_offset, sorted_idx_to_former_indices, features, weight, grad_out_features - ) - return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None - - class SubMConvFunction(Function): @staticmethod @@ -199,8 +136,8 @@ class SubMConvFunction(Function): weight = weight.permute(0, 1, 2, 4, 3).contiguous() weight_permute = weight.view(kernel_num*weight_shape[4], weight_shape[3]) feature_grad = grad_out_features_iml2col @ weight_permute - - + + return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None @@ -254,11 +191,10 @@ class SubMConvWithKeyFunction(Function): weight = weight.permute(0, 1, 2, 4, 3).contiguous() weight_permute = weight.view(kernel_num*weight_shape[4], weight_shape[3]) feature_grad = grad_out_features_iml2col @ weight_permute - - + + return feature_grad, None, weight_grad, None, None, None, None, None, None, None, None, None, None indice_conv = SparseConvFunction.apply -indice_inverse_conv = SparseInverseConvFunction.apply indice_subm_conv = SubMConvFunction.apply -indice_subm_conv_with_key = SubMConvWithKeyFunction.apply \ No newline at end of file +indice_subm_conv_with_key = SubMConvWithKeyFunction.apply diff --git a/mx_driving/spconv.py b/mx_driving/spconv.py index c9e9ba98..15df8265 100644 --- a/mx_driving/spconv.py +++ b/mx_driving/spconv.py @@ -1,6 +1,6 @@ import warnings -from .modules.sparse_conv import SparseConv3d, SparseInverseConv3d, SubMConv3d +from .modules.sparse_conv import SparseConv3d, SubMConv3d from .modules.sparse_modules import SparseConvTensor, SparseModule, SparseSequential warnings.warn( diff --git a/tests/torch/test_sparse_inverse_conv3d.py b/tests/torch/test_sparse_inverse_conv3d.py deleted file mode 100644 index 95bcaca0..00000000 --- a/tests/torch/test_sparse_inverse_conv3d.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2024, Huawei Technologies.All rights reserved. - -"""Compare results between different algos: -CPU: simple gather-mm-scatter -Native: Fused gather-mm-scatter -ImplicitGemm: implicit gemm -""" - -import time -from pathlib import Path - -import numpy as np -import torch -import torch_npu -from torch import nn -from mx_driving.spconv import SparseSequential, SparseConvTensor, SparseInverseConv3d - - -def generate_sparse_data(shape, - num_points, - num_channels, - integer=False, - data_range=(-1, 1), - with_dense=True, - dtype=np.float32, - shape_scale=1): - dense_shape = shape - ndim = len(dense_shape) - # num_points = np.random.randint(10, 100, size=[batch_size, ndim]) - num_points = np.array(num_points) - # num_points = np.array([3, 2]) - batch_size = len(num_points) - batch_indices = [] - coors_total = np.stack(np.meshgrid(*[np.arange(0, s // shape_scale) for s in shape]), - axis=-1) - coors_total = coors_total.reshape(-1, ndim) * shape_scale - for i in range(batch_size): - np.random.shuffle(coors_total) - inds_total = coors_total[:num_points[i]] - inds_total = np.pad(inds_total, ((0, 0), (0, 1)), - mode="constant", - constant_values=i) - batch_indices.append(inds_total) - if integer: - sparse_data = np.random.randint(data_range[0], - data_range[1], - size=[num_points.sum(), - num_channels]).astype(dtype) - else: - sparse_data = np.random.uniform(data_range[0], - data_range[1], - size=[num_points.sum(), - num_channels]).astype(dtype) - - # sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1) - - res = { - "features": sparse_data.astype(dtype), - } - if with_dense: - dense_data = np.zeros([batch_size, num_channels, *dense_shape], - dtype=sparse_data.dtype) - start = 0 - for i, inds in enumerate(batch_indices): - for j, ind in enumerate(inds): - dense_slice = (i, slice(None), *ind[:-1]) - dense_data[dense_slice] = sparse_data[start + j] - start += len(inds) - res["features_dense"] = dense_data.astype(dtype) - batch_indices = np.concatenate(batch_indices, axis=0) - res["indices"] = batch_indices.astype(np.int32) - return res - - -class Net(nn.Module): - def __init__(self, shape): - super().__init__() - self.net = SparseSequential( - SparseInverseConv3d(16, 32, 3) - ) - max_batch_size = 1 - self.shape = shape - - def forward(self, features, coors, batch_size): - x = SparseConvTensor(features, - coors, - self.shape, - batch_size) - return self.net(x) - - -def _test_multi_impl(spatial_shape, feature_num, dtype: torch.dtype): - - np.random.seed(50051) - - spatial_shape = [4, 4, 4] - sparse_dict = generate_sparse_data(spatial_shape, [feature_num] * 1, 16) - - voxels = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32) - coors = np.ascontiguousarray( - sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) - device = torch.device("npu:0") - - voxels_th_npu = torch.from_numpy(voxels).to(device).to(dtype) - - coors_th_npu = torch.from_numpy(coors).to(device) - net_cls = Net - # npu - torch.manual_seed(50051) - net_native_npu = net_cls(spatial_shape).to(device).to(dtype) - - out = net_native_npu(voxels_th_npu, coors_th_npu, 1) - - -def test_multi_impl(): - _test_multi_impl([4, 4, 4], 3, torch.float32) - _test_multi_impl([7, 7, 7], 9, torch.float32) - _test_multi_impl([12, 13, 14], 100, torch.float32) - _test_multi_impl([25, 25, 25], 400, torch.float32) - - -# if __name__ == "__main__": -# test_multi_impl() \ No newline at end of file -- Gitee