diff --git a/docs/api/README.md b/docs/api/README.md
index d335b44ed1298c295ad053e04c7e3a35e97c8b57..5abc02091d6418491d2b10426abeea56424e4bac 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -54,7 +54,7 @@
Y |
- 采样 |
+ 采样 |
roipoint_pool3d |
Y |
@@ -94,6 +94,10 @@
grid_sampler2d_v2 |
N |
+
+ npu_unique |
+ N |
+
体素化 |
voxelization |
diff --git a/docs/api/context/npu_unique.md b/docs/api/context/npu_unique.md
new file mode 100644
index 0000000000000000000000000000000000000000..843be385f4622ca7b453e2413f3b2c8fcca023e8
--- /dev/null
+++ b/docs/api/context/npu_unique.md
@@ -0,0 +1,37 @@
+## npu_unique[beta]
+
+### Interface Prototype
+
+```python
+mx_driving.npu_unique(Tensor input) -> Tensor
+```
+
+### Function Description
+
+Sorts the input in ascending order and removes duplicates: given an input `tensor`, its elements are sorted and duplicate values are dropped.
+
+### Parameters
+
+- `input(Tensor)`: the input tensor. Supported data types: `float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`. The shape may be arbitrary, with 1 to 8 dimensions.
+
+### Return Value
+
+- `output(Tensor)`: the output tensor. Supported data types: `float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`, identical to the dtype of `input`. The shape is 1-dimensional.
+
+### Constraints
+
+- For `int32` and `int64` inputs, each element must lie within [-16777216, 16777216] (±2^24); values outside this range may suffer precision loss.
+
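+This constraint stems from the kernel sorting values internally as `float32`, whose 24-bit mantissa can represent integers exactly only up to 2^24. A minimal illustration (the values below are chosen only for demonstration):
+
+```python
+import torch
+
+# 16777217 = 2^24 + 1 is not representable in float32 and collapses onto 16777216
+# after a round trip through float, which is why larger magnitudes lose precision.
+x = torch.tensor([16777216, 16777217], dtype=torch.int32)
+print(x.to(torch.float32).to(torch.int32))  # tensor([16777216, 16777216], dtype=torch.int32)
+```
+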
+### Supported Products
+
+- Atlas A2 training series products
+
+### Example
+
+```python
+import torch, torch_npu
+from mx_driving import npu_unique
+
+rand_tensor = torch.randint(-16777216, 16777216, (559794,), dtype=torch.int64)
+output = npu_unique(rand_tensor.npu())
+```
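+
+As a sanity check, the result can be compared against `torch.unique`, which also returns sorted, de-duplicated values (a minimal sketch; the tensor size here is arbitrary):
+
+```python
+import torch, torch_npu
+from mx_driving import npu_unique
+
+x = torch.randint(-256, 256, (100000,), dtype=torch.int32)
+expected = torch.unique(x)            # sorted unique values computed on CPU
+actual = npu_unique(x.npu()).cpu()
+assert torch.equal(expected, actual)
+```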
diff --git a/include/csrc/functions.h b/include/csrc/functions.h
index bdc71908ac93a3cc5ca4493b65a108a69eb1da45..e2d81d8052c4c60e792330feef01fe83cafbae8e 100644
--- a/include/csrc/functions.h
+++ b/include/csrc/functions.h
@@ -288,6 +288,7 @@ std::tuple calc_poly_start_end_sl(const at::
at::Tensor npu_subm_sparse_conv3d_with_key(const at::Tensor& ouidx_offset, const at::Tensor& valid_indices,
const at::Tensor& weight, const at::Tensor& feature, int indices_number,
at::IntArrayRef kernel_size);
+at::Tensor npu_unique(const at::Tensor& input);
at::Tensor min_area_polygons(const at::Tensor& pointsets);
#endif // CSRC_FUNCTIONS_H_
diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bc04d9e1fbd55abaf4db798decae5eb5bdfbe068
--- /dev/null
+++ b/kernels/op_host/unique.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
+ *
+ */
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "unique_tiling.h"
+constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024;
+
+
+namespace optiling {
+static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context)
+{
+ if (!context) {
+ return ge::GRAPH_FAILED;
+ }
+ UniqueTilingData tiling;
+
+ constexpr uint16_t tileLength = 8192;
+ const gert::StorageShape* inputShape = context->GetInputShape(0);
+ if (!inputShape) {
+ return ge::GRAPH_FAILED;
+ }
+ const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum();
+ uint32_t totalLength = 1;
+ for (int i = 0; i < dimNum; i++) {
+ totalLength *= inputShape->GetStorageShape().GetDim(i);
+ }
+ const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength;
+ const uint16_t tailLength = totalLength % tileLength;
+ const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
+ const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv();
+ const uint8_t blockNum = tileNum >= aivNum ? aivNum : tileNum;
+ const uint32_t shortBlockTileNum = tileNum / blockNum;
+ const uint8_t longBlockNum = tileNum % blockNum;
+ const uint8_t shortBlockNum = blockNum - longBlockNum;
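+    // Worked example (hypothetical sizes): totalLength = 600000 -> tileNum = 74, tailLength = 1984;
+    // with aivNum = 40 -> blockNum = 40, shortBlockTileNum = 1, longBlockNum = 34, shortBlockNum = 6,
+    // i.e. 6 blocks process 1 tile each and 34 blocks process 2 tiles each (6*1 + 34*2 = 74).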
+
+ tiling.set_totalLength(totalLength);
+ tiling.set_tileNum(tileNum);
+ tiling.set_shortBlockTileNum(shortBlockTileNum);
+ tiling.set_tailLength(tailLength);
+ tiling.set_blockNum(blockNum);
+ tiling.set_shortBlockNum(shortBlockNum);
+
+ context->SetBlockDim(blockNum);
+ if (context->GetRawTilingData() == nullptr) {
+ return ge::GRAPH_FAILED;
+ }
+ tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+ context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+    // Workspace: sync area for up to 8 rounds of IBSet/IBWait, plus two full copies of the data in sort format.
+ uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize();
+ auto&& currentWorkspace = context->GetWorkspaceSizes(1);
+ if (currentWorkspace == nullptr) {
+ return ge::GRAPH_FAILED;
+ }
+ size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2;
+ currentWorkspace[0] = usrSize + sysWorkspaceSize;
+ return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+
+namespace ge {
+static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context)
+{
+ const gert::Shape* x1_shape = context->GetInputShape(0);
+ gert::Shape* y_shape = context->GetOutputShape(0);
+ if (!x1_shape || !y_shape) {
+ return GRAPH_FAILED;
+ }
+ *y_shape = *x1_shape;
+ return GRAPH_SUCCESS;
+}
+
+static ge::graphStatus UniqueInferDtype(gert::InferDataTypeContext* context)
+{
+ auto inputDtype = context->GetInputDataType(0);
+ context->SetOutputDataType(0, inputDtype);
+ return ge::GRAPH_SUCCESS;
+}
+} // namespace ge
+
+
+namespace ops {
+class Unique : public OpDef {
+public:
+ explicit Unique(const char* name) : OpDef(name)
+ {
+ this->Input("input")
+ .ParamType(REQUIRED)
+ .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64})
+ .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND})
+ .IgnoreContiguous();
+ this->Output("output")
+ .ParamType(REQUIRED)
+ .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64})
+ .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
+ this->Output("uniqueCnt")
+ .ParamType(REQUIRED)
+ .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32})
+ .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND});
+
+ this->SetInferShape(ge::UniqueInferShape);
+ this->SetInferDataType(ge::UniqueInferDtype);
+
+ this->AICore().SetTiling(optiling::UniqueTilingFunc);
+ this->AICore().AddConfig("ascend910b");
+ this->AICore().AddConfig("ascend910_93");
+ }
+};
+
+OP_ADD(Unique);
+} // namespace ops
diff --git a/kernels/op_host/unique_tiling.h b/kernels/op_host/unique_tiling.h
new file mode 100644
index 0000000000000000000000000000000000000000..aed40f6f6f9895b83637ac2e3920a328e9c0303f
--- /dev/null
+++ b/kernels/op_host/unique_tiling.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
+ *
+ */
+#include "register/tilingdata_base.h"
+
+namespace optiling {
+BEGIN_TILING_DATA_DEF(UniqueTilingData)
+ TILING_DATA_FIELD_DEF(uint32_t, totalLength);
+ TILING_DATA_FIELD_DEF(uint32_t, tileNum);
+ TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum);
+ TILING_DATA_FIELD_DEF(uint16_t, tailLength);
+ TILING_DATA_FIELD_DEF(uint8_t, blockNum);
+ TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum);
+END_TILING_DATA_DEF;
+
+REGISTER_TILING_DATA_CLASS(Unique, UniqueTilingData)
+}
diff --git a/kernels/op_kernel/unique.cpp b/kernels/op_kernel/unique.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e9cedcf5172f6cbbe5332fdbe9e5f15c10e506f0
--- /dev/null
+++ b/kernels/op_kernel/unique.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ */
+
+#include "kernel_operator.h"
+#include "unique.h"
+
+
+extern "C" __global__ __aicore__ void unique(
+ GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, GM_ADDR tiling) {
+ GM_ADDR usrWorkspace = AscendC::GetUserWorkspace(workspace);
+ GET_TILING_DATA(tiling_data, tiling);
+ TPipe pipe;
+    KernelUnique<DTYPE_INPUT> op(pipe);
+ op.Init(input,
+ output,
+ uniqueCnt,
+ usrWorkspace,
+ tiling_data.totalLength,
+ tiling_data.tileNum,
+ tiling_data.shortBlockTileNum,
+ tiling_data.tailLength,
+ tiling_data.blockNum,
+ tiling_data.shortBlockNum);
+ op.Process();
+}
diff --git a/kernels/op_kernel/unique.h b/kernels/op_kernel/unique.h
new file mode 100644
index 0000000000000000000000000000000000000000..9470d73b2bda7fe62f29a8feb246eba94f55e834
--- /dev/null
+++ b/kernels/op_kernel/unique.h
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
+ */
+
+#include "kernel_operator.h"
+using namespace AscendC;
+
+
+namespace AscendC {
+template <typename Ta, typename Tb>
+__aicore__ inline Ta min(const Ta a, const Tb b)
+{
+ if (a > b) {
+ return b;
+ }
+ return a;
+}
+
+template <typename Ta, typename Tb>
+__aicore__ inline Ta max(const Ta a, const Tb b)
+{
+ if (a < b) {
+ return b;
+ }
+ return a;
+}
+
+template <typename T>
+class KernelUnique {
+public:
+ __aicore__ inline KernelUnique(TPipe& pipe) : pipe(pipe) {}
+    // Each block processes a different part of the data. This function returns the element-wise start index for a given blockIdx.
+ __aicore__ inline size_t GetGlobalOffset(const uint32_t blockIdx);
+ __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace,
+ const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum,
+ const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum);
+ __aicore__ inline void Process();
+
+private:
+ __aicore__ inline void CopyIn(const int32_t progress);
+ __aicore__ inline void Elem32Sort(const int32_t progress);
+ __aicore__ inline void TileSort(const int32_t progress);
+    template <typename T1>
+    __aicore__ inline static void DataCopyGM2GM(const GlobalTensor<T1>& dst, const GlobalTensor<T1>& src,
+        const LocalTensor<T1>& tmpLocal, const int elemLength, const int bufByteLength);
+    using GMSSrcList = GlobalTensor<float> (&)[4];
+ struct GMSParams {
+ int (&GMSLengths)[4];
+ uint8_t& queNum;
+        LocalTensor<float> (&&buffLocal)[5];
+ };
+    __aicore__ inline static void MrgSortGM(GlobalTensor<float>&& dstGlobal, GMSSrcList& srcList, GMSParams& params);
+ __aicore__ inline void BlockSortV2();
+ __aicore__ inline void GlobalSortV2();
+    __aicore__ inline static void ConsecutiveUnique(const LocalTensor<float>& dstVal,
+        const LocalTensor<float>& srcLocal, const LocalTensor<float>& shiftedLocal,
+        const LocalTensor<uint32_t>& bitMask32, const uint16_t elemLength, uint64_t& tileUniqueCnt);
+ __aicore__ inline void TileUnique(const int32_t progress);
+ __aicore__ inline void CopyOut();
+
+private:
+ static constexpr int32_t TILE_LENGTH = 8192;
+ // INF to fill the tail blank, so that tail is automatically removed by Compare in Unique.
+ static constexpr float FLOAT_INF = 3e+99;
+ // Indicates the factor converting float to data structure used by Sort32&MrgSort.
+ static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8
+ static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2
+ static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536
+ static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384
+ static constexpr uint16_t VALID_QUE[5] = {
+ 0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort.
+
+ TPipe& pipe;
+ TQue calcBuf[3];
+
+    GlobalTensor<T> srcGlobal;
+    GlobalTensor<uint32_t> srcGlobalAsUint;
+    GlobalTensor<T> dstGlobal1;
+    GlobalTensor<int32_t> dstGlobal1As32;
+    GlobalTensor<int32_t> uniqueCntGlobal;
+
+    GlobalTensor<float> sortedBlock1;
+    GlobalTensor<int32_t> sortedBlock1AsInt;
+    GlobalTensor<float> sortedBlock2;
+    GlobalTensor<int32_t> sortedBlock2AsInt;
+    GlobalTensor<float> sortedGlobal1;
+    GlobalTensor<float> sortedGlobal2;
+
+    GlobalTensor<int32_t> IBSyncGlobal;
+    GlobalTensor<uint32_t> blockUniqueCntGlobal;
+
+ uint16_t syncWorkspaceSize;
+ uint8_t eventID {0};
+ uint64_t accUniqueCnt {0};
+ float lastTileUniqueVal;
+
+ uint32_t totalLength;
+ uint32_t alignedTotalLength;
+ uint32_t tileNum;
+ uint32_t shortBlockTileNum;
+ uint16_t tailLength;
+ uint8_t blockNum;
+ uint8_t shortBlockNum;
+
+ size_t globalOffset; // Offset of data for current block.
+ size_t blockLength; // Length of current block.
+ bool hasInfFlag {false};
+};
+
+// Each block processes a different part of the data. This function returns the element-wise start index for a given blockIdx.
+template <typename T>
+__aicore__ inline size_t KernelUnique<T>::GetGlobalOffset(const uint32_t blockIdx)
+{
+ // (shortBlockTileNum + 1) indicates longBlockTileNum.
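+    // E.g. (hypothetical sizes) shortBlockTileNum = 1, shortBlockNum = 6, blockIdx = 10:
+    // offset = (1 * 6 + 2 * (10 - 6)) * TILE_LENGTH, i.e. blocks 0-5 hold one tile each and
+    // blocks 6-9 hold two tiles each before this block.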
+ const size_t offset =
+ (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) +
+ (this->shortBlockTileNum + 1) * (this->shortBlockNum >= blockIdx ? 0 : blockIdx - this->shortBlockNum)) *
+ TILE_LENGTH;
+ return offset;
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace,
+ const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum,
+ const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum)
+{
+ this->totalLength = totalLength;
+ this->alignedTotalLength = totalTileNum * TILE_LENGTH;
+ this->shortBlockTileNum = shortBlockTileNum;
+ this->tailLength = tailLength;
+ this->blockNum = blockNum;
+ this->shortBlockNum = shortBlockNum;
+
+ const bool isShortBlock = this->shortBlockNum > GetBlockIdx();
+ // (shortBlockTileNum + 1) indicates longBlockTileNum.
+ this->tileNum = isShortBlock ? shortBlockTileNum : shortBlockTileNum + 1;
+ this->blockLength = this->tileNum * TILE_LENGTH;
+ this->globalOffset = GetGlobalOffset(GetBlockIdx());
+
+ srcGlobal.SetGlobalBuffer((__gm__ T*)input + globalOffset, this->blockLength);
+ srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t*)input + globalOffset * sizeof(T) / sizeof(uint32_t),
+ this->blockLength * sizeof(T) / sizeof(uint32_t));
+ dstGlobal1.SetGlobalBuffer((__gm__ T*)output, this->alignedTotalLength);
+ dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t*)output, this->alignedTotalLength * sizeof(T) / sizeof(int32_t));
+ uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t*)uniqueCnt, 1);
+
+    // sortedBlock is offset, so it only sees the data this block should process.
+ sortedBlock1.SetGlobalBuffer((__gm__ float*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR,
+ this->blockLength * SORT_DATATYPE_SIZE_FACTOR);
+ sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR,
+ this->blockLength * SORT_DATATYPE_SIZE_FACTOR);
+ sortedBlock2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR +
+ globalOffset * SORT_DATATYPE_SIZE_FACTOR,
+ this->blockLength * SORT_DATATYPE_SIZE_FACTOR);
+ sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR +
+ globalOffset * SORT_DATATYPE_SIZE_FACTOR,
+ this->blockLength * SORT_DATATYPE_SIZE_FACTOR);
+ // sortedGlobal could see all data in the workspace.
+ sortedGlobal1.SetGlobalBuffer((__gm__ float*)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR);
+ sortedGlobal2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR,
+ alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR);
+
+    // Buffer size for synchronization, as required by the IBWait/IBSet documentation.
+ this->syncWorkspaceSize = (blockNum * 32 + 1) * 8;
+ IBSyncGlobal.SetGlobalBuffer(
+ (__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, syncWorkspaceSize);
+ blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t*)workspace + alignedTotalLength * 4 + syncWorkspaceSize,
+ (blockNum + 7) / 8 * 8); // Length aligned up to 32B.
+
+ pipe.InitBuffer(calcBuf[0], 1, TILE_LEN_BYTE);
+ pipe.InitBuffer(calcBuf[1], 1, TILE_LEN_BYTE);
+ pipe.InitBuffer(calcBuf[2], 1, TILE_LEN_BYTE);
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::Process()
+{
+ LocalTensor IBSyncLocal;
+ // Initialize sync buff.
+ if (GetBlockIdx() == 0) {
+ IBSyncLocal = calcBuf[0].AllocTensor();
+ Duplicate(IBSyncLocal, 0, syncWorkspaceSize);
+ PipeBarrier();
+ DataCopy(IBSyncGlobal, IBSyncLocal, syncWorkspaceSize);
+ PipeBarrier();
+ calcBuf[0].FreeTensor(IBSyncLocal);
+ } // Initialize sync buff.
+
+ // Sort within each tile.
+ for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) {
+ CopyIn(tileIdx);
+ Elem32Sort(tileIdx);
+ TileSort(tileIdx);
+ }
+
+ if (GetBlockNum() > 1) {
+ if (this->tileNum > 1) {
+ BlockSortV2(); // Sort within each block.
+ }
+
+ GlobalSortV2(); // Sort globally.
+
+ PipeBarrier();
+ SyncAll();
+ PipeBarrier();
+ }
+
+    // Check whether an inf value exists. If so, inf will be appended to the result in TileUnique().
+ if ((IsSameType::value || IsSameType::value || IsSameType::value) &&
+ GetBlockIdx() == blockNum - 1) {
+ PipeBarrier();
+ const uint32_t totalLength32BAligned = (totalLength * sizeof(T) + 31) / 32 * 32 / sizeof(float);
+ if (sortedGlobal1.GetValue((totalLength32BAligned - 1) * 2) == -FLOAT_INF) {
+ hasInfFlag = true;
+ }
+ }
+
+ // Do unique in each block based on tiles.
+ for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) {
+ TileUnique(tileIdx);
+ }
+
+ if (this->blockNum > 1) {
+ // Each block waits for its former block to upload blockUniqueCnt.
+ IBSyncLocal = calcBuf[0].AllocTensor();
+ if (GetBlockIdx() != 0) {
+ PipeBarrier();
+ IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx() - 1, eventID);
+ PipeBarrier();
+ }
+ PipeBarrier();
+ IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID);
+ PipeBarrier();
+ calcBuf[0].FreeTensor(IBSyncLocal);
+ }
+
+ // Gather result from every block.
+ CopyOut();
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::CopyIn(const int32_t progress)
+{
+ LocalTensor srcLocal = calcBuf[0].AllocTensor();
+ LocalTensor sortedLocal2 = calcBuf[2].AllocTensor();
+
+    // To handle the tail, fill the whole tile with INF, then overwrite it with the tail data.
+ int32_t castLen; // Valid length of the last block.
+ if ((progress != tileNum - 1) || (GetBlockIdx() != blockNum - 1) || tailLength == 0) {
+        // Must be resolved at compile time, otherwise we get a compilation error.
+ if constexpr (!IsSameType::value) {
+ DataCopy(srcLocal, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH);
+ } else {
+ DataCopy(sortedLocal2, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH);
+ }
+ castLen = TILE_LENGTH;
+ } else {
+ // Process tail.
+ LocalTensor srcAsUint = srcLocal.template ReinterpretCast();
+ Duplicate(sortedLocal2, FLOAT_INF, TILE_LENGTH);
+ PipeBarrier();
+ if constexpr (IsSameType::value) {
+ DataCopyPad(sortedLocal2, srcGlobal[progress * TILE_LENGTH],
+ {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0});
+ } else if constexpr (sizeof(T) >= sizeof(float)) {
+ DataCopyPad(srcAsUint, srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)],
+ {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0});
+ } else {
+ DataCopyPad(srcLocal, srcGlobal[progress * TILE_LENGTH],
+ {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0});
+ }
+ castLen = tailLength;
+ }
+ PipeBarrier();
+ if constexpr (!IsSameType::value) {
+ if constexpr (sizeof(T) >= sizeof(float)) {
+ Cast(sortedLocal2, srcLocal, RoundMode::CAST_ROUND, castLen);
+ } else {
+ Cast(sortedLocal2, srcLocal, RoundMode::CAST_NONE, castLen);
+ }
+ PipeBarrier();
+ }
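+    // Sort32/MrgSort sort in descending order, so values are negated here and negated back in
+    // CopyOut() to make the final output ascending.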
+ Muls(sortedLocal2, sortedLocal2, (float)-1, TILE_LENGTH);
+ calcBuf[0].EnQue(srcLocal);
+ calcBuf[2].EnQue(sortedLocal2);
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::Elem32Sort(const int32_t progress)
+{
+ LocalTensor srcLocal = calcBuf[0].DeQue();
+ LocalTensor sortedLocal1 = calcBuf[1].AllocTensor();
+ LocalTensor sortedLocal2 = calcBuf[2].DeQue();
+ LocalTensor arithLocal = srcLocal.template ReinterpretCast()[TILE_LENGTH];
+
+ int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset
+ Duplicate(arithLocal, baseOffset, TILE_LENGTH);
+ PipeBarrier();
+
+ LocalTensor uidArray = arithLocal.template ReinterpretCast();
+    // The max repeat count of Sort32 is 255, which one call would exceed since TILE_LENGTH / 32 = 256.
+ constexpr uint8_t sort32BatchSize = 32;
+ constexpr uint8_t sort32RepeatLimit = 255;
+ int instrRepeatTime = 0;
+ int restLen = TILE_LENGTH;
+ while (restLen) {
+ int repTime = min(restLen / sort32BatchSize, sort32RepeatLimit);
+ Sort32(sortedLocal1[sort32BatchSize * sort32RepeatLimit * SORT_DATATYPE_SIZE_FACTOR * instrRepeatTime],
+ sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime],
+ uidArray[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], repTime);
+ restLen -= repTime * sort32BatchSize;
+ instrRepeatTime++;
+ }
+ PipeBarrier();
+ calcBuf[0].FreeTensor(srcLocal);
+ calcBuf[1].EnQue(sortedLocal1);
+ calcBuf[2].EnQue(sortedLocal2);
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::TileSort(const int32_t progress)
+{
+ LocalTensor sortedLocal1 = calcBuf[1].DeQue();
+ LocalTensor sortedLocal2 = calcBuf[2].DeQue();
+ LocalTensor sortedQue[2] = {sortedLocal1, sortedLocal2};
+ uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32.
+ uint16_t currentQueNum = TILE_LENGTH / currentQueLength;
+ bool switchFlag = false;
+    // Run MrgSort repeatedly until the whole tile is sorted.
+ while (currentQueLength < TILE_LENGTH) {
+ const uint16_t elementLengths[4] = {currentQueLength, currentQueLength, currentQueLength, currentQueLength};
+ const uint16_t fullMrgSortTime = currentQueNum / 4;
+ if (fullMrgSortTime > 0) {
+ MrgSort4Info params = {elementLengths, false, 0b1111, fullMrgSortTime};
+ MrgSort(sortedQue[!switchFlag],
+ {sortedQue[switchFlag][0], sortedQue[switchFlag][currentQueLength * 1 * 2],
+ sortedQue[switchFlag][currentQueLength * 2 * 2], sortedQue[switchFlag][currentQueLength * 3 * 2]},
+ params);
+ PipeBarrier();
+ switchFlag = !switchFlag;
+ }
+ currentQueNum = fullMrgSortTime;
+ currentQueLength *= 4;
+ }
+ DataCopy(sortedBlock1[progress * TILE_LEN_ELEM], sortedQue[switchFlag], TILE_LEN_ELEM);
+ PipeBarrier();
+ calcBuf[1].FreeTensor(sortedLocal1);
+ calcBuf[2].FreeTensor(sortedLocal2);
+}
+
+template <typename T>
+template <typename T1>
+__aicore__ inline void KernelUnique<T>::DataCopyGM2GM(const GlobalTensor<T1>& dst, const GlobalTensor<T1>& src,
+    const LocalTensor<T1>& tmpLocal, const int elemLength, const int bufByteLength)
+{
+ // Max byte size of DataCopyPad in one repeat is 65535.
+ int bufElemLength = min(bufByteLength, 65535) / sizeof(T1);
+ int restLen = elemLength;
+ while (restLen > 0) {
+ int copyLen = min(restLen, bufElemLength);
+ DataCopyPad(tmpLocal, src[elemLength - restLen], {1, static_cast(sizeof(T1) * copyLen), 0, 0},
+ {false, 0, 0, 0});
+ PipeBarrier();
+ DataCopyPad(dst[elemLength - restLen], tmpLocal, {1, static_cast(sizeof(T1) * copyLen), 0, 0});
+ PipeBarrier();
+ restLen -= copyLen;
+ }
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::MrgSortGM(
+    GlobalTensor<float>&& dstGlobal, GMSSrcList& srcList, GMSParams& params)
+{
+ int restLen[4] {params.GMSLengths[0], params.GMSLengths[1], params.GMSLengths[2], params.GMSLengths[3]};
+ int currentHead[4] {};
+ int totalMrgLen {};
+ uint8_t queNum = params.queNum;
+    // Limited by the MrgSort API constraint and the mrgLocal size, we use a different buffer length for each queNum.
+ // mrgLocal contains 8192 elems, and MrgSort limits max 4095 elems per queue.
+ constexpr int BUFFER_LEN[5] {0, 0, 4095, 2730, 2048};
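+    // Sanity check: 4095 * 2, 2730 * 3 and 2048 * 4 all fit within the 8192-element mrgLocal,
+    // and 4095 respects the per-queue limit of MrgSort.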
+ uint16_t sortedLen[4];
+ uint16_t mrgLen[4] {};
+ while (queNum > 1) {
+ int currentBufferLen = BUFFER_LEN[queNum];
+ for (int i = 0; i < queNum; i++) {
+ mrgLen[i] = min(restLen[i], currentBufferLen);
+ }
+ // CopyIn
+ for (int i = 0; i < queNum; i++) {
+ DataCopyPad(params.buffLocal[i], srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR],
+ {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0},
+ {false, 0, 0, 0});
+ }
+ PipeBarrier();
+ // MrgSort
+ MrgSort4Info localParams {mrgLen, true, VALID_QUE[queNum], 1};
+ MrgSort(params.buffLocal[4],
+ {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, localParams);
+ PipeBarrier();
+ GetMrgSortResult(sortedLen[0], sortedLen[1], sortedLen[2], sortedLen[3]);
+ const uint16_t localMrgLen = sortedLen[0] + sortedLen[1] + sortedLen[2] + sortedLen[3];
+ // CopyOut
+ DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4],
+ {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0});
+ PipeBarrier();
+ // renew currentHead, restLen
+ totalMrgLen += localMrgLen;
+ for (int i = 0; i < queNum; i++) {
+ restLen[i] -= sortedLen[i];
+ currentHead[i] += sortedLen[i];
+ }
+ // Switch empty to tail
+ for (int i = 0; i < queNum; i++) {
+ if (restLen[i] == 0) {
+ for (int j = i; j < 3; j++) {
+ restLen[j] = restLen[j + 1];
+ currentHead[j] = currentHead[j + 1];
+ srcList[j] = srcList[j + 1];
+ }
+ restLen[3] = 0;
+ queNum--;
+                break; // Because ifExhaustedSuspension == true, at most one queue is exhausted per round.
+ }
+ }
+ }
+ // Process tail
+ for (int i = 0; i < params.queNum; i++) {
+ if (restLen[i] > 0) {
+ DataCopyGM2GM(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR],
+ srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4],
+ restLen[i] * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE);
+ break;
+ }
+ }
+};
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::BlockSortV2()
+{
+ LocalTensor sortedLocal1 = calcBuf[0].AllocTensor();
+ LocalTensor sortedLocal2 = calcBuf[1].AllocTensor();
+ LocalTensor mrgLocal = calcBuf[2].AllocTensor();
+ GlobalTensor sortedBlock[2] = {sortedBlock1, sortedBlock2};
+
+ // Each time merge 4 queues into 1 queue.
+ constexpr uint8_t PREFIX_QUE_NUM = 4;
+ bool switchFlag = false;
+ GlobalTensor srcGlobal[4];
+ LocalTensor buffLocal[5];
+ int lengths[4];
+ for (int bindTile = 1; bindTile < tileNum; bindTile *= PREFIX_QUE_NUM) {
+ for (int tileIdx = 0; tileIdx < tileNum; tileIdx += bindTile * PREFIX_QUE_NUM) {
+ int mrgTileNum = min(tileNum - tileIdx, bindTile * PREFIX_QUE_NUM);
+ uint8_t queNum = (mrgTileNum + bindTile - 1) / bindTile;
+ uint8_t lastQueTileNum = mrgTileNum % bindTile;
+ if (lastQueTileNum == 0) {
+ lastQueTileNum = bindTile;
+ }
+ // Init GMSSrcList, GMSParams
+ for (int i = 0; i < queNum; i++) {
+ srcGlobal[i] = sortedBlock[switchFlag][TILE_LEN_ELEM * (tileIdx + bindTile * i)];
+ }
+ for (int i = 0; i < queNum - 1; i++) {
+ lengths[i] = TILE_LENGTH * bindTile;
+ }
+ lengths[queNum - 1] = TILE_LENGTH * lastQueTileNum;
+ GMSSrcList srcList {srcGlobal};
+ GMSParams params {lengths, queNum,
+ {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}};
+ MrgSortGM(sortedBlock[!switchFlag][TILE_LEN_ELEM * tileIdx], srcList, params);
+ }
+ switchFlag = !switchFlag;
+ }
+ if (switchFlag) {
+ DataCopyGM2GM(sortedBlock1, sortedBlock2, sortedLocal1, blockLength * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE);
+ }
+ calcBuf[0].FreeTensor(sortedLocal1);
+ calcBuf[1].FreeTensor(sortedLocal2);
+ calcBuf[2].FreeTensor(mrgLocal);
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::GlobalSortV2()
+{
+ LocalTensor sortedLocal1 = calcBuf[0].AllocTensor();
+ LocalTensor sortedLocal2 = calcBuf[1].AllocTensor();
+ LocalTensor mrgLocal = calcBuf[2].AllocTensor();
+ LocalTensor IBSyncLocal = sortedLocal2.ReinterpretCast();
+ GlobalTensor sortedGlobal[2] = {sortedGlobal1, sortedGlobal2};
+
+ // Each time merge up to 4 queues into 1 queue.
+ constexpr uint8_t PREFIX_QUE_NUM = 4;
+ bool switchFlag = false;
+ GlobalTensor srcGlobal[4];
+ int lengths[4];
+ for (int bindBlock = 1; bindBlock < blockNum; bindBlock *= PREFIX_QUE_NUM, eventID++) {
+ for (int blockIdx = 0; blockIdx < blockNum; blockIdx += bindBlock * PREFIX_QUE_NUM) {
+ if ((GetBlockIdx() == blockIdx + bindBlock) || (GetBlockIdx() == blockIdx + bindBlock * 2) ||
+ (GetBlockIdx() == blockIdx + bindBlock * 3)) {
+ PipeBarrier();
+ IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID);
+ PipeBarrier();
+ } else if (GetBlockIdx() == blockIdx) {
+ int mrgBlockNum = min(blockNum - blockIdx, bindBlock * PREFIX_QUE_NUM);
+ uint8_t queNum = (mrgBlockNum + bindBlock - 1) / bindBlock;
+ for (int i = 1; i < queNum; i++) {
+ PipeBarrier();
+ IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)blockIdx + (bindBlock * i), eventID);
+ PipeBarrier();
+ }
+                // Determine how many blocks of data the last queue contains.
+ uint8_t lastQueBlockNum = mrgBlockNum % bindBlock;
+ if (lastQueBlockNum == 0) {
+ lastQueBlockNum = bindBlock;
+ }
+ // Init GMSSrcList, GMSParams
+ for (int i = 0; i < queNum; i++) {
+ srcGlobal[i] =
+ sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * SORT_DATATYPE_SIZE_FACTOR];
+ }
+ for (int i = 0; i < queNum - 1; i++) {
+ lengths[i] =
+ GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - GetGlobalOffset(blockIdx + (bindBlock * i));
+ }
+ lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) -
+ GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)));
+ GMSSrcList srcList {srcGlobal};
+ GMSParams params {lengths, queNum,
+ {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}};
+ MrgSortGM(
+ sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], srcList, params);
+ }
+ }
+ switchFlag = !switchFlag;
+ }
+ // Switch valid workspace pointer.
+ if (switchFlag) {
+        GlobalTensor<float> tmpGlobal = sortedGlobal1;
+ sortedGlobal1 = sortedGlobal2;
+ sortedGlobal2 = tmpGlobal;
+
+        GlobalTensor<float> tmpGlobal1 = sortedBlock1;
+ sortedBlock1 = sortedBlock2;
+ sortedBlock2 = tmpGlobal1;
+
+        GlobalTensor<int32_t> tmpGlobal2 = sortedBlock1AsInt;
+ sortedBlock1AsInt = sortedBlock2AsInt;
+ sortedBlock2AsInt = tmpGlobal2;
+ }
+ calcBuf[0].FreeTensor(sortedLocal1);
+ calcBuf[1].FreeTensor(sortedLocal2);
+ calcBuf[2].FreeTensor(mrgLocal);
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::ConsecutiveUnique(const LocalTensor<float>& dstVal,
+    const LocalTensor<float>& srcLocal, const LocalTensor<float>& shiftedLocal, const LocalTensor<uint32_t>& bitMask32,
+    const uint16_t elemLength, uint64_t& tileUniqueCnt)
+{
+ LocalTensor bitMask16 = bitMask32.ReinterpretCast();
+ uint64_t rsvdCnt = 0;
+    // Separate Val and Idx.
+ GatherMask(dstVal, srcLocal, 1, false, 0, {1, static_cast((elemLength * 2 + 63) / 64), 8, 0}, rsvdCnt);
+ PipeBarrier();
+
+ // Gen bitMask to calc shifted array.
+ Duplicate(bitMask16, (uint16_t)0b1111111111111111, elemLength / 16);
+ PipeBarrier();
+ bitMask16.SetValue(0, 0b1111111111111110);
+
+ // Calc shifted array.
+ GatherMask(shiftedLocal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, rsvdCnt);
+ PipeBarrier();
+ // Set the last val as INF in order to avoid dropping the last unique val.
+ shiftedLocal.SetValue(elemLength - 1, -FLOAT_INF);
+
+ // Generate bitMask which represents unique numbers.
+ Compare(bitMask16, dstVal, shiftedLocal, CMPMODE::NE, (elemLength + 63) / 64 * 64);
+ PipeBarrier();
+
+ // Gather unique numbers and their idx.
+ GatherMask(dstVal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, tileUniqueCnt);
+ PipeBarrier();
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::TileUnique(const int32_t progress)
+{
+ LocalTensor bitMask32 = calcBuf[0].AllocTensor();
+ LocalTensor shiftedLocal = bitMask32[TILE_LENGTH].ReinterpretCast();
+ LocalTensor sortedLocal1 = calcBuf[1].AllocTensor();
+ LocalTensor sortedLocal2 = calcBuf[2].AllocTensor();
+ LocalTensor uniqueCntLocal = shiftedLocal.ReinterpretCast();
+ uint64_t tileUniqueCnt;
+ uint64_t tmpRsvdCnt;
+
+ DataCopy(sortedLocal1, sortedBlock1[progress * TILE_LEN_ELEM], TILE_LEN_ELEM);
+ PipeBarrier();
+
+ ConsecutiveUnique(sortedLocal2, sortedLocal1, shiftedLocal, bitMask32, TILE_LENGTH, tileUniqueCnt);
+    // If the input contains inf, append it here.
+ if ((progress == tileNum - 1) && hasInfFlag) {
+ sortedLocal2.SetValue(tileUniqueCnt, -FLOAT_INF);
+ tileUniqueCnt++;
+ }
+
+ if (tileUniqueCnt != 0) {
+ accUniqueCnt += tileUniqueCnt;
+ if (progress != 0 && lastTileUniqueVal == sortedLocal2.GetValue(0)) {
+ accUniqueCnt--;
+ }
+ DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], sortedLocal2,
+ {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0});
+ PipeBarrier();
+ lastTileUniqueVal = sortedLocal2.GetValue(tileUniqueCnt - 1);
+ }
+
+ // upload uniqueCnt.
+ if (progress == tileNum - 1) {
+ uniqueCntLocal.SetValue(0, accUniqueCnt);
+ DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], uniqueCntLocal,
+ {1, static_cast(sizeof(uint32_t) * 1), 0, 0});
+ PipeBarrier();
+ }
+ calcBuf[0].FreeTensor(shiftedLocal);
+ calcBuf[1].FreeTensor(sortedLocal1);
+ calcBuf[2].FreeTensor(sortedLocal2);
+}
+
+template <typename T>
+__aicore__ inline void KernelUnique<T>::CopyOut()
+{
+ LocalTensor copyLocal0 = calcBuf[0].AllocTensor();
+ LocalTensor copyLocal1 = calcBuf[1].AllocTensor();
+ LocalTensor IBSyncLocal = copyLocal1.ReinterpretCast();
+ LocalTensor copyLocal2 = calcBuf[2].AllocTensor();
+
+ uint64_t lastAccUniqueCnt = 0;
+    // Read each preceding block's uniqueCnt and accumulate the output offset.
+ for (int i = 0; i < GetBlockIdx(); i++) {
+ uint64_t lastUniqueCnt = blockUniqueCntGlobal.GetValue(i);
+ lastAccUniqueCnt += lastUniqueCnt;
+        // If the first value of block (i+1) equals the last value of block i, they share one output
+        // position, so decrement the accumulated count.
+ if (sortedGlobal1[GetGlobalOffset(i + 1) * SORT_DATATYPE_SIZE_FACTOR].GetValue(0) ==
+ sortedGlobal1[GetGlobalOffset(i) * SORT_DATATYPE_SIZE_FACTOR].GetValue(lastUniqueCnt - 1)) {
+ lastAccUniqueCnt--;
+ }
+ }
+ uint64_t thisUniqueCnt = blockUniqueCntGlobal.GetValue(GetBlockIdx());
+
+ uint64_t restLen = thisUniqueCnt;
+    // The max(Ta, Tb) helper above cannot be used in a constant expression, so compute the larger size manually.
+ constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float) ? sizeof(T) : sizeof(float);
+ LocalTensor copyVal32 = copyLocal0.template ReinterpretCast();
+ LocalTensor uniqueVal32 = copyLocal1.ReinterpretCast();
+ // Copy unique values (and counts) from Workspace to dst.
+ while (restLen > 0) {
+        // DataCopyPad can copy at most 65535 bytes per call, while one tile may hold up to 65536 bytes,
+        // so copy in multiple iterations.
+ uint64_t copyLen = min(restLen, TILE_LEN_BYTE / bottleneckTypeSize);
+ copyLen = min(copyLen, 65535 / bottleneckTypeSize);
+ if constexpr (!IsSameType::value) {
+ DataCopyPad(copyLocal1, sortedBlock1[thisUniqueCnt - restLen],
+ {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0});
+ PipeBarrier();
+ Muls(copyLocal1, copyLocal1, (float)-1, copyLen);
+ PipeBarrier();
+ Cast(copyLocal0, copyLocal1, RoundMode::CAST_RINT, copyLen);
+ PipeBarrier();
+ } else {
+ DataCopyPad(copyLocal0, sortedBlock1[thisUniqueCnt - restLen],
+ {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0});
+ PipeBarrier();
+ Muls(copyLocal0, copyLocal0, (float)-1, copyLen);
+ PipeBarrier();
+ }
+ // DataCopyPad does not support int64_t. Copy them as uint32_t.
+ if constexpr (sizeof(T) > 4) {
+ DataCopyPad(dstGlobal1As32[(lastAccUniqueCnt + thisUniqueCnt - restLen) * sizeof(T) / sizeof(uint32_t)],
+ copyVal32, {1, static_cast(sizeof(T) * copyLen), 0, 0});
+ } else {
+ DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], copyLocal0,
+ {1, static_cast(sizeof(T) * copyLen), 0, 0});
+ }
+ PipeBarrier();
+ restLen -= copyLen;
+ }
+ // Return unique count.
+ if (GetBlockIdx() == blockNum - 1) {
+ uniqueVal32.SetValue(0, lastAccUniqueCnt + thisUniqueCnt);
+ DataCopyPad(uniqueCntGlobal, uniqueVal32, {1, static_cast(sizeof(uint32_t) * 1), 0, 0});
+ PipeBarrier();
+ }
+ calcBuf[0].FreeTensor(copyLocal0);
+ calcBuf[1].FreeTensor(copyLocal1);
+}
+} // namespace AscendC
diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py
index fe90e0168c38da55137933b563525e502b0cb354..f28b456265ee0811d8794f087bdf995bc16a0b20 100644
--- a/mx_driving/__init__.py
+++ b/mx_driving/__init__.py
@@ -61,6 +61,7 @@ __all__ = [
"diff_iou_rotated_2d",
"nms3d_on_sight",
"cartesian_to_frenet",
+ "npu_unique",
"min_area_polygons"
]
@@ -125,6 +126,7 @@ from .ops.npu_batch_matmul import npu_batch_matmul
from .ops.nms3d_on_sight import nms3d_on_sight
from .ops.cartesian_to_frenet import cartesian_to_frenet
from .patcher import default_patcher_builder, patch_mmcv_version
+from .ops.npu_unique import npu_unique
from .ops.min_area_polygons import min_area_polygons
diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..314f41a0f8bc629894d164c48586ceaefbef7aa1
--- /dev/null
+++ b/mx_driving/csrc/Unique.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2024 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "csrc/OpApiCommon.h"
+#include "csrc/functions.h"
+
+
+at::Tensor npu_unique(const at::Tensor& input)
+{
+    TORCH_CHECK_NPU(input);
+ if (input.numel() < 2) {
+ at::Tensor output = at::Tensor(input).clone();
+ return output;
+ } else {
+ at::Tensor output = at::empty({input.numel()}, at::TensorOptions().dtype(input.dtype()).device(input.device()));
+ at::Tensor uniqueCnt = at::empty({1}, at::TensorOptions().dtype(at::ScalarType::Int).device(input.device()));
+ EXEC_NPU_CMD_SYNC(aclnnUnique, input, output, uniqueCnt);
+        int uniqueCount = uniqueCnt.item<int>();
+ return output.narrow(0, 0, uniqueCount);
+ }
+}
diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp
index 8e20d8a40c85d5f075c405da9ee15fc17ba2d500..ed8ad6ccf40f29c737ac248b52ffaecc58c7f7f7 100644
--- a/mx_driving/csrc/pybind.cpp
+++ b/mx_driving/csrc/pybind.cpp
@@ -242,6 +242,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
// npu_subm_sparse_conv3d_with_key
m.def("npu_subm_sparse_conv3d_with_key", &npu_subm_sparse_conv3d_with_key);
+
+ // npu_unique
+ m.def("npu_unique", &npu_unique);
// min_area_polygons
m.def("min_area_polygons", &min_area_polygons);
diff --git a/mx_driving/ops/npu_unique.py b/mx_driving/ops/npu_unique.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fd367b3535d527867313280501fae00da258933
--- /dev/null
+++ b/mx_driving/ops/npu_unique.py
@@ -0,0 +1,21 @@
+"""
+Copyright (c) OpenMMLab. All rights reserved.
+Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+Modification by: Huawei Developers
+Modification date: 2024-06-04
+Modification Description:
+Modification 1. Add support for Ascend NPU
+"""
+from torch.autograd import Function
+import mx_driving._C
+
+
+class UniqueFunction(Function):
+ @staticmethod
+ # 'pylint: disable=too-many-arguments,huawei-too-many-arguments
+ def forward(ctx, input_tensor):
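+        """Sort ``input_tensor`` in ascending order and drop duplicate values.
+
+        Returns a 1-D tensor with the same dtype as the input, containing only
+        the unique values.
+        """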
+ y = mx_driving._C.npu_unique(input_tensor)
+ return y
+
+
+npu_unique = UniqueFunction.apply
diff --git a/tests/torch/test_npu_unique.py b/tests/torch/test_npu_unique.py
new file mode 100644
index 0000000000000000000000000000000000000000..d914475294f02a91ce9068c21d09e45fbd7ba003
--- /dev/null
+++ b/tests/torch/test_npu_unique.py
@@ -0,0 +1,68 @@
+"""
+Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+"""
+import random
+import os
+import torch
+import torch_npu
+from data_cache import golden_data_cache
+from torch_npu.testing.testcase import TestCase, run_tests
+from mx_driving import npu_unique
+
+
+def gen_inputs(input_shape, dtype):
+ input_tensor = torch.randint(-256, 256, input_shape, dtype=dtype)
+ return input_tensor
+
+
+def gen_cpu_outputs(input_tensor):
+ cpu_result = torch.unique(input_tensor)
+ return cpu_result
+
+
+def gen_npu_outputs(input_tensor):
+ npu_result = npu_unique(input_tensor.npu())
+ return npu_result.cpu()
+
+
+class TestNpuUnique(TestCase):
+ def test_bfloat16(self, device='npu'):
+ input_tensor = gen_inputs([random.randint(1, 10000000)], torch.bfloat16)
+ cpu_result = gen_cpu_outputs(input_tensor)
+ npu_result = gen_npu_outputs(input_tensor)
+ self.assertRtolEqual(cpu_result.cpu().detach().to(torch.float32).numpy(),
+ npu_result.cpu().detach().to(torch.float32).numpy())
+
+ def test_float16(self, device='npu'):
+ input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float16)
+ cpu_result = gen_cpu_outputs(input_tensor)
+ npu_result = gen_npu_outputs(input_tensor)
+ self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy())
+
+ def test_float32(self, device='npu'):
+ input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float32)
+ cpu_result = gen_cpu_outputs(input_tensor)
+ npu_result = gen_npu_outputs(input_tensor)
+ self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy())
+
+ def test_int16(self, device='npu'):
+ input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int16)
+ cpu_result = gen_cpu_outputs(input_tensor)
+ npu_result = gen_npu_outputs(input_tensor)
+ self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy())
+
+ def test_int32(self, device='npu'):
+ input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int32)
+ cpu_result = gen_cpu_outputs(input_tensor)
+ npu_result = gen_npu_outputs(input_tensor)
+ self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy())
+
+ def test_int64(self, device='npu'):
+ input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int64)
+ cpu_result = gen_cpu_outputs(input_tensor)
+ npu_result = gen_npu_outputs(input_tensor)
+ self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy())
+
+
+if __name__ == "__main__":
+ run_tests()