diff --git a/docs/api/README.md b/docs/api/README.md
index d335b44ed1298c295ad053e04c7e3a35e97c8b57..5abc02091d6418491d2b10426abeea56424e4bac 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -54,7 +54,7 @@
 Y
-采样
+采样
 roipoint_pool3d
 Y
@@ -94,6 +94,10 @@
 grid_sampler2d_v2
 N
+
+npu_unique
+N
+
 体素化
 voxelization
diff --git a/docs/api/context/npu_unique.md b/docs/api/context/npu_unique.md
new file mode 100644
index 0000000000000000000000000000000000000000..843be385f4622ca7b453e2413f3b2c8fcca023e8
--- /dev/null
+++ b/docs/api/context/npu_unique.md
@@ -0,0 +1,37 @@
+## npu_unique[beta]
+
+### Interface Prototype
+
+```python
+mx_driving.npu_unique(Tensor input) -> Tensor
+```
+
+### Function Description
+
+Sorts the elements of the input `tensor` in ascending order and removes duplicate values.
+
+### Parameters
+
+- `input(Tensor)`: the input tensor. Supported data types: `float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`. Any shape with 1 to 8 dimensions is supported.
+
+### Returns
+
+- `output(Tensor)`: the output tensor. Its data type matches that of `input` (`float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`). The output is 1-dimensional.
+
+### Constraints
+
+- For `int32` and `int64` inputs, every element must lie within [-16777216, 16777216] (±2^24); values outside this range lose precision because they are sorted as `float32` internally.
+
+### Supported Products
+
+- Atlas A2 Training Series Products
+
+### Example
+
+```python
+import torch, torch_npu
+from mx_driving import npu_unique
+
+rand_tensor = torch.randint(-16777216, 16777216, (559794,), dtype=torch.int64)
+output = npu_unique(rand_tensor.npu())
+```
diff --git a/include/csrc/functions.h b/include/csrc/functions.h
index bdc71908ac93a3cc5ca4493b65a108a69eb1da45..e2d81d8052c4c60e792330feef01fe83cafbae8e 100644
--- a/include/csrc/functions.h
+++ b/include/csrc/functions.h
@@ -288,6 +288,7 @@ std::tuple calc_poly_start_end_sl(const at::
 at::Tensor npu_subm_sparse_conv3d_with_key(const at::Tensor& ouidx_offset, const at::Tensor& valid_indices,
     const at::Tensor& weight, const at::Tensor& feature, int indices_number, at::IntArrayRef kernel_size);
+at::Tensor npu_unique(const at::Tensor& input);
 at::Tensor min_area_polygons(const at::Tensor& pointsets);
 #endif // CSRC_FUNCTIONS_H_
diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bc04d9e1fbd55abaf4db798decae5eb5bdfbe068
--- /dev/null
+++ b/kernels/op_host/unique.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
+ *
+ */
+#include "register/op_def_registry.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "unique_tiling.h"
+constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024;
+
+
+namespace optiling {
+static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context)
+{
+    if (!context) {
+        return ge::GRAPH_FAILED;
+    }
+    UniqueTilingData tiling;
+
+    constexpr uint16_t tileLength = 8192;
+    const gert::StorageShape* inputShape = context->GetInputShape(0);
+    if (!inputShape) {
+        return ge::GRAPH_FAILED;
+    }
+    const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum();
+    uint32_t totalLength = 1;
+    for (int i = 0; i < dimNum; i++) {
+        totalLength *= inputShape->GetStorageShape().GetDim(i);
+    }
+    const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength;
+    const uint16_t tailLength = totalLength % tileLength;
+    const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo());
+    const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv();
+    const uint8_t blockNum = tileNum >= aivNum ?
aivNum : tileNum; + const uint32_t shortBlockTileNum = tileNum / blockNum; + const uint8_t longBlockNum = tileNum % blockNum; + const uint8_t shortBlockNum = blockNum - longBlockNum; + + tiling.set_totalLength(totalLength); + tiling.set_tileNum(tileNum); + tiling.set_shortBlockTileNum(shortBlockTileNum); + tiling.set_tailLength(tailLength); + tiling.set_blockNum(blockNum); + tiling.set_shortBlockNum(shortBlockNum); + + context->SetBlockDim(blockNum); + if (context->GetRawTilingData() == nullptr) { + return ge::GRAPH_FAILED; + } + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + // Workspace for IBSet/IBWait up to 8 times, and 2 times full data. + uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + auto&& currentWorkspace = context->GetWorkspaceSizes(1); + if (currentWorkspace == nullptr) { + return ge::GRAPH_FAILED; + } + size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2; + currentWorkspace[0] = usrSize + sysWorkspaceSize; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + + +namespace ge { +static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) +{ + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); + if (!x1_shape || !y_shape) { + return GRAPH_FAILED; + } + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} + +static ge::graphStatus UniqueInferDtype(gert::InferDataTypeContext* context) +{ + auto inputDtype = context->GetInputDataType(0); + context->SetOutputDataType(0, inputDtype); + return ge::GRAPH_SUCCESS; +} +} // namespace ge + + +namespace ops { +class Unique : public OpDef { +public: + explicit Unique(const char* name) : OpDef(name) + { + this->Input("input") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .IgnoreContiguous(); + this->Output("output") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("uniqueCnt") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->SetInferShape(ge::UniqueInferShape); + this->SetInferDataType(ge::UniqueInferDtype); + + this->AICore().SetTiling(optiling::UniqueTilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910_93"); + } +}; + +OP_ADD(Unique); +} // namespace ops diff --git a/kernels/op_host/unique_tiling.h b/kernels/op_host/unique_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..aed40f6f6f9895b83637ac2e3920a328e9c0303f --- /dev/null +++ b/kernels/op_host/unique_tiling.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
+ * + */ +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(UniqueTilingData) + TILING_DATA_FIELD_DEF(uint32_t, totalLength); + TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum); + TILING_DATA_FIELD_DEF(uint16_t, tailLength); + TILING_DATA_FIELD_DEF(uint8_t, blockNum); + TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(Unique, UniqueTilingData) +} diff --git a/kernels/op_kernel/unique.cpp b/kernels/op_kernel/unique.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e9cedcf5172f6cbbe5332fdbe9e5f15c10e506f0 --- /dev/null +++ b/kernels/op_kernel/unique.cpp @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + */ + +#include "kernel_operator.h" +#include "unique.h" + + +extern "C" __global__ __aicore__ void unique( + GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, GM_ADDR tiling) { + GM_ADDR usrWorkspace = AscendC::GetUserWorkspace(workspace); + GET_TILING_DATA(tiling_data, tiling); + TPipe pipe; + KernelUnique op(pipe); + op.Init(input, + output, + uniqueCnt, + usrWorkspace, + tiling_data.totalLength, + tiling_data.tileNum, + tiling_data.shortBlockTileNum, + tiling_data.tailLength, + tiling_data.blockNum, + tiling_data.shortBlockNum); + op.Process(); +} diff --git a/kernels/op_kernel/unique.h b/kernels/op_kernel/unique.h new file mode 100644 index 0000000000000000000000000000000000000000..9470d73b2bda7fe62f29a8feb246eba94f55e834 --- /dev/null +++ b/kernels/op_kernel/unique.h @@ -0,0 +1,698 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + */ + +#include "kernel_operator.h" +using namespace AscendC; + + +namespace AscendC { +template +__aicore__ inline Ta min(const Ta a, const Tb b) +{ + if (a > b) { + return b; + } + return a; +} + +template +__aicore__ inline Ta max(const Ta a, const Tb b) +{ + if (a < b) { + return b; + } + return a; +} + +template +class KernelUnique { +public: + __aicore__ inline KernelUnique(TPipe& pipe) : pipe(pipe) {} + // Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. 
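+    // Tiles are distributed so that the first shortBlockNum blocks own shortBlockTileNum tiles each and the
+    // remaining blocks own shortBlockTileNum + 1 tiles each; the offset is the number of tiles owned by all
+    // blocks before blockIdx, multiplied by TILE_LENGTH.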
+ __aicore__ inline size_t GetGlobalOffset(const uint32_t blockIdx); + __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, + const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum, + const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum); + __aicore__ inline void Process(); + +private: + __aicore__ inline void CopyIn(const int32_t progress); + __aicore__ inline void Elem32Sort(const int32_t progress); + __aicore__ inline void TileSort(const int32_t progress); + template + __aicore__ inline static void DataCopyGM2GM(const GlobalTensor& dst, const GlobalTensor& src, + const LocalTensor& tmpLocal, const int elemLength, const int bufByteLength); + using GMSSrcList = GlobalTensor (&)[4]; + struct GMSParams { + int (&GMSLengths)[4]; + uint8_t& queNum; + LocalTensor (&&buffLocal)[5]; + }; + __aicore__ inline static void MrgSortGM(GlobalTensor&& dstGlobal, GMSSrcList& srcList, GMSParams& params); + __aicore__ inline void BlockSortV2(); + __aicore__ inline void GlobalSortV2(); + __aicore__ inline static void ConsecutiveUnique(const LocalTensor& dstVal, + const LocalTensor& srcLocal, const LocalTensor& shiftedLocal, + const LocalTensor& bitMask16, const uint16_t elemLength, uint64_t& tileUniqueCnt); + __aicore__ inline void TileUnique(const int32_t progress); + __aicore__ inline void CopyOut(); + +private: + static constexpr int32_t TILE_LENGTH = 8192; + // INF to fill the tail blank, so that tail is automatically removed by Compare in Unique. + static constexpr float FLOAT_INF = 3e+99; + // Indicates the factor converting float to data structure used by Sort32&MrgSort. + static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8 + static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2 + static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536 + static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384 + static constexpr uint16_t VALID_QUE[5] = { + 0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort. + + TPipe& pipe; + TQue calcBuf[3]; + + GlobalTensor srcGlobal; + GlobalTensor srcGlobalAsUint; + GlobalTensor dstGlobal1; + GlobalTensor dstGlobal1As32; + GlobalTensor uniqueCntGlobal; + + GlobalTensor sortedBlock1; + GlobalTensor sortedBlock1AsInt; + GlobalTensor sortedBlock2; + GlobalTensor sortedBlock2AsInt; + GlobalTensor sortedGlobal1; + GlobalTensor sortedGlobal2; + + GlobalTensor IBSyncGlobal; + GlobalTensor blockUniqueCntGlobal; + + uint16_t syncWorkspaceSize; + uint8_t eventID {0}; + uint64_t accUniqueCnt {0}; + float lastTileUniqueVal; + + uint32_t totalLength; + uint32_t alignedTotalLength; + uint32_t tileNum; + uint32_t shortBlockTileNum; + uint16_t tailLength; + uint8_t blockNum; + uint8_t shortBlockNum; + + size_t globalOffset; // Offset of data for current block. + size_t blockLength; // Length of current block. + bool hasInfFlag {false}; +}; + +// Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. +template +__aicore__ inline size_t KernelUnique::GetGlobalOffset(const uint32_t blockIdx) +{ + // (shortBlockTileNum + 1) indicates longBlockTileNum. + const size_t offset = + (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) + + (this->shortBlockTileNum + 1) * (this->shortBlockNum >= blockIdx ? 
0 : blockIdx - this->shortBlockNum)) * + TILE_LENGTH; + return offset; +} + +template +__aicore__ inline void KernelUnique::Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, + const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum, + const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum) +{ + this->totalLength = totalLength; + this->alignedTotalLength = totalTileNum * TILE_LENGTH; + this->shortBlockTileNum = shortBlockTileNum; + this->tailLength = tailLength; + this->blockNum = blockNum; + this->shortBlockNum = shortBlockNum; + + const bool isShortBlock = this->shortBlockNum > GetBlockIdx(); + // (shortBlockTileNum + 1) indicates longBlockTileNum. + this->tileNum = isShortBlock ? shortBlockTileNum : shortBlockTileNum + 1; + this->blockLength = this->tileNum * TILE_LENGTH; + this->globalOffset = GetGlobalOffset(GetBlockIdx()); + + srcGlobal.SetGlobalBuffer((__gm__ T*)input + globalOffset, this->blockLength); + srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t*)input + globalOffset * sizeof(T) / sizeof(uint32_t), + this->blockLength * sizeof(T) / sizeof(uint32_t)); + dstGlobal1.SetGlobalBuffer((__gm__ T*)output, this->alignedTotalLength); + dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t*)output, this->alignedTotalLength * sizeof(T) / sizeof(int32_t)); + uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t*)uniqueCnt, 1); + + // sortedBlock is offsetted, and could only see the data that this block should process. + sortedBlock1.SetGlobalBuffer((__gm__ float*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + // sortedGlobal could see all data in the workspace. + sortedGlobal1.SetGlobalBuffer((__gm__ float*)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + sortedGlobal2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR, + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + + // Buff size for syncronizing according to document of IBWait&IBSet. + this->syncWorkspaceSize = (blockNum * 32 + 1) * 8; + IBSyncGlobal.SetGlobalBuffer( + (__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, syncWorkspaceSize); + blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t*)workspace + alignedTotalLength * 4 + syncWorkspaceSize, + (blockNum + 7) / 8 * 8); // Length aligned up to 32B. + + pipe.InitBuffer(calcBuf[0], 1, TILE_LEN_BYTE); + pipe.InitBuffer(calcBuf[1], 1, TILE_LEN_BYTE); + pipe.InitBuffer(calcBuf[2], 1, TILE_LEN_BYTE); +} + +template +__aicore__ inline void KernelUnique::Process() +{ + LocalTensor IBSyncLocal; + // Initialize sync buff. + if (GetBlockIdx() == 0) { + IBSyncLocal = calcBuf[0].AllocTensor(); + Duplicate(IBSyncLocal, 0, syncWorkspaceSize); + PipeBarrier(); + DataCopy(IBSyncGlobal, IBSyncLocal, syncWorkspaceSize); + PipeBarrier(); + calcBuf[0].FreeTensor(IBSyncLocal); + } // Initialize sync buff. + + // Sort within each tile. 
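+    // Per-tile pipeline: CopyIn loads (and negates) one tile, Elem32Sort sorts it in 32-element groups, and
+    // TileSort merges the groups into one fully sorted tile in workspace. The negation is undone in CopyOut,
+    // so the final unique result comes out in ascending order.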
+ for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { + CopyIn(tileIdx); + Elem32Sort(tileIdx); + TileSort(tileIdx); + } + + if (GetBlockNum() > 1) { + if (this->tileNum > 1) { + BlockSortV2(); // Sort within each block. + } + + GlobalSortV2(); // Sort globally. + + PipeBarrier(); + SyncAll(); + PipeBarrier(); + } + + // Check if an inf value exists. If do, inf will be append to the result in TileUnique(). + if ((IsSameType::value || IsSameType::value || IsSameType::value) && + GetBlockIdx() == blockNum - 1) { + PipeBarrier(); + const uint32_t totalLength32BAligned = (totalLength * sizeof(T) + 31) / 32 * 32 / sizeof(float); + if (sortedGlobal1.GetValue((totalLength32BAligned - 1) * 2) == -FLOAT_INF) { + hasInfFlag = true; + } + } + + // Do unique in each block based on tiles. + for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { + TileUnique(tileIdx); + } + + if (this->blockNum > 1) { + // Each block waits for its former block to upload blockUniqueCnt. + IBSyncLocal = calcBuf[0].AllocTensor(); + if (GetBlockIdx() != 0) { + PipeBarrier(); + IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx() - 1, eventID); + PipeBarrier(); + } + PipeBarrier(); + IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); + PipeBarrier(); + calcBuf[0].FreeTensor(IBSyncLocal); + } + + // Gather result from every block. + CopyOut(); +} + +template +__aicore__ inline void KernelUnique::CopyIn(const int32_t progress) +{ + LocalTensor srcLocal = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); + + // To process tail, fill the whole tile with INF, then cover it with tail. + int32_t castLen; // Valid length of the last block. + if ((progress != tileNum - 1) || (GetBlockIdx() != blockNum - 1) || tailLength == 0) { + // Must determine during compilation, otherwise we get a compilation error. + if constexpr (!IsSameType::value) { + DataCopy(srcLocal, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH); + } else { + DataCopy(sortedLocal2, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH); + } + castLen = TILE_LENGTH; + } else { + // Process tail. 
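+        // Only the last tile of the last block can be partial: pre-fill the whole tile with FLOAT_INF, then
+        // overwrite the first tailLength elements with real data so the padding sorts to the end and is
+        // dropped by the unique step later.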
+ LocalTensor srcAsUint = srcLocal.template ReinterpretCast(); + Duplicate(sortedLocal2, FLOAT_INF, TILE_LENGTH); + PipeBarrier(); + if constexpr (IsSameType::value) { + DataCopyPad(sortedLocal2, srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); + } else if constexpr (sizeof(T) >= sizeof(float)) { + DataCopyPad(srcAsUint, srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); + } else { + DataCopyPad(srcLocal, srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); + } + castLen = tailLength; + } + PipeBarrier(); + if constexpr (!IsSameType::value) { + if constexpr (sizeof(T) >= sizeof(float)) { + Cast(sortedLocal2, srcLocal, RoundMode::CAST_ROUND, castLen); + } else { + Cast(sortedLocal2, srcLocal, RoundMode::CAST_NONE, castLen); + } + PipeBarrier(); + } + Muls(sortedLocal2, sortedLocal2, (float)-1, TILE_LENGTH); + calcBuf[0].EnQue(srcLocal); + calcBuf[2].EnQue(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) +{ + LocalTensor srcLocal = calcBuf[0].DeQue(); + LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].DeQue(); + LocalTensor arithLocal = srcLocal.template ReinterpretCast()[TILE_LENGTH]; + + int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset + Duplicate(arithLocal, baseOffset, TILE_LENGTH); + PipeBarrier(); + + LocalTensor uidArray = arithLocal.template ReinterpretCast(); + // Max repeatTime of Sort32 is 255, which is exceeded because TILE_LENGTH is 8192. + constexpr uint8_t sort32BatchSize = 32; + constexpr uint8_t sort32RepeatLimit = 255; + int instrRepeatTime = 0; + int restLen = TILE_LENGTH; + while (restLen) { + int repTime = min(restLen / sort32BatchSize, sort32RepeatLimit); + Sort32(sortedLocal1[sort32BatchSize * sort32RepeatLimit * SORT_DATATYPE_SIZE_FACTOR * instrRepeatTime], + sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], + uidArray[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], repTime); + restLen -= repTime * sort32BatchSize; + instrRepeatTime++; + } + PipeBarrier(); + calcBuf[0].FreeTensor(srcLocal); + calcBuf[1].EnQue(sortedLocal1); + calcBuf[2].EnQue(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::TileSort(const int32_t progress) +{ + LocalTensor sortedLocal1 = calcBuf[1].DeQue(); + LocalTensor sortedLocal2 = calcBuf[2].DeQue(); + LocalTensor sortedQue[2] = {sortedLocal1, sortedLocal2}; + uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32. + uint16_t currentQueNum = TILE_LENGTH / currentQueLength; + bool switchFlag = false; + // Multiple MrgSort until we have one generally sorted tile. 
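+    // Each pass merges groups of 4 sorted queues into one, so the queue length grows 32 -> 128 -> ... -> 8192,
+    // ping-ponging between sortedLocal1 and sortedLocal2 via switchFlag.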
+ while (currentQueLength < TILE_LENGTH) { + const uint16_t elementLengths[4] = {currentQueLength, currentQueLength, currentQueLength, currentQueLength}; + const uint16_t fullMrgSortTime = currentQueNum / 4; + if (fullMrgSortTime > 0) { + MrgSort4Info params = {elementLengths, false, 0b1111, fullMrgSortTime}; + MrgSort(sortedQue[!switchFlag], + {sortedQue[switchFlag][0], sortedQue[switchFlag][currentQueLength * 1 * 2], + sortedQue[switchFlag][currentQueLength * 2 * 2], sortedQue[switchFlag][currentQueLength * 3 * 2]}, + params); + PipeBarrier(); + switchFlag = !switchFlag; + } + currentQueNum = fullMrgSortTime; + currentQueLength *= 4; + } + DataCopy(sortedBlock1[progress * TILE_LEN_ELEM], sortedQue[switchFlag], TILE_LEN_ELEM); + PipeBarrier(); + calcBuf[1].FreeTensor(sortedLocal1); + calcBuf[2].FreeTensor(sortedLocal2); +} + +template +template +__aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor& dst, const GlobalTensor& src, + const LocalTensor& tmpLocal, const int elemLength, const int bufByteLength) +{ + // Max byte size of DataCopyPad in one repeat is 65535. + int bufElemLength = min(bufByteLength, 65535) / sizeof(T1); + int restLen = elemLength; + while (restLen > 0) { + int copyLen = min(restLen, bufElemLength); + DataCopyPad(tmpLocal, src[elemLength - restLen], {1, static_cast(sizeof(T1) * copyLen), 0, 0}, + {false, 0, 0, 0}); + PipeBarrier(); + DataCopyPad(dst[elemLength - restLen], tmpLocal, {1, static_cast(sizeof(T1) * copyLen), 0, 0}); + PipeBarrier(); + restLen -= copyLen; + } +} + +template +__aicore__ inline void KernelUnique::MrgSortGM( + GlobalTensor&& dstGlobal, GMSSrcList& srcList, GMSParams& params) +{ + int restLen[4] {params.GMSLengths[0], params.GMSLengths[1], params.GMSLengths[2], params.GMSLengths[3]}; + int currentHead[4] {}; + int totalMrgLen {}; + uint8_t queNum = params.queNum; + // limited by MrgSort api constraint and mrgLocal size, we set different buffer length due to diffent queNum. + // mrgLocal contains 8192 elems, and MrgSort limits max 4095 elems per queue. 
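+    // BUFFER_LEN[queNum] is the per-queue chunk length: queNum chunks of this size fit into mrgLocal
+    // (8192 elements) while staying under the 4095-element-per-queue MrgSort limit.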
+ constexpr int BUFFER_LEN[5] {0, 0, 4095, 2730, 2048}; + uint16_t sortedLen[4]; + uint16_t mrgLen[4] {}; + while (queNum > 1) { + int currentBufferLen = BUFFER_LEN[queNum]; + for (int i = 0; i < queNum; i++) { + mrgLen[i] = min(restLen[i], currentBufferLen); + } + // CopyIn + for (int i = 0; i < queNum; i++) { + DataCopyPad(params.buffLocal[i], srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], + {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0}, + {false, 0, 0, 0}); + } + PipeBarrier(); + // MrgSort + MrgSort4Info localParams {mrgLen, true, VALID_QUE[queNum], 1}; + MrgSort(params.buffLocal[4], + {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, localParams); + PipeBarrier(); + GetMrgSortResult(sortedLen[0], sortedLen[1], sortedLen[2], sortedLen[3]); + const uint16_t localMrgLen = sortedLen[0] + sortedLen[1] + sortedLen[2] + sortedLen[3]; + // CopyOut + DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4], + {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0}); + PipeBarrier(); + // renew currentHead, restLen + totalMrgLen += localMrgLen; + for (int i = 0; i < queNum; i++) { + restLen[i] -= sortedLen[i]; + currentHead[i] += sortedLen[i]; + } + // Switch empty to tail + for (int i = 0; i < queNum; i++) { + if (restLen[i] == 0) { + for (int j = i; j < 3; j++) { + restLen[j] = restLen[j + 1]; + currentHead[j] = currentHead[j + 1]; + srcList[j] = srcList[j + 1]; + } + restLen[3] = 0; + queNum--; + break; // because ifExhaustedSuspension == true, there is 0 or 1 empty que. + } + } + } + // Process tail + for (int i = 0; i < params.queNum; i++) { + if (restLen[i] > 0) { + DataCopyGM2GM(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], + srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4], + restLen[i] * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE); + break; + } + } +}; + +template +__aicore__ inline void KernelUnique::BlockSortV2() +{ + LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); + LocalTensor mrgLocal = calcBuf[2].AllocTensor(); + GlobalTensor sortedBlock[2] = {sortedBlock1, sortedBlock2}; + + // Each time merge 4 queues into 1 queue. 
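+    // Runs of bindTile consecutive sorted tiles are merged up to 4 at a time through the GM workspace, so the
+    // sorted run length grows by 4x per pass (ping-ponging between sortedBlock1 and sortedBlock2) until the
+    // whole block is one sorted sequence.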
+ constexpr uint8_t PREFIX_QUE_NUM = 4; + bool switchFlag = false; + GlobalTensor srcGlobal[4]; + LocalTensor buffLocal[5]; + int lengths[4]; + for (int bindTile = 1; bindTile < tileNum; bindTile *= PREFIX_QUE_NUM) { + for (int tileIdx = 0; tileIdx < tileNum; tileIdx += bindTile * PREFIX_QUE_NUM) { + int mrgTileNum = min(tileNum - tileIdx, bindTile * PREFIX_QUE_NUM); + uint8_t queNum = (mrgTileNum + bindTile - 1) / bindTile; + uint8_t lastQueTileNum = mrgTileNum % bindTile; + if (lastQueTileNum == 0) { + lastQueTileNum = bindTile; + } + // Init GMSSrcList, GMSParams + for (int i = 0; i < queNum; i++) { + srcGlobal[i] = sortedBlock[switchFlag][TILE_LEN_ELEM * (tileIdx + bindTile * i)]; + } + for (int i = 0; i < queNum - 1; i++) { + lengths[i] = TILE_LENGTH * bindTile; + } + lengths[queNum - 1] = TILE_LENGTH * lastQueTileNum; + GMSSrcList srcList {srcGlobal}; + GMSParams params {lengths, queNum, + {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}}; + MrgSortGM(sortedBlock[!switchFlag][TILE_LEN_ELEM * tileIdx], srcList, params); + } + switchFlag = !switchFlag; + } + if (switchFlag) { + DataCopyGM2GM(sortedBlock1, sortedBlock2, sortedLocal1, blockLength * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE); + } + calcBuf[0].FreeTensor(sortedLocal1); + calcBuf[1].FreeTensor(sortedLocal2); + calcBuf[2].FreeTensor(mrgLocal); +} + +template +__aicore__ inline void KernelUnique::GlobalSortV2() +{ + LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); + LocalTensor mrgLocal = calcBuf[2].AllocTensor(); + LocalTensor IBSyncLocal = sortedLocal2.ReinterpretCast(); + GlobalTensor sortedGlobal[2] = {sortedGlobal1, sortedGlobal2}; + + // Each time merge up to 4 queues into 1 queue. + constexpr uint8_t PREFIX_QUE_NUM = 4; + bool switchFlag = false; + GlobalTensor srcGlobal[4]; + int lengths[4]; + for (int bindBlock = 1; bindBlock < blockNum; bindBlock *= PREFIX_QUE_NUM, eventID++) { + for (int blockIdx = 0; blockIdx < blockNum; blockIdx += bindBlock * PREFIX_QUE_NUM) { + if ((GetBlockIdx() == blockIdx + bindBlock) || (GetBlockIdx() == blockIdx + bindBlock * 2) || + (GetBlockIdx() == blockIdx + bindBlock * 3)) { + PipeBarrier(); + IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); + PipeBarrier(); + } else if (GetBlockIdx() == blockIdx) { + int mrgBlockNum = min(blockNum - blockIdx, bindBlock * PREFIX_QUE_NUM); + uint8_t queNum = (mrgBlockNum + bindBlock - 1) / bindBlock; + for (int i = 1; i < queNum; i++) { + PipeBarrier(); + IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)blockIdx + (bindBlock * i), eventID); + PipeBarrier(); + } + // 判断最后一个队列包含了多少个block的数据. 
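+                // i.e. determine how many blocks of data the last queue contains; the trailing queue may
+                // cover fewer than bindBlock blocks.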
+ uint8_t lastQueBlockNum = mrgBlockNum % bindBlock; + if (lastQueBlockNum == 0) { + lastQueBlockNum = bindBlock; + } + // Init GMSSrcList, GMSParams + for (int i = 0; i < queNum; i++) { + srcGlobal[i] = + sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * SORT_DATATYPE_SIZE_FACTOR]; + } + for (int i = 0; i < queNum - 1; i++) { + lengths[i] = + GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - GetGlobalOffset(blockIdx + (bindBlock * i)); + } + lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) - + GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1))); + GMSSrcList srcList {srcGlobal}; + GMSParams params {lengths, queNum, + {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}}; + MrgSortGM( + sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], srcList, params); + } + } + switchFlag = !switchFlag; + } + // Switch valid workspace pointer. + if (switchFlag) { + GlobalTensor tmpGlobal = sortedGlobal1; + sortedGlobal1 = sortedGlobal2; + sortedGlobal2 = tmpGlobal; + + GlobalTensor tmpGlobal1 = sortedBlock1; + sortedBlock1 = sortedBlock2; + sortedBlock2 = tmpGlobal1; + + GlobalTensor tmpGlobal2 = sortedBlock1AsInt; + sortedBlock1AsInt = sortedBlock2AsInt; + sortedBlock2AsInt = tmpGlobal2; + } + calcBuf[0].FreeTensor(sortedLocal1); + calcBuf[1].FreeTensor(sortedLocal2); + calcBuf[2].FreeTensor(mrgLocal); +} + +template +__aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor& dstVal, + const LocalTensor& srcLocal, const LocalTensor& shiftedLocal, const LocalTensor& bitMask32, + const uint16_t elemLength, uint64_t& tileUniqueCnt) +{ + LocalTensor bitMask16 = bitMask32.ReinterpretCast(); + uint64_t rsvdCnt = 0; + // Seperate Val and Idx. + GatherMask(dstVal, srcLocal, 1, false, 0, {1, static_cast((elemLength * 2 + 63) / 64), 8, 0}, rsvdCnt); + PipeBarrier(); + + // Gen bitMask to calc shifted array. + Duplicate(bitMask16, (uint16_t)0b1111111111111111, elemLength / 16); + PipeBarrier(); + bitMask16.SetValue(0, 0b1111111111111110); + + // Calc shifted array. + GatherMask(shiftedLocal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, rsvdCnt); + PipeBarrier(); + // Set the last val as INF in order to avoid dropping the last unique val. + shiftedLocal.SetValue(elemLength - 1, -FLOAT_INF); + + // Generate bitMask which represents unique numbers. + Compare(bitMask16, dstVal, shiftedLocal, CMPMODE::NE, (elemLength + 63) / 64 * 64); + PipeBarrier(); + + // Gather unique numbers and their idx. + GatherMask(dstVal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, tileUniqueCnt); + PipeBarrier(); +} + +template +__aicore__ inline void KernelUnique::TileUnique(const int32_t progress) +{ + LocalTensor bitMask32 = calcBuf[0].AllocTensor(); + LocalTensor shiftedLocal = bitMask32[TILE_LENGTH].ReinterpretCast(); + LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); + LocalTensor uniqueCntLocal = shiftedLocal.ReinterpretCast(); + uint64_t tileUniqueCnt; + uint64_t tmpRsvdCnt; + + DataCopy(sortedLocal1, sortedBlock1[progress * TILE_LEN_ELEM], TILE_LEN_ELEM); + PipeBarrier(); + + ConsecutiveUnique(sortedLocal2, sortedLocal1, shiftedLocal, bitMask32, TILE_LENGTH, tileUniqueCnt); + // If has inf, append. 
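+    // Genuine +inf inputs become -FLOAT_INF after negation, the same value as the tail padding, and are
+    // filtered out by ConsecutiveUnique together with it; if the input really contained inf, one entry is
+    // appended back here.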
+ if ((progress == tileNum - 1) && hasInfFlag) { + sortedLocal2.SetValue(tileUniqueCnt, -FLOAT_INF); + tileUniqueCnt++; + } + + if (tileUniqueCnt != 0) { + accUniqueCnt += tileUniqueCnt; + if (progress != 0 && lastTileUniqueVal == sortedLocal2.GetValue(0)) { + accUniqueCnt--; + } + DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], sortedLocal2, + {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0}); + PipeBarrier(); + lastTileUniqueVal = sortedLocal2.GetValue(tileUniqueCnt - 1); + } + + // upload uniqueCnt. + if (progress == tileNum - 1) { + uniqueCntLocal.SetValue(0, accUniqueCnt); + DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], uniqueCntLocal, + {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); + PipeBarrier(); + } + calcBuf[0].FreeTensor(shiftedLocal); + calcBuf[1].FreeTensor(sortedLocal1); + calcBuf[2].FreeTensor(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::CopyOut() +{ + LocalTensor copyLocal0 = calcBuf[0].AllocTensor(); + LocalTensor copyLocal1 = calcBuf[1].AllocTensor(); + LocalTensor IBSyncLocal = copyLocal1.ReinterpretCast(); + LocalTensor copyLocal2 = calcBuf[2].AllocTensor(); + + uint64_t lastAccUniqueCnt = 0; + // Get every blockUniqueCnt before current block. Calc accumulate uniqueCnt. + for (int i = 0; i < GetBlockIdx(); i++) { + uint64_t lastUniqueCnt = blockUniqueCntGlobal.GetValue(i); + lastAccUniqueCnt += lastUniqueCnt; + // If the first val of (i+1)th block equals to the last val of (i)th block, then they should be placed in + // the same position, accUniqueCnt--. + if (sortedGlobal1[GetGlobalOffset(i + 1) * SORT_DATATYPE_SIZE_FACTOR].GetValue(0) == + sortedGlobal1[GetGlobalOffset(i) * SORT_DATATYPE_SIZE_FACTOR].GetValue(lastUniqueCnt - 1)) { + lastAccUniqueCnt--; + } + } + uint64_t thisUniqueCnt = blockUniqueCntGlobal.GetValue(GetBlockIdx()); + + uint64_t restLen = thisUniqueCnt; + // max(Ta a, Tb b) function does not support compilation period calc. + constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float) ? sizeof(T) : sizeof(float); + LocalTensor copyVal32 = copyLocal0.template ReinterpretCast(); + LocalTensor uniqueVal32 = copyLocal1.ReinterpretCast(); + // Copy unique values (and counts) from Workspace to dst. + while (restLen > 0) { + // DataCopyPad could copy up to 65535B in one cycle. And one tile may contain up to 65536B. So we should + // process multiple cycles. + uint64_t copyLen = min(restLen, TILE_LEN_BYTE / bottleneckTypeSize); + copyLen = min(copyLen, 65535 / bottleneckTypeSize); + if constexpr (!IsSameType::value) { + DataCopyPad(copyLocal1, sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0}); + PipeBarrier(); + Muls(copyLocal1, copyLocal1, (float)-1, copyLen); + PipeBarrier(); + Cast(copyLocal0, copyLocal1, RoundMode::CAST_RINT, copyLen); + PipeBarrier(); + } else { + DataCopyPad(copyLocal0, sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0}); + PipeBarrier(); + Muls(copyLocal0, copyLocal0, (float)-1, copyLen); + PipeBarrier(); + } + // DataCopyPad does not support int64_t. Copy them as uint32_t. + if constexpr (sizeof(T) > 4) { + DataCopyPad(dstGlobal1As32[(lastAccUniqueCnt + thisUniqueCnt - restLen) * sizeof(T) / sizeof(uint32_t)], + copyVal32, {1, static_cast(sizeof(T) * copyLen), 0, 0}); + } else { + DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], copyLocal0, + {1, static_cast(sizeof(T) * copyLen), 0, 0}); + } + PipeBarrier(); + restLen -= copyLen; + } + // Return unique count. 
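+    // The last block writes the final total (the accumulated unique counts of all blocks) into the 1-element
+    // uniqueCnt output tensor; the host side reads it back to narrow the output.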
+    if (GetBlockIdx() == blockNum - 1) {
+        uniqueVal32.SetValue(0, lastAccUniqueCnt + thisUniqueCnt);
+        DataCopyPad(uniqueCntGlobal, uniqueVal32, {1, static_cast(sizeof(uint32_t) * 1), 0, 0});
+        PipeBarrier();
+    }
+    calcBuf[0].FreeTensor(copyLocal0);
+    calcBuf[1].FreeTensor(copyLocal1);
+}
+} // namespace AscendC
diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py
index fe90e0168c38da55137933b563525e502b0cb354..f28b456265ee0811d8794f087bdf995bc16a0b20 100644
--- a/mx_driving/__init__.py
+++ b/mx_driving/__init__.py
@@ -61,6 +61,7 @@ __all__ = [
     "diff_iou_rotated_2d",
     "nms3d_on_sight",
     "cartesian_to_frenet",
+    "npu_unique",
     "min_area_polygons"
 ]
@@ -125,6 +126,7 @@ from .ops.npu_batch_matmul import npu_batch_matmul
 from .ops.nms3d_on_sight import nms3d_on_sight
 from .ops.cartesian_to_frenet import cartesian_to_frenet
 from .patcher import default_patcher_builder, patch_mmcv_version
+from .ops.npu_unique import npu_unique
 from .ops.min_area_polygons import min_area_polygons
diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..314f41a0f8bc629894d164c48586ceaefbef7aa1
--- /dev/null
+++ b/mx_driving/csrc/Unique.cpp
@@ -0,0 +1,34 @@
+// Copyright (c) 2024 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "csrc/OpApiCommon.h"
+#include "csrc/functions.h"
+
+
+at::Tensor npu_unique(const at::Tensor& input)
+{
+    TORCH_CHECK_NPU(input);
+    if (input.numel() < 2) {
+        at::Tensor output = at::Tensor(input).clone();
+        return output;
+    } else {
+        at::Tensor output = at::empty({input.numel()}, at::TensorOptions().dtype(input.dtype()).device(input.device()));
+        at::Tensor uniqueCnt = at::empty({1}, at::TensorOptions().dtype(at::ScalarType::Int).device(input.device()));
+        EXEC_NPU_CMD_SYNC(aclnnUnique, input, output, uniqueCnt);
+        int uniqueCount = uniqueCnt.item<int>();
+        return output.narrow(0, 0, uniqueCount);
+    }
+}
diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp
index 8e20d8a40c85d5f075c405da9ee15fc17ba2d500..ed8ad6ccf40f29c737ac248b52ffaecc58c7f7f7 100644
--- a/mx_driving/csrc/pybind.cpp
+++ b/mx_driving/csrc/pybind.cpp
@@ -242,6 +242,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     // npu_subm_sparse_conv3d_with_key
     m.def("npu_subm_sparse_conv3d_with_key", &npu_subm_sparse_conv3d_with_key);
+
+    // npu_unique
+    m.def("npu_unique", &npu_unique);
     // min_area_polygons
     m.def("min_area_polygons", &min_area_polygons);
diff --git a/mx_driving/ops/npu_unique.py b/mx_driving/ops/npu_unique.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fd367b3535d527867313280501fae00da258933
--- /dev/null
+++ b/mx_driving/ops/npu_unique.py
@@ -0,0 +1,21 @@
+"""
+Copyright (c) OpenMMLab. All rights reserved.
+Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+Modification by: Huawei Developers
+Modification date: 2024-06-04
+Modification Description:
+Modification 1.
Add support for Ascend NPU +""" +from torch.autograd import Function +import mx_driving._C + + +class UniqueFunction(Function): + @staticmethod + # 'pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward(ctx, input_tensor): + y = mx_driving._C.npu_unique(input_tensor) + return y + + +npu_unique = UniqueFunction.apply diff --git a/tests/torch/test_npu_unique.py b/tests/torch/test_npu_unique.py new file mode 100644 index 0000000000000000000000000000000000000000..d914475294f02a91ce9068c21d09e45fbd7ba003 --- /dev/null +++ b/tests/torch/test_npu_unique.py @@ -0,0 +1,68 @@ +""" +Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +""" +import random +import os +import torch +import torch_npu +from data_cache import golden_data_cache +from torch_npu.testing.testcase import TestCase, run_tests +from mx_driving import npu_unique + + +def gen_inputs(input_shape, dtype): + input_tensor = torch.randint(-256, 256, input_shape, dtype=dtype) + return input_tensor + + +def gen_cpu_outputs(input_tensor): + cpu_result = torch.unique(input_tensor) + return cpu_result + + +def gen_npu_outputs(input_tensor): + npu_result = npu_unique(input_tensor.npu()) + return npu_result.cpu() + + +class TestNpuUnique(TestCase): + def test_bfloat16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.bfloat16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().to(torch.float32).numpy(), + npu_result.cpu().detach().to(torch.float32).numpy()) + + def test_float16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_float32(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float32) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int32(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int32) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int64(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int64) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + +if __name__ == "__main__": + run_tests()