From 2e5403353281d1a093a8faba28b0630b84146749 Mon Sep 17 00:00:00 2001 From: huangyuan Date: Mon, 12 May 2025 15:50:48 +0800 Subject: [PATCH] deformable aggregation ops optim and fp16 adapt --- kernels/op_host/deformable_aggregation.cpp | 51 ++-- .../op_host/deformable_aggregation_grad.cpp | 62 ++-- .../op_host/deformable_aggregation_tiling.h | 1 - kernels/op_kernel/deformable_aggregation.cpp | 137 ++++----- .../op_kernel/deformable_aggregation_grad.cpp | 264 +++++++++--------- model_examples/SparseDrive/README.md | 6 +- mx_driving/ops/npu_deformable_aggregation.py | 18 +- 7 files changed, 272 insertions(+), 267 deletions(-) diff --git a/kernels/op_host/deformable_aggregation.cpp b/kernels/op_host/deformable_aggregation.cpp index 71233b6d..0f13236b 100644 --- a/kernels/op_host/deformable_aggregation.cpp +++ b/kernels/op_host/deformable_aggregation.cpp @@ -23,6 +23,7 @@ namespace { constexpr uint32_t SINGLE = 1; constexpr uint32_t BYTE_BLOCK = 32; constexpr uint32_t SIZE_OF_FP32 = 4; +constexpr uint32_t SIZE_OF_FP16 = 2; constexpr uint32_t BATCH_SIZE_IDX = 0; constexpr uint32_t FEAT_IDX = 1; constexpr uint32_t EMBEDS_IDX = 2; @@ -53,8 +54,6 @@ static ge::graphStatus TilingForDeformableAggregation(gert::TilingContext* conte return ge::GRAPH_FAILED; } - auto dtype = context->GetInputDesc(0)->GetDataType(); - auto attrs = context->GetAttrs(); if (attrs == nullptr) { return ge::GRAPH_FAILED; @@ -76,16 +75,11 @@ static ge::graphStatus TilingForDeformableAggregation(gert::TilingContext* conte auto numScales = getAttr(SCALE_IDX); auto numGroups = getAttr(GROUPS_IDX); - uint32_t alignNum = BYTE_BLOCK / SIZE_OF_FP32; + bool dtype = context->GetInputDesc(0)->GetDataType() == ge::DT_FLOAT; + uint32_t dataByteNum = dtype ? SIZE_OF_FP32 : SIZE_OF_FP16; + uint32_t alignNum = BYTE_BLOCK / dataByteNum; uint32_t cAligned = CeilAlign(static_cast(numEmbeds), alignNum); - uint64_t ubSize; - ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); - // 计算除weightBuf_所占空间以外的其他ub大小,并流出预留量(16 * 1024) - uint64_t usedUbSize = (16 * 1024 + 6 * cAligned + numPoints * numCams * 2 + numCams * numScales * 3) * SIZE_OF_FP32; - // 判断weightBuf_是否能放下包括numPoints大小的数据,分情况在不同位置进行数据搬运 - bool memoryFlag = (ubSize - usedUbSize) > numPoints * numCams * numScales * numGroups * SIZE_OF_FP32; - context->SetBlockDim(coreNum); tiling.set_bs(bs); @@ -97,7 +91,6 @@ static ge::graphStatus TilingForDeformableAggregation(gert::TilingContext* conte tiling.set_numScales(numScales); tiling.set_numGroups(numGroups); tiling.set_cAligned(cAligned); - tiling.set_memoryFlag(memoryFlag); tiling.set_coreNum(coreNum); if (context->GetRawTilingData() == nullptr) { @@ -151,34 +144,34 @@ public: { this->Input("mc_ms_feat") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("spatial_shape") .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("scale_start_index") .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("sampling_location") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("weights") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Output("out") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Attr("batch_size").AttrType(REQUIRED).Int(); this->Attr("num_feat").AttrType(REQUIRED).Int(); diff --git a/kernels/op_host/deformable_aggregation_grad.cpp b/kernels/op_host/deformable_aggregation_grad.cpp index caab5fe6..80ed3211 100644 --- a/kernels/op_host/deformable_aggregation_grad.cpp +++ b/kernels/op_host/deformable_aggregation_grad.cpp @@ -25,6 +25,7 @@ namespace { constexpr uint32_t SINGLE = 1; constexpr uint32_t BYTE_BLOCK = 32; constexpr uint32_t SIZE_OF_FP32 = 4; +constexpr uint32_t SIZE_OF_FP16 = 2; const uint32_t INPUT_FEAT = 0; const uint32_t INPUT_SPATIAL_SHAPE = 1; @@ -88,10 +89,13 @@ static ge::graphStatus TilingForDeformableAggregationGrad(gert::TilingContext* c uint32_t tailWeightNum = Tail(totalTask, avgWeightNum); usedCoreNum = Ceil(totalTask, avgWeightNum); + bool dtype = context->GetInputDesc(INPUT_FEAT)->GetDataType() == ge::DT_FLOAT; + uint32_t dataTypeSize = dtype ? SIZE_OF_FP32 : SIZE_OF_FP16; + uint64_t ubSize; ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); - uint64_t usedUbSize = (10 * 1024 + 22 * numEmbeds + numCams * numScale * numGroups + numPoints * numCams * 10) * SIZE_OF_FP32; - uint32_t singleProcessTaskLen = (ubSize - usedUbSize) / SIZE_OF_FP32 / (numEmbeds); + uint64_t usedUbSize = (10 * 1024 + 15 * numEmbeds + 2 * numScale * numEmbeds + 2 * numScale * numGroups + 2 * numPoints * numCams) * dataTypeSize; + uint32_t singleProcessTaskLen = (ubSize - usedUbSize) / dataTypeSize / numEmbeds; context->SetBlockDim(usedCoreNum); tiling.set_usedCoreNum(usedCoreNum); @@ -155,49 +159,49 @@ public: { this->Input("mc_ms_feat") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("spatial_shape") .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("scale_start_index") .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("sampling_location") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("weights") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Input("grad_output") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Output("grad_mc_ms_feat") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Output("grad_sampling_location") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Output("grad_weights") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->SetInferShape(ge::InferShapeForDeformableAggregationGrad) .SetInferDataType(ge::InferDataTypeForDeformableAggregationGrad); diff --git a/kernels/op_host/deformable_aggregation_tiling.h b/kernels/op_host/deformable_aggregation_tiling.h index a14c7816..93d40c23 100644 --- a/kernels/op_host/deformable_aggregation_tiling.h +++ b/kernels/op_host/deformable_aggregation_tiling.h @@ -16,7 +16,6 @@ TILING_DATA_FIELD_DEF(uint32_t, numCams); TILING_DATA_FIELD_DEF(uint32_t, numScales); TILING_DATA_FIELD_DEF(uint32_t, numGroups); TILING_DATA_FIELD_DEF(uint32_t, cAligned); -TILING_DATA_FIELD_DEF(uint32_t, memoryFlag); TILING_DATA_FIELD_DEF(uint32_t, coreNum); END_TILING_DATA_DEF; diff --git a/kernels/op_kernel/deformable_aggregation.cpp b/kernels/op_kernel/deformable_aggregation.cpp index 70152a96..df85cd65 100644 --- a/kernels/op_kernel/deformable_aggregation.cpp +++ b/kernels/op_kernel/deformable_aggregation.cpp @@ -23,20 +23,28 @@ public: numScales_ = tiling_data->numScales; numGroups_ = tiling_data->numGroups; cAligned_ = tiling_data->cAligned; - memoryFlag_ = tiling_data->memoryFlag; coreNum_ = tiling_data->coreNum; numChannels_ = numEmbeds_ / numGroups_; - weightBufSize_ = memoryFlag_ ? numPoints_ * numCams_ * numScales_ * numGroups_ : numCams_ * numScales_ * numGroups_; - weightBufSize_ = AlignUp(weightBufSize_, blockAlign_); - locBufSize_ = AlignUp(numPoints_ * numCams_ * 2, blockAlign_); - scaleStartBufSize_ = AlignUp(numCams_ * numScales_, blockAlign_); - spatialShapeBufSize_ = AlignUp(numCams_ * numScales_ * 2, blockAlign_); + uint32_t blockSize_ = 32; + blockAlignFloat_ = blockSize_ / sizeof(DTYPE_F); + blockAlignInt_ = blockSize_ / sizeof(DTYPE_I); + + weightBufSize_ = AlignUp(numScales_ * numGroups_, blockAlignFloat_); + locBufSize_ = AlignUp(numPoints_ * numCams_ * 2, blockAlignFloat_); + scaleStartBufSize_ = AlignUp(numCams_ * numScales_, blockAlignInt_); + spatialShapeBufSize_ = AlignUp(numCams_ * numScales_ * 2, blockAlignInt_); + weightMulBufSize_ = numScales_ * cAligned_; + + v1Offset_ = 0 * weightMulBufSize_; + v2Offset_ = 1 * weightMulBufSize_; + v3Offset_ = 2 * weightMulBufSize_; + v4Offset_ = 3 * weightMulBufSize_; copyOutParams_ = {1, static_cast(numEmbeds_ * sizeof(DTYPE_F)), 0, 0, 0}; - srcShape_[0] = numGroups_; + srcShape_[0] = numScales_ * numGroups_; srcShape_[1] = 1; - dstShape_[0] = numGroups_; + dstShape_[0] = numScales_ * numGroups_; dstShape_[1] = numChannels_; ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); @@ -62,8 +70,8 @@ public: pipe_->InitBuffer(locationBuf_, locBufSize_ * sizeof(DTYPE_F)); pipe_->InitBuffer(scaleStartBuf_, scaleStartBufSize_ * sizeof(DTYPE_I)); pipe_->InitBuffer(spatialShapeBuf_, spatialShapeBufSize_ * sizeof(DTYPE_I)); - pipe_->InitBuffer(vBuf_, 4 * cAligned_ * sizeof(DTYPE_F)); - pipe_->InitBuffer(weightMulBuf_, cAligned_ * sizeof(DTYPE_F)); + pipe_->InitBuffer(vBuf_, 4 * weightMulBufSize_ * sizeof(DTYPE_F)); + pipe_->InitBuffer(weightMulBuf_, 4 * weightMulBufSize_ * sizeof(DTYPE_F)); pipe_->InitBuffer(resBuf_, cAligned_ * sizeof(DTYPE_F)); weightLocal_ = weightBuf_.Get(); @@ -86,8 +94,10 @@ public: endOffset_ = taskNum_; } + Duplicate(vLocal_, static_cast(0.0f), 4 * weightMulBufSize_); DataCopy(scaleStartLocal_, scaleStartIndexGm_, scaleStartBufSize_); DataCopy(spatialShapeLocal_, spatialShapesGm_, spatialShapeBufSize_); + for (uint32_t taskIdx = startOffset_; taskIdx < endOffset_; ++taskIdx) { ComputeAndCopyOut(taskIdx); } @@ -100,56 +110,57 @@ public: uint64_t refOffsetGm = (batchIdx * numAnchors_ + anchorIdx) * numEmbeds_; uint64_t locationOffsetGm = (batchIdx * numAnchors_ + anchorIdx) * numPoints_ * numCams_ * 2; - if (memoryFlag_) { - uint64_t weightOffsetGm = (batchIdx * numAnchors_ + - anchorIdx) * numPoints_ * numCams_ * numScales_ * numGroups_; - SetFlag(0); - WaitFlag(0); - DataCopy(weightLocal_, weightsGm_[weightOffsetGm], weightBufSize_); - } DataCopy(locationLocal_, samplingLocationGm_[locationOffsetGm], locBufSize_); - Duplicate(resLocal_, 0.0f, cAligned_); + Duplicate(resLocal_, static_cast(0.0f), cAligned_); for (uint32_t pointIdx = 0; pointIdx < numPoints_; ++pointIdx) { - if (!memoryFlag_) { - uint64_t weightOffsetGm = (batchIdx * numAnchors_ * numPoints_ + - anchorIdx * numPoints_ + pointIdx) * numCams_ * numScales_ * numGroups_; - SetFlag(0); - WaitFlag(0); - DataCopy(weightLocal_, weightsGm_[weightOffsetGm], weightBufSize_); - } - uint32_t weightBaseOffsetLocal = memoryFlag_ ? pointIdx * numCams_ * numScales_ * numGroups_ : 0; for (uint32_t camIdx = 0; camIdx < numCams_; ++camIdx) { uint32_t locationOffsetLocal = (pointIdx * numCams_ + camIdx) * 2; - DTYPE_F locW = locationLocal_.GetValue(locationOffsetLocal); + float locW = locationLocal_.GetValue(locationOffsetLocal); if (locW <= 0 || locW >= 1) { continue; } - DTYPE_F locH = locationLocal_.GetValue(locationOffsetLocal + 1); + float locH = locationLocal_.GetValue(locationOffsetLocal + 1); if (locH <= 0 || locH >= 1) { continue; } + uint64_t weightOffsetGm = (((batchIdx * numAnchors_ + anchorIdx) * numPoints_ + pointIdx) + * numCams_ + camIdx) * numScales_ * numGroups_; + DataCopy(weightLocal_, weightsGm_[weightOffsetGm], weightBufSize_); + SetFlag(0); + WaitFlag(0); + BroadCast(weightMulLocal_, weightLocal_, dstShape_, srcShape_); + SetFlag(0); + WaitFlag(0); + for (uint32_t i = 1; i < 4; ++i) { + Adds(weightMulLocal_[i * weightMulBufSize_], weightMulLocal_, static_cast(0.0f), weightMulBufSize_); + } for (uint32_t scaleIdx = 0; scaleIdx < numScales_; ++scaleIdx) { - uint32_t weightOffsetLocal = weightBaseOffsetLocal + (camIdx * numScales_ + scaleIdx) * numGroups_; + uint32_t localOffset = scaleIdx * cAligned_; uint32_t scaleStartOffset = camIdx * numScales_ + scaleIdx; uint32_t spatialShapeOffset = scaleStartOffset * 2; uint32_t scaleStartIdx = scaleStartLocal_.GetValue(scaleStartOffset); uint32_t valueOffset = (batchIdx * numFeats_ + scaleStartIdx) * numEmbeds_; + uint32_t localPtr1_ = v1Offset_ + localOffset; + uint32_t localPtr2_ = v2Offset_ + localOffset; + uint32_t localPtr3_ = v3Offset_ + localOffset; + uint32_t localPtr4_ = v4Offset_ + localOffset; + DTYPE_I h = spatialShapeLocal_.GetValue(spatialShapeOffset); DTYPE_I w = spatialShapeLocal_.GetValue(spatialShapeOffset + 1); - DTYPE_F hIm = locH * h - 0.5f; - DTYPE_F wIm = locW * w - 0.5f; + float hIm = locH * h - 0.5f; + float wIm = locW * w - 0.5f; - DTYPE_I hLow = ScalarCast(hIm); - DTYPE_I wLow = ScalarCast(wIm); + DTYPE_I hLow = ScalarCast(hIm); + DTYPE_I wLow = ScalarCast(wIm); DTYPE_I hHigh = hLow + 1; DTYPE_I wHigh = wLow + 1; - DTYPE_F lh = hIm - hLow; - DTYPE_F lw = wIm - wLow; - DTYPE_F hh = 1 - lh; - DTYPE_F hw = 1 - lw; + float lh = hIm - hLow; + float lw = wIm - wLow; + float hh = 1 - lh; + float hw = 1 - lw; DTYPE_I wStride = numEmbeds_; DTYPE_I hStride = w * wStride; @@ -158,49 +169,46 @@ public: DTYPE_I wLowPtrOffset = wLow * wStride; DTYPE_I wHighPtrOffset = wLowPtrOffset + wStride; - DTYPE_F w1 = hh * hw; - DTYPE_F w2 = hh * lw; - DTYPE_F w3 = lh * hw; - DTYPE_F w4 = lh * lw; - - Duplicate(vLocal_, 0.0f, 4 * cAligned_); - - SetFlag(0); - WaitFlag(0); + float w1 = hh * hw; + float w2 = hh * lw; + float w3 = lh * hw; + float w4 = lh * lw; if (hLow >= 0) { basePtr_ = valueOffset + hLowPtrOffset; if (wLow >= 0) { realPtr_ = basePtr_ + wLowPtrOffset; - DataCopy(vLocal_[v1Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_); + DataCopy(vLocal_[localPtr1_], mcMsFeatGm_[realPtr_], cAligned_); } if (wHigh <= w - 1) { realPtr_ = basePtr_ + wHighPtrOffset; - DataCopy(vLocal_[v2Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_); + DataCopy(vLocal_[localPtr2_], mcMsFeatGm_[realPtr_], cAligned_); } } - if (hHigh <= h - 1) { basePtr_ = valueOffset + hHighPtrOffset; if (wLow >= 0) { realPtr_ = basePtr_ + wLowPtrOffset; - DataCopy(vLocal_[v3Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_); + DataCopy(vLocal_[localPtr3_], mcMsFeatGm_[realPtr_], cAligned_); } if (wHigh <= w - 1) { realPtr_ = basePtr_ + wHighPtrOffset; - DataCopy(vLocal_[v4Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_); + DataCopy(vLocal_[localPtr4_], mcMsFeatGm_[realPtr_], cAligned_); } } - - SetFlag(0); - WaitFlag(0); - Muls(vLocal_[v1Offset_ * cAligned_], vLocal_[v1Offset_ * cAligned_], w1, cAligned_); - Axpy(vLocal_[v1Offset_ * cAligned_], vLocal_[v2Offset_ * cAligned_], w2, cAligned_); - Axpy(vLocal_[v1Offset_ * cAligned_], vLocal_[v3Offset_ * cAligned_], w3, cAligned_); - Axpy(vLocal_[v1Offset_ * cAligned_], vLocal_[v4Offset_ * cAligned_], w4, cAligned_); - - BroadCast(weightMulLocal_, weightLocal_[weightOffsetLocal], dstShape_, srcShape_); - MulAddDst(resLocal_, vLocal_[v1Offset_ * cAligned_], weightMulLocal_, cAligned_); + Muls(weightMulLocal_[localPtr1_], weightMulLocal_[localPtr1_], static_cast(w1), cAligned_); + Muls(weightMulLocal_[localPtr2_], weightMulLocal_[localPtr2_], static_cast(w2), cAligned_); + Muls(weightMulLocal_[localPtr3_], weightMulLocal_[localPtr3_], static_cast(w3), cAligned_); + Muls(weightMulLocal_[localPtr4_], weightMulLocal_[localPtr4_], static_cast(w4), cAligned_); + } + SetFlag(0); + WaitFlag(0); + Mul(weightMulLocal_, weightMulLocal_, vLocal_, 4 * weightMulBufSize_); + Duplicate(vLocal_, static_cast(0.0f), 4 * weightMulBufSize_); + SetFlag(0); + WaitFlag(0); + for (uint32_t i = 0; i < 4 * numScales_; ++i) { + Add(resLocal_, resLocal_, weightMulLocal_[i * cAligned_], cAligned_); } } } @@ -224,13 +232,12 @@ private: LocalTensor spatialShapeLocal_, scaleStartLocal_; LocalTensor vLocal_, weightMulLocal_, resLocal_; - bool memoryFlag_; uint32_t basePtr_, realPtr_; uint32_t coreNum_, curBlockIdx_; uint32_t taskNum_, taskNumPerCore_, startOffset_, endOffset_; - uint32_t weightBufSize_, locBufSize_, scaleStartBufSize_, spatialShapeBufSize_; + uint32_t weightBufSize_, locBufSize_, scaleStartBufSize_, spatialShapeBufSize_, weightMulBufSize_; uint32_t bs_, numFeats_, numEmbeds_, numAnchors_, numPoints_, numCams_, numScales_, numGroups_, numChannels_, cAligned_; - uint32_t blockAlign_ = 8; + uint32_t blockAlignFloat_, blockAlignInt_; uint32_t v1Offset_ = 0, v2Offset_ = 1, v3Offset_ = 2, v4Offset_ = 3; uint32_t srcShape_[2], dstShape_[2]; @@ -243,7 +250,7 @@ extern "C" __global__ __aicore__ void deformable_aggregation(GM_ADDR mc_ms_feat, { TPipe pipe; GET_TILING_DATA(tiling_data, tiling); - KernelDeformableAggregation op; + KernelDeformableAggregation op; op.Init(mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights, out, &tiling_data, &pipe); op.GetLocalTensor(); op.Process(); diff --git a/kernels/op_kernel/deformable_aggregation_grad.cpp b/kernels/op_kernel/deformable_aggregation_grad.cpp index ef4f0784..e7371449 100644 --- a/kernels/op_kernel/deformable_aggregation_grad.cpp +++ b/kernels/op_kernel/deformable_aggregation_grad.cpp @@ -6,6 +6,7 @@ #include "kernel_tiling/kernel_tiling.h" using namespace AscendC; +template class KernelDeformableAggregationGrad { public: __aicore__ inline KernelDeformableAggregationGrad() = delete; @@ -29,7 +30,6 @@ public: sampling_location, weights, grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights); InitBuffer(); - InitEvent(); } __aicore__ inline void Process(); @@ -46,7 +46,8 @@ private: if (coreId == usedCoreNum_ - 1) { totalTaskNum_ = tailWeightNum_; } - singleProcessTaskLen_ = tiling.singleProcessTaskLen; + singleProcessTaskLen_ = min(tiling.singleProcessTaskLen, totalTaskNum_); + singleProcessTaskLen_ = max(singleProcessTaskLen_, (uint32_t)1); taskRepeatTimes = (totalTaskNum_ - 1) / singleProcessTaskLen_ + 1; pts_ = tiling.numPoints; cam_ = tiling.numCams; @@ -56,6 +57,9 @@ private: numFeat = tiling.numFeat; numAnchors = tiling.numAnchors; totalGroups = numEmbeds / group_; + + blockSize_ = 32; + blockDataNum_ = blockSize_ / sizeof(DTYPE_F); } __aicore__ inline void InitGM(GM_ADDR mc_ms_feat, GM_ADDR spatial_shape, GM_ADDR scale_start_index, @@ -64,89 +68,84 @@ private: { int64_t samplingLocationOffset = taskOffset * pts_ * cam_ * 2; int64_t weightOffset = taskOffset * pts_ * cam_ * scale_ * group_; - mcMsFeatGm.SetGlobalBuffer((__gm__ float*)(mc_ms_feat)); + mcMsFeatGm.SetGlobalBuffer((__gm__ DTYPE_F*)(mc_ms_feat)); spatialShapeGm.SetGlobalBuffer((__gm__ int32_t*)(spatial_shape)); scaleStartLocationGm.SetGlobalBuffer((__gm__ int32_t*)(scale_start_index)); - samplingLocationGm.SetGlobalBuffer((__gm__ float*)(sampling_location) + samplingLocationOffset); - weightGm.SetGlobalBuffer((__gm__ float*)(weights) + weightOffset); - outputGradGm.SetGlobalBuffer((__gm__ float*)(grad_output) + taskOffset * numEmbeds); - gradMcMsFeatGm.SetGlobalBuffer((__gm__ float*)(grad_mc_ms_feat)); - gradSamplingLocationGm.SetGlobalBuffer((__gm__ float*)(grad_sampling_location) + samplingLocationOffset * 4); - gradWeightsGm.SetGlobalBuffer((__gm__ float*)(grad_weights) + weightOffset); + samplingLocationGm.SetGlobalBuffer((__gm__ DTYPE_F*)(sampling_location) + samplingLocationOffset); + weightGm.SetGlobalBuffer((__gm__ DTYPE_F*)(weights) + weightOffset); + outputGradGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_output) + taskOffset * numEmbeds); + gradMcMsFeatGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_mc_ms_feat)); + gradSamplingLocalGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_sampling_location) + samplingLocationOffset); + gradWeightsGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_weights) + weightOffset); } __aicore__ inline void InitBuffer() { - uint64_t singleWeightOffset = cam_ * scale_ * group_; + uint64_t singleWeightOffset = scale_ * group_; uint64_t samplingOffset = pts_ * cam_ * 2; - pipe_->InitBuffer(weightQue_, AlignUp(singleWeightOffset, B32_DATA_NUM_PER_BLOCK) * sizeof(float)); - pipe_->InitBuffer(gradOutputQue_, singleProcessTaskLen_ * numEmbeds * sizeof(float)); + pipe_->InitBuffer(weightQue_, AlignUp(singleWeightOffset, blockDataNum_) * sizeof(DTYPE_F)); + pipe_->InitBuffer(gradOutputQue_, singleProcessTaskLen_ * numEmbeds * sizeof(DTYPE_F)); pipe_->InitBuffer(scaleStartLocationQue_, AlignUp(cam_ * scale_, B32_DATA_NUM_PER_BLOCK) * sizeof(int32_t)); - pipe_->InitBuffer(samplingLocationQue_, AlignUp(samplingOffset, B32_DATA_NUM_PER_BLOCK) * sizeof(float)); - pipe_->InitBuffer(spatialShapeQue_, AlignUp(cam_ * scale_* 2, B32_DATA_NUM_PER_BLOCK) * sizeof(int32_t)); - pipe_->InitBuffer(topGradMcMsFeatQue_, 5 * numEmbeds * sizeof(float)); - pipe_->InitBuffer(vQue_, 4 * numEmbeds * sizeof(float)); - pipe_->InitBuffer(featureQue_, 4 * numEmbeds * sizeof(float)); - pipe_->InitBuffer(featureQue__, numEmbeds * sizeof(float)); - pipe_->InitBuffer(pointGradWeightQue_, 8 * numEmbeds * sizeof(float)); - pipe_->InitBuffer(gradSamplingQue_, 4 * samplingOffset * sizeof(float)); - pipe_->InitBuffer(sumTmp_, 8 * sizeof(float)); + pipe_->InitBuffer(samplingLocationQue_, AlignUp(samplingOffset, blockDataNum_) * sizeof(DTYPE_F)); + pipe_->InitBuffer(spatialShapeQue_, AlignUp(cam_ * scale_ * 2, B32_DATA_NUM_PER_BLOCK) * sizeof(int32_t)); + pipe_->InitBuffer(topGradMcMsFeatQue_, numEmbeds * sizeof(DTYPE_F)); + pipe_->InitBuffer(gradValueQue_, 4 * numEmbeds * sizeof(DTYPE_F)); + pipe_->InitBuffer(vQue_, 4 * numEmbeds * sizeof(DTYPE_F)); + pipe_->InitBuffer(featureQue_, scale_ * numEmbeds * sizeof(DTYPE_F)); + pipe_->InitBuffer(gradWeightsQue_, scale_ * group_ * sizeof(DTYPE_F)); + pipe_->InitBuffer(pointGradWeightQue_, 4 * numEmbeds * sizeof(DTYPE_F)); + pipe_->InitBuffer(gradSamplingQue_, blockDataNum_ * sizeof(DTYPE_F)); + pipe_->InitBuffer(pointGradQue_, 2 * numEmbeds * sizeof(DTYPE_F)); + pipe_->InitBuffer(weightBrobQue_, scale_ * numEmbeds * sizeof(DTYPE_F)); } - __aicore__ inline void InitEvent() + __aicore__ inline void Prepare() { - cpInEvtID_ = pipe_->FetchEventID(HardEvent::MTE2_V); - cpOutEvtID_ = pipe_->FetchEventID(HardEvent::MTE3_MTE2); - vToOutEvtID_ = pipe_->FetchEventID(HardEvent::V_MTE3); - vToMTE2EvtID_ = pipe_->FetchEventID(HardEvent::V_MTE2); - mte3ToVEvtID_ = pipe_->FetchEventID(HardEvent::MTE3_V); + int32_t scaleStartNum = AlignUp(cam_ * scale_, B32_DATA_NUM_PER_BLOCK); + int32_t spatialShapeNum = AlignUp(cam_ * scale_ * 2, B32_DATA_NUM_PER_BLOCK); + scaleStartLocation = scaleStartLocationQue_.Get(); + spatialShape = spatialShapeQue_.Get(); + weight = weightQue_.Get(); + gradOutput = gradOutputQue_.Get(); + samplingLocation = samplingLocationQue_.Get(); + + gradWeightsLocal = gradWeightsQue_.Get(); + gradSamplingLocal = gradSamplingQue_.Get(); + gradValueLocal = gradValueQue_.Get(); + + topGradMcMsFeatLocal = topGradMcMsFeatQue_.Get(); + vLocal = vQue_.Get(); + featureLocal = featureQue_.Get(); + pointGradWeightLocal = pointGradWeightQue_.Get(); + pointGradSum = pointGradQue_.Get(); + weightBrobLocal = weightBrobQue_.Get(); + + Duplicate(pointGradSum, (DTYPE_F)0, 2 * numEmbeds); + Duplicate(featureLocal, (DTYPE_F)0, scale_ * numEmbeds); + Duplicate(vLocal, (DTYPE_F)0, numEmbeds * 4); + + DataCopy(scaleStartLocation, scaleStartLocationGm, scaleStartNum); + DataCopy(spatialShape, spatialShapeGm, spatialShapeNum); } __aicore__ inline void ProcessSingle(uint64_t taskIdx, uint32_t actualWeightNum) { - uint64_t singleWeightOffset = cam_ * scale_ * group_; - uint32_t weightCopyLen = AlignUp(singleWeightOffset, B32_DATA_NUM_PER_BLOCK); - int32_t gradOuputNum = AlignUp(actualWeightNum * numEmbeds, B32_DATA_NUM_PER_BLOCK); - int32_t samplingLocationNum = AlignUp(pts_ * cam_ * 2, B32_DATA_NUM_PER_BLOCK); - int32_t scaleStartNum = AlignUp(cam_ * scale_, B32_DATA_NUM_PER_BLOCK); - int32_t spatialShapeNum = AlignUp(cam_ * scale_ * 2, B32_DATA_NUM_PER_BLOCK); + uint64_t singleWeightOffset = scale_ * group_; + uint32_t weightCopyLen = AlignUp(singleWeightOffset, blockDataNum_); + int32_t gradOuputNum = AlignUp(actualWeightNum * numEmbeds, blockDataNum_); + int32_t samplingLocationNum = AlignUp(pts_ * cam_ * 2, blockDataNum_); uint64_t gradOutputOffset = taskIdx * singleProcessTaskLen_ * numEmbeds; - LocalTensor weight = weightQue_.Get(); - LocalTensor gradOutput = gradOutputQue_.Get(); - LocalTensor samplingLocation = samplingLocationQue_.Get(); - LocalTensor scaleStartLocation = scaleStartLocationQue_.Get(); - LocalTensor spatialShape = spatialShapeQue_.Get(); - - LocalTensor topGradMcMsFeatLocal = topGradMcMsFeatQue_.Get(); - LocalTensor vLocal = vQue_.Get(); - LocalTensor featureLocal = featureQue_.Get(); - LocalTensor featureLocal_ = featureQue__.Get(); - LocalTensor pointGradWeightLocal = pointGradWeightQue_.Get(); - LocalTensor gradSamplingLocation = gradSamplingQue_.Get(); - LocalTensor tmpLocation = sumTmp_.Get(); - - SetFlag(vToMTE2EvtID_); - WaitFlag(vToMTE2EvtID_); + SetFlag(0); + WaitFlag(0); DataCopy(gradOutput, outputGradGm[gradOutputOffset], gradOuputNum); - - DataCopy(scaleStartLocation, scaleStartLocationGm, scaleStartNum); - DataCopy(spatialShape, spatialShapeGm, spatialShapeNum); + for (int32_t weightNumId = 0; weightNumId < actualWeightNum; weightNumId++) { int64_t curBatch = (taskOffset + taskIdx * singleProcessTaskLen_ + weightNumId) / numAnchors; int64_t featOffset = curBatch * numFeat * numEmbeds; uint64_t samplingLocationOffset = (taskIdx * singleProcessTaskLen_ + weightNumId) * pts_ * cam_ * 2; DataCopy(samplingLocation, samplingLocationGm[samplingLocationOffset], samplingLocationNum); - SetFlag(mte3ToVEvtID_); - WaitFlag(mte3ToVEvtID_); - Duplicate(gradSamplingLocation, (float)0, pts_ * cam_ * 8); for (int32_t ptsId = 0; ptsId < pts_; ptsId++) { - uint64_t weightGmOffset = ((taskIdx * singleProcessTaskLen_ + weightNumId) * pts_ + ptsId) * singleWeightOffset; - SetFlag(vToMTE2EvtID_); - WaitFlag(vToMTE2EvtID_); - DataCopy(weight, weightGm[weightGmOffset], weightCopyLen); - SetFlag(cpInEvtID_); - WaitFlag(cpInEvtID_); for (int32_t camId = 0; camId < cam_; camId++) { int32_t locOffset = ptsId * cam_ + camId; float locW = samplingLocation.GetValue(locOffset * 2); @@ -154,6 +153,16 @@ private: if (locW <= 0 || locW >= 1 || locH <=0 || locH >=1) { continue; } + uint64_t weightGmOffset = (((taskIdx * singleProcessTaskLen_ + weightNumId) * pts_ + ptsId) * cam_ + camId) * singleWeightOffset; + uint64_t samplingLocationCopyOutOffset = samplingLocationOffset + (ptsId * cam_ + camId) * 2; + DataCopy(weight, weightGm[weightGmOffset], weightCopyLen); + SetFlag(0); + WaitFlag(0); + uint32_t dstShape_[2] = {scale_ * group_, totalGroups}; + uint32_t srcShape_[2] = {scale_ * group_, 1}; + BroadCast(weightBrobLocal, weight, dstShape_, srcShape_); + SetFlag(0); + WaitFlag(0); for (int32_t scaleId = 0; scaleId < scale_; scaleId++) { int32_t scaleStartOffset = camId * scale_ + scaleId; int32_t scaleStartIdx = scaleStartLocation.GetValue(scaleStartOffset); @@ -185,118 +194,112 @@ private: uint64_t ptr3 = featureOffset + hHighPtrOffset + wLowPtrOffset; uint64_t ptr4 = featureOffset + hHighPtrOffset + wHighPtrOffset; - uint64_t weightOffset = (camId * scale_ + scaleId) * group_; + uint64_t weightOffset = scaleId * numEmbeds; uint64_t gradOuputBaseOffset = weightNumId * numEmbeds; - uint32_t dstShape_[2] = {group_, totalGroups}; - uint32_t srcShape_[2] = {group_, 1}; - - Duplicate(vLocal, (float)0, numEmbeds * 4); - SetFlag(vToMTE2EvtID_); - WaitFlag(vToMTE2EvtID_); + SetFlag(0); + WaitFlag(0); - SetFlag(mte3ToVEvtID_); - WaitFlag(mte3ToVEvtID_); + Mul(topGradMcMsFeatLocal, weightBrobLocal[weightOffset], gradOutput[gradOuputBaseOffset], numEmbeds); + Muls(gradValueLocal, topGradMcMsFeatLocal, static_cast(w1), numEmbeds); + Muls(gradValueLocal[numEmbeds * 1], topGradMcMsFeatLocal, static_cast(w2), numEmbeds); + Muls(gradValueLocal[numEmbeds * 2], topGradMcMsFeatLocal, static_cast(w3), numEmbeds); + Muls(gradValueLocal[numEmbeds * 3], topGradMcMsFeatLocal, static_cast(w4), numEmbeds); - BroadCast(topGradMcMsFeatLocal, weight[weightOffset], dstShape_, srcShape_); - Mul(topGradMcMsFeatLocal, topGradMcMsFeatLocal, gradOutput[gradOuputBaseOffset], numEmbeds); - Muls(topGradMcMsFeatLocal[numEmbeds], topGradMcMsFeatLocal, w1, numEmbeds); - Muls(topGradMcMsFeatLocal[numEmbeds * 2], topGradMcMsFeatLocal, w2, numEmbeds); - Muls(topGradMcMsFeatLocal[numEmbeds * 3], topGradMcMsFeatLocal, w3, numEmbeds); - Muls(topGradMcMsFeatLocal[numEmbeds * 4], topGradMcMsFeatLocal, w4, numEmbeds); + SetFlag(0); + WaitFlag(0); - SetFlag(vToOutEvtID_); - WaitFlag(vToOutEvtID_); - - SetAtomicAdd(); + SetAtomicAdd(); if (hLow >= 0 && wLow >=0) { - DataCopy(gradMcMsFeatGm[featOffset + ptr1], topGradMcMsFeatLocal[numEmbeds], numEmbeds); + DataCopy(gradMcMsFeatGm[featOffset + ptr1], gradValueLocal, numEmbeds); DataCopy(vLocal, mcMsFeatGm[featOffset + ptr1], numEmbeds); } if (hLow >= 0 && wHigh <= w - 1) { - DataCopy(gradMcMsFeatGm[featOffset + ptr2], topGradMcMsFeatLocal[numEmbeds * 2], numEmbeds); + DataCopy(gradMcMsFeatGm[featOffset + ptr2], gradValueLocal[numEmbeds * 1], numEmbeds); DataCopy(vLocal[numEmbeds], mcMsFeatGm[featOffset + ptr2], numEmbeds); } if (hHigh <= h - 1 && wLow >= 0) { - DataCopy(gradMcMsFeatGm[featOffset + ptr3], topGradMcMsFeatLocal[numEmbeds * 3], numEmbeds); + DataCopy(gradMcMsFeatGm[featOffset + ptr3], gradValueLocal[numEmbeds * 2], numEmbeds); DataCopy(vLocal[numEmbeds * 2], mcMsFeatGm[featOffset + ptr3], numEmbeds); } if (hHigh <= h - 1 && wHigh <= w - 1) { - DataCopy(gradMcMsFeatGm[featOffset + ptr4], topGradMcMsFeatLocal[numEmbeds * 4], numEmbeds); + DataCopy(gradMcMsFeatGm[featOffset + ptr4], gradValueLocal[numEmbeds * 3], numEmbeds); DataCopy(vLocal[numEmbeds * 3], mcMsFeatGm[featOffset + ptr4], numEmbeds); } SetAtomicNone(); - SetFlag(cpInEvtID_); - WaitFlag(cpInEvtID_); + SetFlag(0); + WaitFlag(0); - Muls(featureLocal, vLocal, w1, numEmbeds); - Muls(featureLocal[numEmbeds], vLocal[numEmbeds], w2, numEmbeds); - Muls(featureLocal[numEmbeds * 2], vLocal[numEmbeds * 2], w3, numEmbeds); - Muls(featureLocal[numEmbeds * 3], vLocal[numEmbeds * 3], w4, numEmbeds); - Add(featureLocal, featureLocal, featureLocal[numEmbeds], numEmbeds); - Add(featureLocal[numEmbeds * 2], featureLocal[numEmbeds * 2], featureLocal[numEmbeds * 3], numEmbeds); - Add(featureLocal, featureLocal, featureLocal[numEmbeds * 2], numEmbeds); - Mul(featureLocal, featureLocal, gradOutput[gradOuputBaseOffset], numEmbeds); + Muls(featureLocal[weightOffset], vLocal, static_cast(w1), numEmbeds); + Axpy(featureLocal[weightOffset], vLocal[numEmbeds], static_cast(w2), numEmbeds); + Axpy(featureLocal[weightOffset], vLocal[numEmbeds * 2], static_cast(w3), numEmbeds); + Axpy(featureLocal[weightOffset], vLocal[numEmbeds * 3], static_cast(w4), numEmbeds); + Mul(featureLocal[weightOffset], featureLocal[weightOffset], gradOutput[gradOuputBaseOffset], numEmbeds); - SetFlag(mte3ToVEvtID_); - WaitFlag(mte3ToVEvtID_); + Sub(pointGradWeightLocal, vLocal[numEmbeds * 1], vLocal, numEmbeds); + Sub(pointGradWeightLocal[numEmbeds * 2], vLocal[numEmbeds * 3], vLocal[numEmbeds * 2], numEmbeds); - Sum(featureLocal_, featureLocal, {group_, totalGroups, totalGroups}); + Sub(pointGradWeightLocal[numEmbeds * 1], vLocal[numEmbeds * 2], vLocal, numEmbeds); + Sub(pointGradWeightLocal[numEmbeds * 3], vLocal[numEmbeds * 3], vLocal[numEmbeds * 1], numEmbeds); + Duplicate(vLocal, (DTYPE_F)0, numEmbeds * 4); - SetFlag(vToOutEvtID_); - WaitFlag(vToOutEvtID_); + SetFlag(0); + WaitFlag(0); - SetAtomicAdd(); - DataCopy(gradWeightsGm[weightGmOffset + weightOffset], featureLocal_, group_); - SetAtomicNone(); + Muls(pointGradWeightLocal, pointGradWeightLocal, static_cast(hh), numEmbeds); + Axpy(pointGradWeightLocal, pointGradWeightLocal[numEmbeds * 2], static_cast(lh), numEmbeds); + + Muls(pointGradWeightLocal[numEmbeds * 1], pointGradWeightLocal[numEmbeds * 1], static_cast(hw), numEmbeds); + Axpy(pointGradWeightLocal[numEmbeds * 1], pointGradWeightLocal[numEmbeds * 3], static_cast(lw), numEmbeds); - Muls(pointGradWeightLocal, vLocal, hw, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds * 2], vLocal[numEmbeds], lw, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds * 4], vLocal[numEmbeds * 2], hw, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds * 6], vLocal[numEmbeds * 3], lw, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds], vLocal, hh, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds * 3], vLocal[numEmbeds], hh, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds * 5], vLocal[numEmbeds * 2], lh, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds * 7], vLocal[numEmbeds * 3], lh, numEmbeds); - Sub(pointGradWeightLocal[numEmbeds * 4], pointGradWeightLocal[numEmbeds * 4], pointGradWeightLocal, numEmbeds); - Sub(pointGradWeightLocal[numEmbeds * 6], pointGradWeightLocal[numEmbeds * 6], pointGradWeightLocal[numEmbeds * 2], numEmbeds); - Sub(pointGradWeightLocal[numEmbeds * 3], pointGradWeightLocal[numEmbeds * 3], pointGradWeightLocal[numEmbeds], numEmbeds); - Sub(pointGradWeightLocal[numEmbeds * 7], pointGradWeightLocal[numEmbeds * 7], pointGradWeightLocal[numEmbeds * 5], numEmbeds); - Add(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds * 4], pointGradWeightLocal[numEmbeds * 6], numEmbeds); - Add(pointGradWeightLocal, pointGradWeightLocal[numEmbeds * 3], pointGradWeightLocal[numEmbeds * 7], numEmbeds); Mul(pointGradWeightLocal, pointGradWeightLocal, topGradMcMsFeatLocal, numEmbeds); Mul(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds], topGradMcMsFeatLocal, numEmbeds); - Muls(pointGradWeightLocal, pointGradWeightLocal, (float)w, numEmbeds); - Muls(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds], (float)h, numEmbeds); - Sum(tmpLocation, pointGradWeightLocal, {2, numEmbeds, numEmbeds}); - Add(gradSamplingLocation[locOffset * 8], gradSamplingLocation[locOffset * 8], tmpLocation, 8); + Muls(pointGradWeightLocal, pointGradWeightLocal, (DTYPE_F)w, numEmbeds); + Muls(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds], (DTYPE_F)h, numEmbeds); + + Add(pointGradSum, pointGradSum, pointGradWeightLocal, numEmbeds * 2); } + SetFlag(0); + WaitFlag(0); + Sum(gradWeightsLocal, featureLocal, {scale_ * group_, totalGroups, totalGroups}); + Sum(gradSamplingLocal, pointGradSum, {2, numEmbeds, numEmbeds}); + SetFlag(0); + WaitFlag(0); + Duplicate(featureLocal, (DTYPE_F)0, scale_ * numEmbeds); + Duplicate(pointGradSum, (DTYPE_F)0, 2 * numEmbeds); + DataCopyExtParams locationCopyParams {1, (uint32_t)(2 * sizeof(DTYPE_F)), 0, 0, 0}; + DataCopyExtParams weightsCopyParams {1, (uint32_t)(scale_ * group_ * sizeof(DTYPE_F)), 0, 0, 0}; + DataCopyPad(gradSamplingLocalGm[samplingLocationCopyOutOffset], gradSamplingLocal, locationCopyParams); + DataCopyPad(gradWeightsGm[weightGmOffset], gradWeightsLocal, weightsCopyParams); } } - SetFlag(vToOutEvtID_); - WaitFlag(vToOutEvtID_); - DataCopyExtParams copyParams {1, (uint32_t)(pts_ * cam_ * 8 * sizeof(float)), 0, 0, 0}; - DataCopyPad(gradSamplingLocationGm[samplingLocationOffset * 4], gradSamplingLocation, copyParams); } } private: TPipe* pipe_; - GlobalTensor mcMsFeatGm, samplingLocationGm, weightGm, outputGradGm; - GlobalTensor gradMcMsFeatGm, gradSamplingLocationGm, gradWeightsGm; + GlobalTensor mcMsFeatGm, samplingLocationGm, weightGm, outputGradGm; + GlobalTensor gradMcMsFeatGm, gradSamplingLocalGm, gradWeightsGm; GlobalTensor spatialShapeGm, scaleStartLocationGm; TBuf weightQue_, gradOutputQue_, samplingLocationQue_, scaleStartLocationQue_, spatialShapeQue_; - TBuf topGradMcMsFeatQue_, vQue_, featureQue_, featureQue__, pointGradWeightQue_, gradSamplingQue_, sumTmp_; + TBuf gradWeightsQue_, gradSamplingQue_, gradValueQue_; + TBuf topGradMcMsFeatQue_, vQue_, featureQue_, pointGradWeightQue_, pointGradQue_, weightBrobQue_; + LocalTensor scaleStartLocation, spatialShape; + LocalTensor weight, gradOutput, samplingLocation; + LocalTensor gradWeightsLocal, gradSamplingLocal, gradValueLocal; + LocalTensor topGradMcMsFeatLocal, vLocal, featureLocal, pointGradWeightLocal, pointGradSum, weightBrobLocal; uint32_t usedCoreNum_, avgWeightNum_, tailWeightNum_, coreId; uint32_t totalTaskNum_, singleProcessTaskLen_, taskRepeatTimes; uint32_t pts_, cam_, scale_, group_, numEmbeds, numFeat, numAnchors, totalGroups; + uint32_t blockSize_, blockDataNum_; int64_t taskOffset; - TEventID cpInEvtID_, cpOutEvtID_, vToOutEvtID_, vToMTE2EvtID_, mte3ToVEvtID_; }; -__aicore__ inline void KernelDeformableAggregationGrad::Process() +template +__aicore__ inline void KernelDeformableAggregationGrad::Process() { + Prepare(); for (uint32_t i = 0; i < taskRepeatTimes; ++i) { uint32_t actualWeightNum = singleProcessTaskLen_; if (unlikely(i == taskRepeatTimes - 1)) { @@ -321,7 +324,7 @@ extern "C" __global__ __aicore__ void deformable_aggregation_grad( { GET_TILING_DATA(tiling_data, tiling); TPipe pipe; - KernelDeformableAggregationGrad op( + KernelDeformableAggregationGrad op( mc_ms_feat, spatial_shape, scale_start_index, @@ -336,4 +339,3 @@ extern "C" __global__ __aicore__ void deformable_aggregation_grad( ); op.Process(); } - \ No newline at end of file diff --git a/model_examples/SparseDrive/README.md b/model_examples/SparseDrive/README.md index 831f8295..299ecb7a 100644 --- a/model_examples/SparseDrive/README.md +++ b/model_examples/SparseDrive/README.md @@ -169,9 +169,9 @@ SparseDrive是一种基于稀疏化表征的端到端自动驾驶模型,基于 | 阶段 | 芯片 | 卡数 | global batch size | FPS | 平均step耗时(s) | amota | L2 | |:---------:|:---------------:|------|:------------------:|:----:|:--------------:|:---:|:---:| | stage1 | 竞品A | 8p | 64 | 41.0 | 1.561 | 0.3764 | - | -| stage1 | Atlas 800T A2 | 8p | 64 | 40.6 | 1.576 | 0.3864 | - | +| stage1 | Atlas 800T A2 | 8p | 64 | 46.3 | 1.382 | 0.3864 | - | | stage2 | 竞品A | 8p | 48 | 35.2 | 1.363 | - | 0.6280 | -| stage2 | Atlas 800T A2 | 8p | 48 | 33.2 | 1.445 | - | 0.6069 | +| stage2 | Atlas 800T A2 | 8p | 48 | 37.9 | 1.265 | - | 0.6069 | @@ -179,6 +179,8 @@ SparseDrive是一种基于稀疏化表征的端到端自动驾驶模型,基于 ## 变更 2025.04.27:首次发布。 +2025.07.07:瓶颈算子优化,刷新性能数据 + ## FAQ 暂无。 diff --git a/mx_driving/ops/npu_deformable_aggregation.py b/mx_driving/ops/npu_deformable_aggregation.py index d5ccbbb3..1940802d 100644 --- a/mx_driving/ops/npu_deformable_aggregation.py +++ b/mx_driving/ops/npu_deformable_aggregation.py @@ -21,11 +21,11 @@ class AdsDeformableAggregation(Function): if (torch.numel(mc_ms_feat) == 0 or torch.numel(weights) == 0): raise Exception("Erorr! Input Tensor can not be a empty Tensor.\n") - mc_ms_feat = mc_ms_feat.contiguous().float() + mc_ms_feat = mc_ms_feat.contiguous() spatial_shape = spatial_shape.contiguous().int() scale_start_index = scale_start_index.contiguous().int() - sampling_location = sampling_location.contiguous().float() - weights = weights.contiguous().float() + sampling_location = sampling_location.contiguous() + weights = weights.contiguous() output = mx_driving._C.npu_deformable_aggregation( mc_ms_feat, @@ -55,15 +55,14 @@ class AdsDeformableAggregation(Function): if (torch.numel(mc_ms_feat) == 0 or torch.numel(spatial_shape) == 0 or torch.numel(sampling_location) == 0): raise Exception("Erorr! Input Tensor can not be a empty Tensor.\n") - npu_device = mc_ms_feat.device - mc_ms_feat = mc_ms_feat.contiguous().float() + mc_ms_feat = mc_ms_feat.contiguous() spatial_shape = spatial_shape.contiguous().int() scale_start_index = scale_start_index.contiguous().int() - sampling_location = sampling_location.contiguous().float() - weights = weights.contiguous().float() + sampling_location = sampling_location.contiguous() + weights = weights.contiguous() grad_mc_ms_feat = torch.zeros_like(mc_ms_feat) - grad_sampling_location_padding = torch.zeros((sampling_location.shape[0], sampling_location.shape[1], sampling_location.shape[2], sampling_location.shape[3], 8), device=npu_device) + grad_sampling_location = torch.zeros_like(sampling_location) grad_weights = torch.zeros_like(weights) grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_backward( mc_ms_feat, @@ -73,10 +72,9 @@ class AdsDeformableAggregation(Function): weights, grad_output.contiguous(), grad_mc_ms_feat, - grad_sampling_location_padding, + grad_sampling_location, grad_weights, ) - grad_sampling_location, _, _, _ = torch.chunk(grad_sampling_location_padding, 4, dim=-1) return ( grad_mc_ms_feat, None, -- Gitee