From 2e5403353281d1a093a8faba28b0630b84146749 Mon Sep 17 00:00:00 2001
From: huangyuan <huangyuan64@huawei.com>
Date: Mon, 12 May 2025 15:50:48 +0800
Subject: [PATCH] deformable aggregation ops optim and fp16 adapt

---
 kernels/op_host/deformable_aggregation.cpp    |  51 ++--
 .../op_host/deformable_aggregation_grad.cpp   |  62 ++--
 .../op_host/deformable_aggregation_tiling.h   |   1 -
 kernels/op_kernel/deformable_aggregation.cpp  | 137 ++++-----
 .../op_kernel/deformable_aggregation_grad.cpp | 264 +++++++++---------
 model_examples/SparseDrive/README.md          |   6 +-
 mx_driving/ops/npu_deformable_aggregation.py  |  18 +-
 7 files changed, 272 insertions(+), 267 deletions(-)

diff --git a/kernels/op_host/deformable_aggregation.cpp b/kernels/op_host/deformable_aggregation.cpp
index 71233b6d..0f13236b 100644
--- a/kernels/op_host/deformable_aggregation.cpp
+++ b/kernels/op_host/deformable_aggregation.cpp
@@ -23,6 +23,7 @@ namespace {
 constexpr uint32_t SINGLE = 1;
 constexpr uint32_t BYTE_BLOCK = 32;
 constexpr uint32_t SIZE_OF_FP32 = 4;
+constexpr uint32_t SIZE_OF_FP16 = 2;
 constexpr uint32_t BATCH_SIZE_IDX = 0;
 constexpr uint32_t FEAT_IDX = 1;
 constexpr uint32_t EMBEDS_IDX = 2;
@@ -53,8 +54,6 @@ static ge::graphStatus TilingForDeformableAggregation(gert::TilingContext* conte
         return ge::GRAPH_FAILED;
     }
 
-    auto dtype = context->GetInputDesc(0)->GetDataType();
-
     auto attrs = context->GetAttrs();
     if (attrs == nullptr) {
         return ge::GRAPH_FAILED;
@@ -76,16 +75,11 @@ static ge::graphStatus TilingForDeformableAggregation(gert::TilingContext* conte
     auto numScales = getAttr(SCALE_IDX);
     auto numGroups = getAttr(GROUPS_IDX);
 
-    uint32_t alignNum = BYTE_BLOCK / SIZE_OF_FP32;
+    bool dtype = context->GetInputDesc(0)->GetDataType() == ge::DT_FLOAT;
+    uint32_t dataByteNum = dtype ? SIZE_OF_FP32 : SIZE_OF_FP16;
+    uint32_t alignNum = BYTE_BLOCK / dataByteNum;
     uint32_t cAligned = CeilAlign(static_cast<uint32_t>(numEmbeds), alignNum);
 
-    uint64_t ubSize;
-    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize);
-    // 计算除weightBuf_所占空间以外的其他ub大小，并流出预留量(16 * 1024)
-    uint64_t usedUbSize = (16 * 1024 + 6 * cAligned + numPoints * numCams * 2 + numCams * numScales * 3) * SIZE_OF_FP32;
-    // 判断weightBuf_是否能放下包括numPoints大小的数据，分情况在不同位置进行数据搬运
-    bool memoryFlag = (ubSize - usedUbSize) > numPoints * numCams * numScales * numGroups * SIZE_OF_FP32;
-
     context->SetBlockDim(coreNum);
 
     tiling.set_bs(bs);
@@ -97,7 +91,6 @@ static ge::graphStatus TilingForDeformableAggregation(gert::TilingContext* conte
     tiling.set_numScales(numScales);
     tiling.set_numGroups(numGroups);
     tiling.set_cAligned(cAligned);
-    tiling.set_memoryFlag(memoryFlag);
     tiling.set_coreNum(coreNum);
 
     if (context->GetRawTilingData() == nullptr) {
@@ -151,34 +144,34 @@ public:
     {
         this->Input("mc_ms_feat")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("spatial_shape")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_INT32})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_INT32, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("scale_start_index")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_INT32})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_INT32, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("sampling_location")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("weights")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Output("out")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
 
         this->Attr("batch_size").AttrType(REQUIRED).Int();
         this->Attr("num_feat").AttrType(REQUIRED).Int();
diff --git a/kernels/op_host/deformable_aggregation_grad.cpp b/kernels/op_host/deformable_aggregation_grad.cpp
index caab5fe6..80ed3211 100644
--- a/kernels/op_host/deformable_aggregation_grad.cpp
+++ b/kernels/op_host/deformable_aggregation_grad.cpp
@@ -25,6 +25,7 @@ namespace {
 constexpr uint32_t SINGLE = 1;
 constexpr uint32_t BYTE_BLOCK = 32;
 constexpr uint32_t SIZE_OF_FP32 = 4;
+constexpr uint32_t SIZE_OF_FP16 = 2;
 
 const uint32_t INPUT_FEAT = 0;
 const uint32_t INPUT_SPATIAL_SHAPE = 1;
@@ -88,10 +89,13 @@ static ge::graphStatus TilingForDeformableAggregationGrad(gert::TilingContext* c
     uint32_t tailWeightNum = Tail(totalTask, avgWeightNum);
     usedCoreNum = Ceil(totalTask, avgWeightNum);
 
+    bool dtype = context->GetInputDesc(INPUT_FEAT)->GetDataType() == ge::DT_FLOAT;
+    uint32_t dataTypeSize = dtype ? SIZE_OF_FP32 : SIZE_OF_FP16;
+    
     uint64_t ubSize;
     ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize);
-    uint64_t usedUbSize = (10 * 1024 + 22 * numEmbeds + numCams * numScale * numGroups + numPoints * numCams * 10) * SIZE_OF_FP32;
-    uint32_t singleProcessTaskLen = (ubSize - usedUbSize) / SIZE_OF_FP32 / (numEmbeds);
+    uint64_t usedUbSize = (10 * 1024 + 15 * numEmbeds + 2 * numScale * numEmbeds + 2 * numScale * numGroups + 2 * numPoints * numCams) * dataTypeSize;
+    uint32_t singleProcessTaskLen = (ubSize - usedUbSize) / dataTypeSize / numEmbeds;
 
     context->SetBlockDim(usedCoreNum);
     tiling.set_usedCoreNum(usedCoreNum);
@@ -155,49 +159,49 @@ public:
     {
         this->Input("mc_ms_feat")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("spatial_shape")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_INT32})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_INT32, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("scale_start_index")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_INT32})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_INT32, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("sampling_location")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("weights")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Input("grad_output")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
        this->Output("grad_mc_ms_feat")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Output("grad_sampling_location")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Output("grad_weights")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
 
         this->SetInferShape(ge::InferShapeForDeformableAggregationGrad)
             .SetInferDataType(ge::InferDataTypeForDeformableAggregationGrad);
diff --git a/kernels/op_host/deformable_aggregation_tiling.h b/kernels/op_host/deformable_aggregation_tiling.h
index a14c7816..93d40c23 100644
--- a/kernels/op_host/deformable_aggregation_tiling.h
+++ b/kernels/op_host/deformable_aggregation_tiling.h
@@ -16,7 +16,6 @@ TILING_DATA_FIELD_DEF(uint32_t, numCams);
 TILING_DATA_FIELD_DEF(uint32_t, numScales);
 TILING_DATA_FIELD_DEF(uint32_t, numGroups);
 TILING_DATA_FIELD_DEF(uint32_t, cAligned);
-TILING_DATA_FIELD_DEF(uint32_t, memoryFlag);
 TILING_DATA_FIELD_DEF(uint32_t, coreNum);
 END_TILING_DATA_DEF;
 
diff --git a/kernels/op_kernel/deformable_aggregation.cpp b/kernels/op_kernel/deformable_aggregation.cpp
index 70152a96..df85cd65 100644
--- a/kernels/op_kernel/deformable_aggregation.cpp
+++ b/kernels/op_kernel/deformable_aggregation.cpp
@@ -23,20 +23,28 @@ public:
         numScales_ = tiling_data->numScales;
         numGroups_ = tiling_data->numGroups;
         cAligned_ = tiling_data->cAligned;
-        memoryFlag_ = tiling_data->memoryFlag;
         coreNum_ = tiling_data->coreNum;
         numChannels_ = numEmbeds_ / numGroups_;
 
-        weightBufSize_ = memoryFlag_ ? numPoints_ * numCams_ * numScales_ * numGroups_ : numCams_ * numScales_ * numGroups_;
-        weightBufSize_ = AlignUp(weightBufSize_, blockAlign_);
-        locBufSize_ = AlignUp(numPoints_ * numCams_ * 2, blockAlign_);
-        scaleStartBufSize_ = AlignUp(numCams_ * numScales_, blockAlign_);
-        spatialShapeBufSize_ = AlignUp(numCams_ * numScales_ * 2, blockAlign_);
+        uint32_t blockSize_ = 32;
+        blockAlignFloat_ = blockSize_ / sizeof(DTYPE_F);
+        blockAlignInt_ = blockSize_ / sizeof(DTYPE_I);
+
+        weightBufSize_ = AlignUp(numScales_ * numGroups_, blockAlignFloat_);
+        locBufSize_ = AlignUp(numPoints_ * numCams_ * 2, blockAlignFloat_);
+        scaleStartBufSize_ = AlignUp(numCams_ * numScales_, blockAlignInt_);
+        spatialShapeBufSize_ = AlignUp(numCams_ * numScales_ * 2, blockAlignInt_);
+        weightMulBufSize_ = numScales_ * cAligned_;
+
+        v1Offset_ = 0 * weightMulBufSize_;
+        v2Offset_ = 1 * weightMulBufSize_;
+        v3Offset_ = 2 * weightMulBufSize_;
+        v4Offset_ = 3 * weightMulBufSize_;
 
         copyOutParams_ = {1, static_cast<uint32_t>(numEmbeds_ * sizeof(DTYPE_F)), 0, 0, 0};
-        srcShape_[0] = numGroups_;
+        srcShape_[0] = numScales_ * numGroups_;
         srcShape_[1] = 1;
-        dstShape_[0] = numGroups_;
+        dstShape_[0] = numScales_ * numGroups_;
         dstShape_[1] = numChannels_;
 
         ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
@@ -62,8 +70,8 @@ public:
         pipe_->InitBuffer(locationBuf_, locBufSize_ * sizeof(DTYPE_F));
         pipe_->InitBuffer(scaleStartBuf_, scaleStartBufSize_ * sizeof(DTYPE_I));
         pipe_->InitBuffer(spatialShapeBuf_, spatialShapeBufSize_ * sizeof(DTYPE_I));
-        pipe_->InitBuffer(vBuf_, 4 * cAligned_ * sizeof(DTYPE_F));
-        pipe_->InitBuffer(weightMulBuf_, cAligned_ * sizeof(DTYPE_F));
+        pipe_->InitBuffer(vBuf_, 4 * weightMulBufSize_ * sizeof(DTYPE_F));
+        pipe_->InitBuffer(weightMulBuf_, 4 * weightMulBufSize_ * sizeof(DTYPE_F));
         pipe_->InitBuffer(resBuf_, cAligned_ * sizeof(DTYPE_F));
 
         weightLocal_ = weightBuf_.Get<DTYPE_F>();
@@ -86,8 +94,10 @@ public:
             endOffset_ = taskNum_;
         }
 
+        Duplicate(vLocal_, static_cast<DTYPE_F>(0.0f), 4 * weightMulBufSize_);
         DataCopy(scaleStartLocal_, scaleStartIndexGm_, scaleStartBufSize_);
         DataCopy(spatialShapeLocal_, spatialShapesGm_, spatialShapeBufSize_);
+
         for (uint32_t taskIdx = startOffset_; taskIdx < endOffset_; ++taskIdx) {
             ComputeAndCopyOut(taskIdx);
         }
@@ -100,56 +110,57 @@ public:
         uint64_t refOffsetGm = (batchIdx * numAnchors_ + anchorIdx) * numEmbeds_;
         uint64_t locationOffsetGm = (batchIdx * numAnchors_ +
                                      anchorIdx) * numPoints_ * numCams_ * 2;
-        if (memoryFlag_) {
-            uint64_t weightOffsetGm = (batchIdx * numAnchors_ +
-                                       anchorIdx) * numPoints_ * numCams_ * numScales_ * numGroups_;
-            SetFlag<HardEvent::V_MTE2>(0);
-            WaitFlag<HardEvent::V_MTE2>(0);
-            DataCopy(weightLocal_, weightsGm_[weightOffsetGm], weightBufSize_);
-        }
         DataCopy(locationLocal_, samplingLocationGm_[locationOffsetGm], locBufSize_);
-        Duplicate(resLocal_, 0.0f, cAligned_);
+        Duplicate(resLocal_, static_cast<DTYPE_F>(0.0f), cAligned_);
         for (uint32_t pointIdx = 0; pointIdx < numPoints_; ++pointIdx) {
-            if (!memoryFlag_) {
-                uint64_t weightOffsetGm = (batchIdx * numAnchors_ * numPoints_ +
-                                           anchorIdx * numPoints_ + pointIdx) * numCams_ * numScales_ * numGroups_;
-                SetFlag<HardEvent::V_MTE2>(0);
-                WaitFlag<HardEvent::V_MTE2>(0);
-                DataCopy(weightLocal_, weightsGm_[weightOffsetGm], weightBufSize_);
-            }
-            uint32_t weightBaseOffsetLocal = memoryFlag_ ? pointIdx * numCams_ * numScales_ * numGroups_ : 0;
             for (uint32_t camIdx = 0; camIdx < numCams_; ++camIdx) {
                 uint32_t locationOffsetLocal = (pointIdx * numCams_ + camIdx) * 2;
-                DTYPE_F locW = locationLocal_.GetValue(locationOffsetLocal);
+                float locW = locationLocal_.GetValue(locationOffsetLocal);
                 if (locW <= 0 || locW >= 1) {
                     continue;
                 }
-                DTYPE_F locH = locationLocal_.GetValue(locationOffsetLocal + 1);
+                float locH = locationLocal_.GetValue(locationOffsetLocal + 1);
                 if (locH <= 0 || locH >= 1) {
                     continue;
                 }
+                uint64_t weightOffsetGm = (((batchIdx * numAnchors_ + anchorIdx) * numPoints_ + pointIdx)
+                                          * numCams_ + camIdx) * numScales_ * numGroups_;
+                DataCopy(weightLocal_, weightsGm_[weightOffsetGm], weightBufSize_);
+                SetFlag<HardEvent::MTE2_V>(0);
+                WaitFlag<HardEvent::MTE2_V>(0);
+                BroadCast<DTYPE_F, 2, 1>(weightMulLocal_, weightLocal_, dstShape_, srcShape_);
+                SetFlag<HardEvent::V_MTE2>(0);
+                WaitFlag<HardEvent::V_MTE2>(0);
+                for (uint32_t i = 1; i < 4; ++i) {
+                    Adds(weightMulLocal_[i * weightMulBufSize_], weightMulLocal_, static_cast<DTYPE_F>(0.0f), weightMulBufSize_);
+                }
                 for (uint32_t scaleIdx = 0; scaleIdx < numScales_; ++scaleIdx) {
-                    uint32_t weightOffsetLocal = weightBaseOffsetLocal + (camIdx * numScales_ + scaleIdx) * numGroups_;
+                    uint32_t localOffset = scaleIdx * cAligned_;
                     uint32_t scaleStartOffset = camIdx * numScales_ + scaleIdx;
                     uint32_t spatialShapeOffset = scaleStartOffset * 2;
                     uint32_t scaleStartIdx = scaleStartLocal_.GetValue(scaleStartOffset);
                     uint32_t valueOffset = (batchIdx * numFeats_ + scaleStartIdx) * numEmbeds_;
 
+                    uint32_t localPtr1_ = v1Offset_ + localOffset;
+                    uint32_t localPtr2_ = v2Offset_ + localOffset;
+                    uint32_t localPtr3_ = v3Offset_ + localOffset;
+                    uint32_t localPtr4_ = v4Offset_ + localOffset;
+
                     DTYPE_I h = spatialShapeLocal_.GetValue(spatialShapeOffset);
                     DTYPE_I w = spatialShapeLocal_.GetValue(spatialShapeOffset + 1);
 
-                    DTYPE_F hIm = locH * h - 0.5f;
-                    DTYPE_F wIm = locW * w - 0.5f;
+                    float hIm = locH * h - 0.5f;
+                    float wIm = locW * w - 0.5f;
 
-                    DTYPE_I hLow = ScalarCast<DTYPE_F, DTYPE_I, RoundMode::CAST_FLOOR>(hIm);
-                    DTYPE_I wLow = ScalarCast<DTYPE_F, DTYPE_I, RoundMode::CAST_FLOOR>(wIm);
+                    DTYPE_I hLow = ScalarCast<float, DTYPE_I, RoundMode::CAST_FLOOR>(hIm);
+                    DTYPE_I wLow = ScalarCast<float, DTYPE_I, RoundMode::CAST_FLOOR>(wIm);
                     DTYPE_I hHigh = hLow + 1;
                     DTYPE_I wHigh = wLow + 1;
 
-                    DTYPE_F lh = hIm - hLow;
-                    DTYPE_F lw = wIm - wLow;
-                    DTYPE_F hh = 1 - lh;
-                    DTYPE_F hw = 1 - lw;
+                    float lh = hIm - hLow;
+                    float lw = wIm - wLow;
+                    float hh = 1 - lh;
+                    float hw = 1 - lw;
 
                     DTYPE_I wStride = numEmbeds_;
                     DTYPE_I hStride = w * wStride;
@@ -158,49 +169,46 @@ public:
                     DTYPE_I wLowPtrOffset = wLow * wStride;
                     DTYPE_I wHighPtrOffset = wLowPtrOffset + wStride;
 
-                    DTYPE_F w1 = hh * hw;
-                    DTYPE_F w2 = hh * lw;
-                    DTYPE_F w3 = lh * hw;
-                    DTYPE_F w4 = lh * lw;
-
-                    Duplicate(vLocal_, 0.0f, 4 * cAligned_);
-
-                    SetFlag<HardEvent::V_MTE2>(0);
-                    WaitFlag<HardEvent::V_MTE2>(0);
+                    float w1 = hh * hw;
+                    float w2 = hh * lw;
+                    float w3 = lh * hw;
+                    float w4 = lh * lw;
 
                     if (hLow >= 0) {
                         basePtr_ = valueOffset + hLowPtrOffset;
                         if (wLow >= 0) {
                             realPtr_ = basePtr_ + wLowPtrOffset;
-                            DataCopy(vLocal_[v1Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_);
+                            DataCopy(vLocal_[localPtr1_], mcMsFeatGm_[realPtr_], cAligned_);
                         }
                         if (wHigh <= w - 1) {
                             realPtr_ = basePtr_ + wHighPtrOffset;
-                            DataCopy(vLocal_[v2Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_);
+                            DataCopy(vLocal_[localPtr2_], mcMsFeatGm_[realPtr_], cAligned_);
                         }
                     }
-
                     if (hHigh <= h - 1) {
                         basePtr_ = valueOffset + hHighPtrOffset;
                         if (wLow >= 0) {
                             realPtr_ = basePtr_ + wLowPtrOffset;
-                            DataCopy(vLocal_[v3Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_);
+                            DataCopy(vLocal_[localPtr3_], mcMsFeatGm_[realPtr_], cAligned_);
                         }
                         if (wHigh <= w - 1) {
                             realPtr_ = basePtr_ + wHighPtrOffset;
-                            DataCopy(vLocal_[v4Offset_ * cAligned_], mcMsFeatGm_[realPtr_], cAligned_);
+                            DataCopy(vLocal_[localPtr4_], mcMsFeatGm_[realPtr_], cAligned_);
                         }
                     }
-
-                    SetFlag<HardEvent::MTE2_V>(0);
-                    WaitFlag<HardEvent::MTE2_V>(0);
-                    Muls(vLocal_[v1Offset_ * cAligned_], vLocal_[v1Offset_ * cAligned_], w1, cAligned_);
-                    Axpy(vLocal_[v1Offset_ * cAligned_], vLocal_[v2Offset_ * cAligned_], w2, cAligned_);
-                    Axpy(vLocal_[v1Offset_ * cAligned_], vLocal_[v3Offset_ * cAligned_], w3, cAligned_);
-                    Axpy(vLocal_[v1Offset_ * cAligned_], vLocal_[v4Offset_ * cAligned_], w4, cAligned_);
-                    
-                    BroadCast<DTYPE_F, 2, 1>(weightMulLocal_, weightLocal_[weightOffsetLocal], dstShape_, srcShape_);
-                    MulAddDst(resLocal_, vLocal_[v1Offset_ * cAligned_], weightMulLocal_, cAligned_);
+                    Muls(weightMulLocal_[localPtr1_], weightMulLocal_[localPtr1_], static_cast<DTYPE_F>(w1), cAligned_);
+                    Muls(weightMulLocal_[localPtr2_], weightMulLocal_[localPtr2_], static_cast<DTYPE_F>(w2), cAligned_);
+                    Muls(weightMulLocal_[localPtr3_], weightMulLocal_[localPtr3_], static_cast<DTYPE_F>(w3), cAligned_);
+                    Muls(weightMulLocal_[localPtr4_], weightMulLocal_[localPtr4_], static_cast<DTYPE_F>(w4), cAligned_);
+                }
+                SetFlag<HardEvent::MTE2_V>(0);
+                WaitFlag<HardEvent::MTE2_V>(0);
+                Mul(weightMulLocal_, weightMulLocal_, vLocal_, 4 * weightMulBufSize_);
+                Duplicate(vLocal_, static_cast<DTYPE_F>(0.0f), 4 * weightMulBufSize_);
+                SetFlag<HardEvent::V_MTE2>(0);
+                WaitFlag<HardEvent::V_MTE2>(0);
+                for (uint32_t i = 0; i < 4 * numScales_; ++i) {
+                    Add(resLocal_, resLocal_, weightMulLocal_[i * cAligned_], cAligned_);
                 }
             }
         }
@@ -224,13 +232,12 @@ private:
     LocalTensor<DTYPE_I> spatialShapeLocal_, scaleStartLocal_;
     LocalTensor<DTYPE_F> vLocal_, weightMulLocal_, resLocal_;
 
-    bool memoryFlag_;
     uint32_t basePtr_, realPtr_;
     uint32_t coreNum_, curBlockIdx_;
     uint32_t taskNum_, taskNumPerCore_, startOffset_, endOffset_;
-    uint32_t weightBufSize_, locBufSize_, scaleStartBufSize_, spatialShapeBufSize_;
+    uint32_t weightBufSize_, locBufSize_, scaleStartBufSize_, spatialShapeBufSize_, weightMulBufSize_;
     uint32_t bs_, numFeats_, numEmbeds_, numAnchors_, numPoints_, numCams_, numScales_, numGroups_, numChannels_, cAligned_;
-    uint32_t blockAlign_ = 8;
+    uint32_t blockAlignFloat_, blockAlignInt_;
     uint32_t v1Offset_ = 0, v2Offset_ = 1, v3Offset_ = 2, v4Offset_ = 3;
     
     uint32_t srcShape_[2], dstShape_[2];
@@ -243,7 +250,7 @@ extern "C" __global__ __aicore__ void deformable_aggregation(GM_ADDR mc_ms_feat,
 {
     TPipe pipe;
     GET_TILING_DATA(tiling_data, tiling);
-    KernelDeformableAggregation<float, int32_t> op;
+    KernelDeformableAggregation<DTYPE_MC_MS_FEAT, int32_t> op;
     op.Init(mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights, out, &tiling_data, &pipe);
     op.GetLocalTensor();
     op.Process();
diff --git a/kernels/op_kernel/deformable_aggregation_grad.cpp b/kernels/op_kernel/deformable_aggregation_grad.cpp
index ef4f0784..e7371449 100644
--- a/kernels/op_kernel/deformable_aggregation_grad.cpp
+++ b/kernels/op_kernel/deformable_aggregation_grad.cpp
@@ -6,6 +6,7 @@
 #include "kernel_tiling/kernel_tiling.h"
 using namespace AscendC;
 
+template<typename DTYPE_F>
 class KernelDeformableAggregationGrad {
 public:
     __aicore__ inline KernelDeformableAggregationGrad() = delete;
@@ -29,7 +30,6 @@ public:
             sampling_location, weights, grad_output,
             grad_mc_ms_feat, grad_sampling_location, grad_weights);
         InitBuffer();
-        InitEvent();
     }
 
     __aicore__ inline void Process();
@@ -46,7 +46,8 @@ private:
         if (coreId == usedCoreNum_ - 1) {
             totalTaskNum_ = tailWeightNum_;
         }
-        singleProcessTaskLen_ = tiling.singleProcessTaskLen;
+        singleProcessTaskLen_ = min(tiling.singleProcessTaskLen, totalTaskNum_);
+        singleProcessTaskLen_ = max(singleProcessTaskLen_, (uint32_t)1);
         taskRepeatTimes = (totalTaskNum_ - 1) / singleProcessTaskLen_ + 1;
         pts_ = tiling.numPoints;
         cam_  = tiling.numCams;
@@ -56,6 +57,9 @@ private:
         numFeat = tiling.numFeat;
         numAnchors = tiling.numAnchors;
         totalGroups = numEmbeds / group_;
+
+        blockSize_ = 32;
+        blockDataNum_ = blockSize_ / sizeof(DTYPE_F);
     }
 
     __aicore__ inline void InitGM(GM_ADDR mc_ms_feat, GM_ADDR spatial_shape, GM_ADDR scale_start_index,
@@ -64,89 +68,84 @@ private:
     {
         int64_t samplingLocationOffset = taskOffset * pts_ * cam_ * 2;
         int64_t weightOffset = taskOffset * pts_ * cam_ * scale_ * group_;
-        mcMsFeatGm.SetGlobalBuffer((__gm__ float*)(mc_ms_feat));
+        mcMsFeatGm.SetGlobalBuffer((__gm__ DTYPE_F*)(mc_ms_feat));
         spatialShapeGm.SetGlobalBuffer((__gm__ int32_t*)(spatial_shape));
         scaleStartLocationGm.SetGlobalBuffer((__gm__ int32_t*)(scale_start_index));
-        samplingLocationGm.SetGlobalBuffer((__gm__ float*)(sampling_location) + samplingLocationOffset);
-        weightGm.SetGlobalBuffer((__gm__ float*)(weights) + weightOffset);
-        outputGradGm.SetGlobalBuffer((__gm__ float*)(grad_output) + taskOffset * numEmbeds);
-        gradMcMsFeatGm.SetGlobalBuffer((__gm__ float*)(grad_mc_ms_feat));
-        gradSamplingLocationGm.SetGlobalBuffer((__gm__ float*)(grad_sampling_location) + samplingLocationOffset * 4);
-        gradWeightsGm.SetGlobalBuffer((__gm__ float*)(grad_weights) + weightOffset);
+        samplingLocationGm.SetGlobalBuffer((__gm__ DTYPE_F*)(sampling_location) + samplingLocationOffset);
+        weightGm.SetGlobalBuffer((__gm__ DTYPE_F*)(weights) + weightOffset);
+        outputGradGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_output) + taskOffset * numEmbeds);
+        gradMcMsFeatGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_mc_ms_feat));
+        gradSamplingLocalGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_sampling_location) + samplingLocationOffset);
+        gradWeightsGm.SetGlobalBuffer((__gm__ DTYPE_F*)(grad_weights) + weightOffset);
     }
 
     __aicore__ inline void InitBuffer()
     {
-        uint64_t singleWeightOffset = cam_ * scale_ * group_;
+        uint64_t singleWeightOffset = scale_ * group_;
         uint64_t samplingOffset = pts_ * cam_ * 2;
-        pipe_->InitBuffer(weightQue_, AlignUp(singleWeightOffset, B32_DATA_NUM_PER_BLOCK) * sizeof(float));
-        pipe_->InitBuffer(gradOutputQue_, singleProcessTaskLen_ * numEmbeds * sizeof(float));
+        pipe_->InitBuffer(weightQue_, AlignUp(singleWeightOffset, blockDataNum_) * sizeof(DTYPE_F));
+        pipe_->InitBuffer(gradOutputQue_, singleProcessTaskLen_ * numEmbeds * sizeof(DTYPE_F));
         pipe_->InitBuffer(scaleStartLocationQue_, AlignUp(cam_ * scale_, B32_DATA_NUM_PER_BLOCK) * sizeof(int32_t));
-        pipe_->InitBuffer(samplingLocationQue_, AlignUp(samplingOffset, B32_DATA_NUM_PER_BLOCK) * sizeof(float));
-        pipe_->InitBuffer(spatialShapeQue_, AlignUp(cam_ * scale_* 2, B32_DATA_NUM_PER_BLOCK) * sizeof(int32_t));
-        pipe_->InitBuffer(topGradMcMsFeatQue_, 5 * numEmbeds * sizeof(float));
-        pipe_->InitBuffer(vQue_, 4 * numEmbeds * sizeof(float));
-        pipe_->InitBuffer(featureQue_, 4 * numEmbeds * sizeof(float));
-        pipe_->InitBuffer(featureQue__, numEmbeds * sizeof(float));
-        pipe_->InitBuffer(pointGradWeightQue_, 8 * numEmbeds * sizeof(float));
-        pipe_->InitBuffer(gradSamplingQue_, 4 * samplingOffset * sizeof(float));
-        pipe_->InitBuffer(sumTmp_, 8 * sizeof(float));
+        pipe_->InitBuffer(samplingLocationQue_, AlignUp(samplingOffset, blockDataNum_) * sizeof(DTYPE_F));
+        pipe_->InitBuffer(spatialShapeQue_, AlignUp(cam_ * scale_ * 2, B32_DATA_NUM_PER_BLOCK) * sizeof(int32_t));
+        pipe_->InitBuffer(topGradMcMsFeatQue_, numEmbeds * sizeof(DTYPE_F));
+        pipe_->InitBuffer(gradValueQue_, 4 * numEmbeds * sizeof(DTYPE_F));
+        pipe_->InitBuffer(vQue_, 4 * numEmbeds * sizeof(DTYPE_F));
+        pipe_->InitBuffer(featureQue_, scale_ * numEmbeds * sizeof(DTYPE_F));
+        pipe_->InitBuffer(gradWeightsQue_, scale_ * group_ * sizeof(DTYPE_F));
+        pipe_->InitBuffer(pointGradWeightQue_, 4 * numEmbeds * sizeof(DTYPE_F));
+        pipe_->InitBuffer(gradSamplingQue_, blockDataNum_ * sizeof(DTYPE_F));
+        pipe_->InitBuffer(pointGradQue_, 2 * numEmbeds * sizeof(DTYPE_F));
+        pipe_->InitBuffer(weightBrobQue_, scale_ * numEmbeds * sizeof(DTYPE_F));
     }
 
-    __aicore__ inline void InitEvent()
+    __aicore__ inline void Prepare()
     {
-        cpInEvtID_ = pipe_->FetchEventID(HardEvent::MTE2_V);
-        cpOutEvtID_ = pipe_->FetchEventID(HardEvent::MTE3_MTE2);
-        vToOutEvtID_ = pipe_->FetchEventID(HardEvent::V_MTE3);
-        vToMTE2EvtID_ = pipe_->FetchEventID(HardEvent::V_MTE2);
-        mte3ToVEvtID_ = pipe_->FetchEventID(HardEvent::MTE3_V);
+        int32_t scaleStartNum = AlignUp(cam_ * scale_, B32_DATA_NUM_PER_BLOCK);
+        int32_t spatialShapeNum = AlignUp(cam_ * scale_ * 2, B32_DATA_NUM_PER_BLOCK);
+        scaleStartLocation = scaleStartLocationQue_.Get<int32_t>();
+        spatialShape = spatialShapeQue_.Get<int32_t>();
+        weight = weightQue_.Get<DTYPE_F>();
+        gradOutput = gradOutputQue_.Get<DTYPE_F>();
+        samplingLocation = samplingLocationQue_.Get<DTYPE_F>();
+
+        gradWeightsLocal = gradWeightsQue_.Get<DTYPE_F>();
+        gradSamplingLocal = gradSamplingQue_.Get<DTYPE_F>();
+        gradValueLocal = gradValueQue_.Get<DTYPE_F>();
+
+        topGradMcMsFeatLocal = topGradMcMsFeatQue_.Get<DTYPE_F>();
+        vLocal = vQue_.Get<DTYPE_F>();
+        featureLocal = featureQue_.Get<DTYPE_F>();
+        pointGradWeightLocal = pointGradWeightQue_.Get<DTYPE_F>();
+        pointGradSum = pointGradQue_.Get<DTYPE_F>();
+        weightBrobLocal = weightBrobQue_.Get<DTYPE_F>();
+
+        Duplicate(pointGradSum, (DTYPE_F)0, 2 * numEmbeds);
+        Duplicate(featureLocal, (DTYPE_F)0, scale_ * numEmbeds);
+        Duplicate(vLocal, (DTYPE_F)0, numEmbeds * 4);
+
+        DataCopy(scaleStartLocation, scaleStartLocationGm, scaleStartNum);
+        DataCopy(spatialShape, spatialShapeGm, spatialShapeNum);
     }
 
     __aicore__ inline void ProcessSingle(uint64_t taskIdx, uint32_t actualWeightNum)
     {
-        uint64_t singleWeightOffset = cam_ * scale_ * group_;
-        uint32_t weightCopyLen = AlignUp(singleWeightOffset, B32_DATA_NUM_PER_BLOCK);
-        int32_t gradOuputNum = AlignUp(actualWeightNum * numEmbeds, B32_DATA_NUM_PER_BLOCK);
-        int32_t samplingLocationNum = AlignUp(pts_ * cam_ * 2, B32_DATA_NUM_PER_BLOCK);
-        int32_t scaleStartNum = AlignUp(cam_ * scale_, B32_DATA_NUM_PER_BLOCK);
-        int32_t spatialShapeNum = AlignUp(cam_ * scale_ * 2, B32_DATA_NUM_PER_BLOCK);
+        uint64_t singleWeightOffset = scale_ * group_;
+        uint32_t weightCopyLen = AlignUp(singleWeightOffset, blockDataNum_);
+        int32_t gradOuputNum = AlignUp(actualWeightNum * numEmbeds, blockDataNum_);
+        int32_t samplingLocationNum = AlignUp(pts_ * cam_ * 2, blockDataNum_);
         uint64_t gradOutputOffset = taskIdx * singleProcessTaskLen_ * numEmbeds;
 
-        LocalTensor<float> weight = weightQue_.Get<float>();
-        LocalTensor<float> gradOutput = gradOutputQue_.Get<float>();
-        LocalTensor<float> samplingLocation = samplingLocationQue_.Get<float>();
-        LocalTensor<int32_t> scaleStartLocation = scaleStartLocationQue_.Get<int32_t>();
-        LocalTensor<int32_t> spatialShape = spatialShapeQue_.Get<int32_t>();
-
-        LocalTensor<float> topGradMcMsFeatLocal = topGradMcMsFeatQue_.Get<float>();
-        LocalTensor<float> vLocal = vQue_.Get<float>();
-        LocalTensor<float> featureLocal = featureQue_.Get<float>();
-        LocalTensor<float> featureLocal_ = featureQue__.Get<float>();
-        LocalTensor<float> pointGradWeightLocal = pointGradWeightQue_.Get<float>();
-        LocalTensor<float> gradSamplingLocation = gradSamplingQue_.Get<float>();
-        LocalTensor<float> tmpLocation = sumTmp_.Get<float>();
-
-        SetFlag<HardEvent::V_MTE2>(vToMTE2EvtID_);
-        WaitFlag<HardEvent::V_MTE2>(vToMTE2EvtID_);
+        SetFlag<HardEvent::V_MTE2>(0);
+        WaitFlag<HardEvent::V_MTE2>(0);
         DataCopy(gradOutput, outputGradGm[gradOutputOffset], gradOuputNum);
-        
-        DataCopy(scaleStartLocation, scaleStartLocationGm, scaleStartNum);
-        DataCopy(spatialShape, spatialShapeGm, spatialShapeNum);
+
         for (int32_t weightNumId = 0; weightNumId < actualWeightNum; weightNumId++) {
             int64_t curBatch = (taskOffset + taskIdx * singleProcessTaskLen_ + weightNumId)  / numAnchors;
             int64_t featOffset = curBatch * numFeat * numEmbeds;
             uint64_t samplingLocationOffset = (taskIdx * singleProcessTaskLen_ + weightNumId) * pts_ * cam_ * 2;
             DataCopy(samplingLocation, samplingLocationGm[samplingLocationOffset], samplingLocationNum);
-            SetFlag<HardEvent::MTE3_V>(mte3ToVEvtID_);
-            WaitFlag<HardEvent::MTE3_V>(mte3ToVEvtID_);
-            Duplicate(gradSamplingLocation, (float)0, pts_ * cam_ * 8);
             for (int32_t ptsId = 0; ptsId < pts_; ptsId++) {
-                uint64_t weightGmOffset = ((taskIdx * singleProcessTaskLen_ + weightNumId) * pts_ + ptsId) * singleWeightOffset;
-                SetFlag<HardEvent::V_MTE2>(vToMTE2EvtID_);
-                WaitFlag<HardEvent::V_MTE2>(vToMTE2EvtID_);
-                DataCopy(weight, weightGm[weightGmOffset], weightCopyLen);
-                SetFlag<HardEvent::MTE2_V>(cpInEvtID_);
-                WaitFlag<HardEvent::MTE2_V>(cpInEvtID_);
                 for (int32_t camId = 0; camId < cam_; camId++) {
                     int32_t locOffset = ptsId * cam_ + camId;
                     float locW = samplingLocation.GetValue(locOffset * 2);
@@ -154,6 +153,16 @@ private:
                     if (locW <= 0 || locW >= 1 ||  locH <=0 || locH >=1) {
                         continue;
                     }
+                    uint64_t weightGmOffset = (((taskIdx * singleProcessTaskLen_ + weightNumId) * pts_ + ptsId) * cam_ + camId) * singleWeightOffset;
+                    uint64_t samplingLocationCopyOutOffset = samplingLocationOffset + (ptsId * cam_ + camId) * 2;
+                    DataCopy(weight, weightGm[weightGmOffset], weightCopyLen);
+                    SetFlag<HardEvent::MTE2_V>(0);
+                    WaitFlag<HardEvent::MTE2_V>(0);
+                    uint32_t dstShape_[2] = {scale_ * group_, totalGroups};
+                    uint32_t srcShape_[2] = {scale_ * group_, 1};
+                    BroadCast<DTYPE_F, 2, 1>(weightBrobLocal, weight, dstShape_, srcShape_);
+                    SetFlag<HardEvent::V_MTE2>(0);
+                    WaitFlag<HardEvent::V_MTE2>(0);
                     for (int32_t scaleId = 0; scaleId < scale_; scaleId++) {
                         int32_t scaleStartOffset = camId * scale_ + scaleId;
                         int32_t scaleStartIdx = scaleStartLocation.GetValue(scaleStartOffset);
@@ -185,118 +194,112 @@ private:
                         uint64_t ptr3 = featureOffset + hHighPtrOffset + wLowPtrOffset;
                         uint64_t ptr4 = featureOffset + hHighPtrOffset + wHighPtrOffset;
 
-                        uint64_t weightOffset = (camId * scale_ + scaleId) * group_;
+                        uint64_t weightOffset = scaleId * numEmbeds;
                         uint64_t gradOuputBaseOffset = weightNumId * numEmbeds;
-                        uint32_t dstShape_[2] = {group_, totalGroups};
-                        uint32_t srcShape_[2] = {group_, 1};
-
-                        Duplicate(vLocal, (float)0, numEmbeds * 4);
 
-                        SetFlag<HardEvent::V_MTE2>(vToMTE2EvtID_);
-                        WaitFlag<HardEvent::V_MTE2>(vToMTE2EvtID_);
+                        SetFlag<HardEvent::MTE3_V>(0);
+                        WaitFlag<HardEvent::MTE3_V>(0);
 
-                        SetFlag<HardEvent::MTE3_V>(mte3ToVEvtID_);
-                        WaitFlag<HardEvent::MTE3_V>(mte3ToVEvtID_);
+                        Mul(topGradMcMsFeatLocal, weightBrobLocal[weightOffset], gradOutput[gradOuputBaseOffset], numEmbeds);
+                        Muls(gradValueLocal, topGradMcMsFeatLocal, static_cast<DTYPE_F>(w1), numEmbeds);
+                        Muls(gradValueLocal[numEmbeds * 1], topGradMcMsFeatLocal, static_cast<DTYPE_F>(w2), numEmbeds);
+                        Muls(gradValueLocal[numEmbeds * 2], topGradMcMsFeatLocal, static_cast<DTYPE_F>(w3), numEmbeds);
+                        Muls(gradValueLocal[numEmbeds * 3], topGradMcMsFeatLocal, static_cast<DTYPE_F>(w4), numEmbeds);
 
-                        BroadCast<float, 2, 1>(topGradMcMsFeatLocal, weight[weightOffset], dstShape_, srcShape_);
-                        Mul(topGradMcMsFeatLocal, topGradMcMsFeatLocal, gradOutput[gradOuputBaseOffset], numEmbeds);
-                        Muls(topGradMcMsFeatLocal[numEmbeds], topGradMcMsFeatLocal, w1, numEmbeds);
-                        Muls(topGradMcMsFeatLocal[numEmbeds * 2], topGradMcMsFeatLocal, w2, numEmbeds);
-                        Muls(topGradMcMsFeatLocal[numEmbeds * 3], topGradMcMsFeatLocal, w3, numEmbeds);
-                        Muls(topGradMcMsFeatLocal[numEmbeds * 4], topGradMcMsFeatLocal, w4, numEmbeds);
+                        SetFlag<HardEvent::V_MTE3>(0);
+                        WaitFlag<HardEvent::V_MTE3>(0);
 
-                        SetFlag<HardEvent::V_MTE3>(vToOutEvtID_);
-                        WaitFlag<HardEvent::V_MTE3>(vToOutEvtID_);
-
-                        SetAtomicAdd<float>();
+                        SetAtomicAdd<DTYPE_F>();
                         if (hLow >= 0 && wLow >=0) {
-                            DataCopy(gradMcMsFeatGm[featOffset + ptr1], topGradMcMsFeatLocal[numEmbeds], numEmbeds);
+                            DataCopy(gradMcMsFeatGm[featOffset + ptr1], gradValueLocal, numEmbeds);
                             DataCopy(vLocal, mcMsFeatGm[featOffset + ptr1], numEmbeds);
                         }
                         if (hLow >= 0 && wHigh <= w - 1) {
-                            DataCopy(gradMcMsFeatGm[featOffset + ptr2], topGradMcMsFeatLocal[numEmbeds * 2], numEmbeds);
+                            DataCopy(gradMcMsFeatGm[featOffset + ptr2], gradValueLocal[numEmbeds * 1], numEmbeds);
                             DataCopy(vLocal[numEmbeds], mcMsFeatGm[featOffset + ptr2], numEmbeds);
                         }
                         if (hHigh <= h - 1 && wLow >= 0) {
-                            DataCopy(gradMcMsFeatGm[featOffset + ptr3], topGradMcMsFeatLocal[numEmbeds * 3], numEmbeds);
+                            DataCopy(gradMcMsFeatGm[featOffset + ptr3], gradValueLocal[numEmbeds * 2], numEmbeds);
                             DataCopy(vLocal[numEmbeds * 2], mcMsFeatGm[featOffset + ptr3], numEmbeds);
                         }
                         if (hHigh <= h - 1 && wHigh <= w - 1) {
-                            DataCopy(gradMcMsFeatGm[featOffset + ptr4], topGradMcMsFeatLocal[numEmbeds * 4], numEmbeds);
+                            DataCopy(gradMcMsFeatGm[featOffset + ptr4], gradValueLocal[numEmbeds * 3], numEmbeds);
                             DataCopy(vLocal[numEmbeds * 3], mcMsFeatGm[featOffset + ptr4], numEmbeds);
                         }
                         SetAtomicNone();
 
-                        SetFlag<HardEvent::MTE2_V>(cpInEvtID_);
-                        WaitFlag<HardEvent::MTE2_V>(cpInEvtID_);
+                        SetFlag<HardEvent::MTE2_V>(0);
+                        WaitFlag<HardEvent::MTE2_V>(0);
 
-                        Muls(featureLocal, vLocal, w1, numEmbeds);
-                        Muls(featureLocal[numEmbeds], vLocal[numEmbeds], w2, numEmbeds);
-                        Muls(featureLocal[numEmbeds * 2], vLocal[numEmbeds * 2], w3, numEmbeds);
-                        Muls(featureLocal[numEmbeds * 3], vLocal[numEmbeds * 3], w4, numEmbeds);
-                        Add(featureLocal, featureLocal, featureLocal[numEmbeds], numEmbeds);
-                        Add(featureLocal[numEmbeds * 2], featureLocal[numEmbeds * 2], featureLocal[numEmbeds * 3], numEmbeds);
-                        Add(featureLocal, featureLocal, featureLocal[numEmbeds * 2], numEmbeds);
-                        Mul(featureLocal, featureLocal, gradOutput[gradOuputBaseOffset], numEmbeds);
+                        Muls(featureLocal[weightOffset], vLocal, static_cast<DTYPE_F>(w1), numEmbeds);
+                        Axpy(featureLocal[weightOffset], vLocal[numEmbeds], static_cast<DTYPE_F>(w2), numEmbeds);
+                        Axpy(featureLocal[weightOffset], vLocal[numEmbeds * 2], static_cast<DTYPE_F>(w3), numEmbeds);
+                        Axpy(featureLocal[weightOffset], vLocal[numEmbeds * 3], static_cast<DTYPE_F>(w4), numEmbeds);
+                        Mul(featureLocal[weightOffset], featureLocal[weightOffset], gradOutput[gradOuputBaseOffset], numEmbeds);
 
-                        SetFlag<HardEvent::MTE3_V>(mte3ToVEvtID_);
-                        WaitFlag<HardEvent::MTE3_V>(mte3ToVEvtID_);
+                        Sub(pointGradWeightLocal, vLocal[numEmbeds * 1], vLocal, numEmbeds);
+                        Sub(pointGradWeightLocal[numEmbeds * 2], vLocal[numEmbeds * 3], vLocal[numEmbeds * 2], numEmbeds);
 
-                        Sum(featureLocal_, featureLocal, {group_, totalGroups, totalGroups});
+                        Sub(pointGradWeightLocal[numEmbeds * 1], vLocal[numEmbeds * 2], vLocal, numEmbeds);
+                        Sub(pointGradWeightLocal[numEmbeds * 3], vLocal[numEmbeds * 3], vLocal[numEmbeds * 1], numEmbeds);
+                        Duplicate(vLocal, (DTYPE_F)0, numEmbeds * 4);
 
-                        SetFlag<HardEvent::V_MTE3>(vToOutEvtID_);
-                        WaitFlag<HardEvent::V_MTE3>(vToOutEvtID_);
+                        SetFlag<HardEvent::V_MTE2>(0);
+                        WaitFlag<HardEvent::V_MTE2>(0);
 
-                        SetAtomicAdd<float>();
-                        DataCopy(gradWeightsGm[weightGmOffset + weightOffset], featureLocal_, group_);
-                        SetAtomicNone();
+                        Muls(pointGradWeightLocal, pointGradWeightLocal, static_cast<DTYPE_F>(hh), numEmbeds);
+                        Axpy(pointGradWeightLocal, pointGradWeightLocal[numEmbeds * 2], static_cast<DTYPE_F>(lh), numEmbeds);
+
+                        Muls(pointGradWeightLocal[numEmbeds * 1], pointGradWeightLocal[numEmbeds * 1], static_cast<DTYPE_F>(hw), numEmbeds);
+                        Axpy(pointGradWeightLocal[numEmbeds * 1], pointGradWeightLocal[numEmbeds * 3], static_cast<DTYPE_F>(lw), numEmbeds);
 
-                        Muls(pointGradWeightLocal, vLocal, hw, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds * 2], vLocal[numEmbeds], lw, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds * 4], vLocal[numEmbeds * 2], hw, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds * 6], vLocal[numEmbeds * 3], lw, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds], vLocal, hh, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds * 3], vLocal[numEmbeds], hh, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds * 5], vLocal[numEmbeds * 2], lh, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds * 7], vLocal[numEmbeds * 3], lh, numEmbeds);
-                        Sub(pointGradWeightLocal[numEmbeds * 4], pointGradWeightLocal[numEmbeds * 4], pointGradWeightLocal, numEmbeds);
-                        Sub(pointGradWeightLocal[numEmbeds * 6], pointGradWeightLocal[numEmbeds * 6], pointGradWeightLocal[numEmbeds * 2], numEmbeds);
-                        Sub(pointGradWeightLocal[numEmbeds * 3], pointGradWeightLocal[numEmbeds * 3], pointGradWeightLocal[numEmbeds], numEmbeds);
-                        Sub(pointGradWeightLocal[numEmbeds * 7], pointGradWeightLocal[numEmbeds * 7], pointGradWeightLocal[numEmbeds * 5], numEmbeds);
-                        Add(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds * 4], pointGradWeightLocal[numEmbeds * 6], numEmbeds);
-                        Add(pointGradWeightLocal, pointGradWeightLocal[numEmbeds * 3], pointGradWeightLocal[numEmbeds * 7], numEmbeds);
                         Mul(pointGradWeightLocal, pointGradWeightLocal, topGradMcMsFeatLocal, numEmbeds);
                         Mul(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds], topGradMcMsFeatLocal, numEmbeds);
-                        Muls(pointGradWeightLocal, pointGradWeightLocal, (float)w, numEmbeds);
-                        Muls(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds], (float)h, numEmbeds);
-                        Sum(tmpLocation, pointGradWeightLocal, {2, numEmbeds, numEmbeds});
-                        Add(gradSamplingLocation[locOffset * 8], gradSamplingLocation[locOffset * 8], tmpLocation, 8);
+                        Muls(pointGradWeightLocal, pointGradWeightLocal, (DTYPE_F)w, numEmbeds);
+                        Muls(pointGradWeightLocal[numEmbeds], pointGradWeightLocal[numEmbeds], (DTYPE_F)h, numEmbeds);
+
+                        Add(pointGradSum, pointGradSum, pointGradWeightLocal, numEmbeds * 2);
                     }
+                    SetFlag<HardEvent::MTE3_V>(0);
+                    WaitFlag<HardEvent::MTE3_V>(0);
+                    Sum(gradWeightsLocal, featureLocal, {scale_ * group_, totalGroups, totalGroups});
+                    Sum(gradSamplingLocal, pointGradSum, {2, numEmbeds, numEmbeds});
+                    SetFlag<HardEvent::V_MTE3>(0);
+                    WaitFlag<HardEvent::V_MTE3>(0);
+                    Duplicate(featureLocal, (DTYPE_F)0, scale_ * numEmbeds);
+                    Duplicate(pointGradSum, (DTYPE_F)0, 2 * numEmbeds);
+                    DataCopyExtParams locationCopyParams {1, (uint32_t)(2 * sizeof(DTYPE_F)), 0, 0, 0};
+                    DataCopyExtParams weightsCopyParams {1, (uint32_t)(scale_ * group_ * sizeof(DTYPE_F)), 0, 0, 0};
+                    DataCopyPad(gradSamplingLocalGm[samplingLocationCopyOutOffset], gradSamplingLocal, locationCopyParams);
+                    DataCopyPad(gradWeightsGm[weightGmOffset], gradWeightsLocal, weightsCopyParams);
                 }
             }
-            SetFlag<HardEvent::V_MTE3>(vToOutEvtID_);
-            WaitFlag<HardEvent::V_MTE3>(vToOutEvtID_);
-            DataCopyExtParams copyParams {1, (uint32_t)(pts_ * cam_ * 8 * sizeof(float)), 0, 0, 0};
-            DataCopyPad(gradSamplingLocationGm[samplingLocationOffset * 4], gradSamplingLocation, copyParams);
         }
     }
 
 private:
     TPipe* pipe_;
-    GlobalTensor<float> mcMsFeatGm, samplingLocationGm, weightGm, outputGradGm;
-    GlobalTensor<float> gradMcMsFeatGm, gradSamplingLocationGm, gradWeightsGm;
+    GlobalTensor<DTYPE_F> mcMsFeatGm, samplingLocationGm, weightGm, outputGradGm;
+    GlobalTensor<DTYPE_F> gradMcMsFeatGm, gradSamplingLocalGm, gradWeightsGm;
     GlobalTensor<int32_t> spatialShapeGm, scaleStartLocationGm;
     TBuf<TPosition::VECCALC> weightQue_, gradOutputQue_, samplingLocationQue_, scaleStartLocationQue_, spatialShapeQue_;
-    TBuf<TPosition::VECCALC> topGradMcMsFeatQue_, vQue_, featureQue_, featureQue__, pointGradWeightQue_, gradSamplingQue_, sumTmp_;
+    TBuf<TPosition::VECCALC> gradWeightsQue_, gradSamplingQue_, gradValueQue_;
+    TBuf<TPosition::VECCALC> topGradMcMsFeatQue_, vQue_, featureQue_, pointGradWeightQue_, pointGradQue_, weightBrobQue_;
+    LocalTensor<int32_t> scaleStartLocation, spatialShape;
+    LocalTensor<DTYPE_F> weight, gradOutput, samplingLocation;
+    LocalTensor<DTYPE_F> gradWeightsLocal, gradSamplingLocal, gradValueLocal;
+    LocalTensor<DTYPE_F> topGradMcMsFeatLocal, vLocal, featureLocal, pointGradWeightLocal, pointGradSum, weightBrobLocal;
     uint32_t usedCoreNum_, avgWeightNum_, tailWeightNum_, coreId;
     uint32_t totalTaskNum_, singleProcessTaskLen_, taskRepeatTimes;
     uint32_t pts_, cam_, scale_, group_, numEmbeds, numFeat, numAnchors, totalGroups;
+    uint32_t blockSize_, blockDataNum_;
     int64_t taskOffset;
-    TEventID cpInEvtID_, cpOutEvtID_, vToOutEvtID_, vToMTE2EvtID_, mte3ToVEvtID_;
 };
 
-__aicore__ inline void KernelDeformableAggregationGrad::Process()
+template<typename DTYPE_F>
+__aicore__ inline void KernelDeformableAggregationGrad<DTYPE_F>::Process()
 {
+    Prepare();
     for (uint32_t i = 0; i < taskRepeatTimes; ++i) {
         uint32_t actualWeightNum = singleProcessTaskLen_;
         if (unlikely(i == taskRepeatTimes - 1)) {
@@ -321,7 +324,7 @@ extern "C" __global__ __aicore__ void deformable_aggregation_grad(
 {
     GET_TILING_DATA(tiling_data, tiling);
     TPipe pipe;
-    KernelDeformableAggregationGrad op(
+    KernelDeformableAggregationGrad<DTYPE_MC_MS_FEAT> op(
         mc_ms_feat,
         spatial_shape,
         scale_start_index,
@@ -336,4 +339,3 @@ extern "C" __global__ __aicore__ void deformable_aggregation_grad(
     );
     op.Process();
 }
- 
\ No newline at end of file
diff --git a/model_examples/SparseDrive/README.md b/model_examples/SparseDrive/README.md
index 831f8295..299ecb7a 100644
--- a/model_examples/SparseDrive/README.md
+++ b/model_examples/SparseDrive/README.md
@@ -169,9 +169,9 @@ SparseDrive是一种基于稀疏化表征的端到端自动驾驶模型，基于
 |     阶段   |      芯片       | 卡数 | global batch size  | FPS | 平均step耗时(s) | amota | L2 |
 |:---------:|:---------------:|------|:------------------:|:----:|:--------------:|:---:|:---:|
 |   stage1   |      竞品A      | 8p | 64 | 41.0 | 1.561 | 0.3764 | - |
-|   stage1   | Atlas 800T A2   | 8p | 64 | 40.6 | 1.576 | 0.3864 | - |
+|   stage1   | Atlas 800T A2   | 8p | 64 | 46.3 | 1.382 | 0.3864 | - |
 |   stage2   |      竞品A      | 8p | 48 | 35.2 | 1.363 | - | 0.6280 |
-|   stage2   | Atlas 800T A2   | 8p | 48 | 33.2 | 1.445 | - | 0.6069 |
+|   stage2   | Atlas 800T A2   | 8p | 48 | 37.9 | 1.265 | - | 0.6069 |
 
 
 
@@ -179,6 +179,8 @@ SparseDrive是一种基于稀疏化表征的端到端自动驾驶模型，基于
 ## 变更
 2025.04.27：首次发布。
 
+2025.07.07：瓶颈算子优化，刷新性能数据
+
 ## FAQ
 暂无。
 
diff --git a/mx_driving/ops/npu_deformable_aggregation.py b/mx_driving/ops/npu_deformable_aggregation.py
index d5ccbbb3..1940802d 100644
--- a/mx_driving/ops/npu_deformable_aggregation.py
+++ b/mx_driving/ops/npu_deformable_aggregation.py
@@ -21,11 +21,11 @@ class AdsDeformableAggregation(Function):
         if (torch.numel(mc_ms_feat) == 0 or torch.numel(weights) == 0):
             raise Exception("Erorr! Input Tensor can not be a empty Tensor.\n")
 
-        mc_ms_feat = mc_ms_feat.contiguous().float()
+        mc_ms_feat = mc_ms_feat.contiguous()
         spatial_shape = spatial_shape.contiguous().int()
         scale_start_index = scale_start_index.contiguous().int()
-        sampling_location = sampling_location.contiguous().float()
-        weights = weights.contiguous().float()
+        sampling_location = sampling_location.contiguous()
+        weights = weights.contiguous()
 
         output = mx_driving._C.npu_deformable_aggregation(
             mc_ms_feat,
@@ -55,15 +55,14 @@ class AdsDeformableAggregation(Function):
 
         if (torch.numel(mc_ms_feat) == 0 or torch.numel(spatial_shape) == 0 or torch.numel(sampling_location) == 0):
             raise Exception("Erorr! Input Tensor can not be a empty Tensor.\n")
-        npu_device = mc_ms_feat.device
-        mc_ms_feat = mc_ms_feat.contiguous().float()
+        mc_ms_feat = mc_ms_feat.contiguous()
         spatial_shape = spatial_shape.contiguous().int()
         scale_start_index = scale_start_index.contiguous().int()
-        sampling_location = sampling_location.contiguous().float()
-        weights = weights.contiguous().float()
+        sampling_location = sampling_location.contiguous()
+        weights = weights.contiguous()
 
         grad_mc_ms_feat = torch.zeros_like(mc_ms_feat)
-        grad_sampling_location_padding = torch.zeros((sampling_location.shape[0], sampling_location.shape[1], sampling_location.shape[2], sampling_location.shape[3], 8), device=npu_device)
+        grad_sampling_location = torch.zeros_like(sampling_location)
         grad_weights = torch.zeros_like(weights)
         grad_mc_ms_feat, grad_sampling_location, grad_weights = mx_driving._C.npu_deformable_aggregation_backward(
             mc_ms_feat,
@@ -73,10 +72,9 @@ class AdsDeformableAggregation(Function):
             weights,
             grad_output.contiguous(),
             grad_mc_ms_feat,
-            grad_sampling_location_padding,
+            grad_sampling_location,
             grad_weights,
         )
-        grad_sampling_location, _, _, _ = torch.chunk(grad_sampling_location_padding, 4, dim=-1)
         return (
             grad_mc_ms_feat,
             None,
-- 
Gitee