diff --git a/.gitignore b/.gitignore index 9009958ceff818ec9a73be5894b0778432c437fe..414718b357f64052481506c064b9f396d6eb74cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -__pycache__/ -.DS_Store -.idea +__pycache__/ +.DS_Store +.idea cmake-build-debug \ No newline at end of file diff --git a/LICENSE b/LICENSE index 43e15a4372b546976d1d80ed6c301636baba1c9f..cadb24304d9f5f9902693376a3575ab7c6df35a4 100644 --- a/LICENSE +++ b/LICENSE @@ -29,3 +29,207 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright 2018-2019 Open-MMLab. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2019 Open-MMLab. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ads/common/ops/kernels/op_kernel/CMakeLists.txt b/ads/common/ops/kernels/op_kernel/CMakeLists.txt index 0cf5021494806dab37271b1c052809e09760f1b8..c51870f18c2b530409e6df2c4529ab5a63b32953 100644 --- a/ads/common/ops/kernels/op_kernel/CMakeLists.txt +++ b/ads/common/ops/kernels/op_kernel/CMakeLists.txt @@ -1,4 +1,4 @@ file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) set(ASCEND_KERNEL_SRC - ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} - CACHE INTERNAL "") + ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} + CACHE INTERNAL "") diff --git a/ads/perception/fused/__init__.py b/ads/perception/fused/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..30f8aefb4911ded8289ee01e5ee5ac7400b2df05 100644 --- a/ads/perception/fused/__init__.py +++ b/ads/perception/fused/__init__.py @@ -0,0 +1,2 @@ +from .ops.bev_pool import bev_pool +from .ops.bev_pool_v2 import bev_pool_v2 \ No newline at end of file diff --git a/ads/perception/fused/ops/__init__.py b/ads/perception/fused/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ads/perception/fused/ops/bev_pool.py b/ads/perception/fused/ops/bev_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..3d31e6e2ebae68a76486c46d5c30af153678673a --- /dev/null +++ b/ads/perception/fused/ops/bev_pool.py @@ -0,0 +1,64 @@ +import ads_c +import torch + + +class BEVPool(torch.autograd.Function): + @staticmethod + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward(ctx, feat, geom_feat, ranks, B, D, H, W): + kept = torch.ones(feat.shape[0], device=feat.device, dtype=torch.bool) + kept[1:] = ranks[1:] != ranks[:-1] + interval_starts = torch.where(kept)[0].int() + interval_lengths = torch.zeros_like(interval_starts, dtype=torch.int32) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = feat.shape[0] - interval_starts[-1] + geom_feat = geom_feat.int() + + out = ads_c.npu_bev_pool( + feat, + geom_feat, + interval_lengths, + interval_starts, + B, + D, + H, + W, + ) + + ctx.save_for_backward(interval_starts, interval_lengths, geom_feat) + ctx.saved_shapes = B, D, H, W + return out + + @staticmethod + # pylint: disable=too-many-return-values + def backward(ctx, grad_out): + interval_starts, interval_lengths, geom_feat = 
ctx.saved_tensors + B, D, H, W = ctx.saved_shapes + + grad_out = grad_out.contiguous() + grad_feat = ads_c.npu_bev_pool_backward( + grad_out, + geom_feat, + interval_lengths, + interval_starts, + B, + D, + H, + W, + ) + + return grad_feat, None, None, None, None, None, None + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def bev_pool(feat, geom_feat, B, D, H, W): + if feat.shape[0] != geom_feat.shape[0]: + raise ValueError("The number of features and geometry features should be the same.") + + ranks = geom_feat[:, 0] * (W * D * B) + geom_feat[:, 1] * (D * B) + geom_feat[:, 2] * B + geom_feat[:, 3] + indices = ranks.argsort() + feat, geom_feat, ranks = feat[indices], geom_feat[indices], ranks[indices] + + out = BEVPool.apply(feat, geom_feat, ranks, B, D, H, W) + out = out.permute(0, 4, 1, 2, 3).contiguous() + return out diff --git a/ads/perception/fused/ops/bev_pool_v2.py b/ads/perception/fused/ops/bev_pool_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..a208276e67363228c591f228dbcb2f1c102dc43f --- /dev/null +++ b/ads/perception/fused/ops/bev_pool_v2.py @@ -0,0 +1,104 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (c) Phigent Robotics. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
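A minimal smoke-test sketch for the bev_pool wrapper added above in ads/perception/fused/ops/bev_pool.py — an illustration with made-up shapes, not part of this change. It assumes the ads_c extension has been built, that the Ascend PyTorch adapter (torch_npu) is importable so .npu() is available, and that the geom_feat columns index the h, w and d axes of the output grid plus the batch index, as suggested by the stride layout in the host/kernel code below:

    import torch
    import torch_npu  # assumption: Ascend PyTorch adapter is installed
    from ads.perception.fused import bev_pool

    B, D, H, W, C, N = 1, 1, 8, 8, 32, 100           # made-up sizes
    feat = torch.rand(N, C).npu()                    # (n, c) point features
    geom_feat = torch.stack([
        torch.randint(0, H, (N,)),                   # assumed: indexes the h axis
        torch.randint(0, W, (N,)),                   # assumed: indexes the w axis
        torch.zeros(N, dtype=torch.long),            # d index (D == 1 here)
        torch.zeros(N, dtype=torch.long),            # batch index (B == 1 here)
    ], dim=1).int().npu()                            # (n, 4) integer coords

    out = bev_pool(feat, geom_feat, B, D, H, W)
    print(out.shape)                                 # (B, C, D, H, W) after the permute in bev_pool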
+ +import ads_c +import torch + + +class BEVPoolV2(torch.autograd.Function): + @staticmethod + # pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward(ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, + bev_feat_shape, interval_starts, interval_lengths): + ranks_bev = ranks_bev.int() + depth = depth.contiguous().float() + feat = feat.contiguous().float() + ranks_depth = ranks_depth.contiguous().int() + ranks_feat = ranks_feat.contiguous().int() + interval_lengths = interval_lengths.contiguous().int() + interval_starts = interval_starts.contiguous().int() + + (B, D, H, W, C) = bev_feat_shape + out = ads_c.npu_bev_pool_v2( + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths, + interval_starts, + B, + D, + H, + W + ) + + ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth) + ctx.saved_shapes = B, D, H, W + return out + + @staticmethod + # pylint: disable=too-many-return-values + def backward(ctx, grad_out): + ranks_bev, depth, feat, ranks_feat, ranks_depth = ctx.saved_tensors + B, D, H, W = ctx.saved_shapes + + order = ranks_feat.argsort() + ranks_feat, ranks_depth, ranks_bev = \ + ranks_feat[order], ranks_depth[order], ranks_bev[order] + kept = torch.ones( + ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_feat[1:] != ranks_feat[:-1] + interval_starts_bp = torch.where(kept)[0].int() + interval_lengths_bp = torch.zeros_like(interval_starts_bp) + interval_lengths_bp[:-1] = interval_starts_bp[ + 1:] - interval_starts_bp[:-1] + interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1] + + depth = depth.contiguous() + feat = feat.contiguous() + ranks_depth = ranks_depth.contiguous() + ranks_feat = ranks_feat.contiguous() + ranks_bev = ranks_bev.contiguous() + interval_lengths_bp = interval_lengths_bp.contiguous() + interval_starts_bp = interval_starts_bp.contiguous() + grad_out = grad_out.contiguous() + + grad_depth, grad_feat = ads_c.npu_bev_pool_v2_backward( + grad_out, + depth, + feat, + ranks_depth, + ranks_feat, + ranks_bev, + interval_lengths_bp, + interval_starts_bp, + B, + D, + H, + W + ) + return grad_depth, grad_feat, None, None, None, None, None, None + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, + bev_feat_shape, interval_starts, interval_lengths): + x = BEVPoolV2.apply( + depth, feat, ranks_depth, ranks_feat, ranks_bev, + bev_feat_shape, interval_starts, interval_lengths + ) + x = x.permute(0, 4, 1, 2, 3).contiguous() + return x diff --git a/ads/perception/fused/ops/csrc/BEVPool.cpp b/ads/perception/fused/ops/csrc/BEVPool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..47332f0711aed7388ed81faf2c82c5a600f07ee6 --- /dev/null +++ b/ads/perception/fused/ops/csrc/BEVPool.cpp @@ -0,0 +1,64 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
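Similarly, a sketch of how the bev_pool_v2 wrapper defined above is expected to be driven. The rank/interval tensors below are hand-built toy values chosen only to be self-consistent (ranks_bev already sorted, one interval per touched BEV cell); real callers precompute them from the view transform. Same assumptions as the previous snippet (ads_c built, torch_npu available, made-up shapes):

    import torch
    import torch_npu  # assumption: Ascend PyTorch adapter is installed
    from ads.perception.fused import bev_pool_v2

    B, N_cam, D_in, H_in, W_in, C = 1, 1, 4, 8, 8, 32     # image-view dims (made up)
    D_out, H_out, W_out = 1, 16, 16                        # BEV grid dims (made up)

    depth = torch.rand(B, N_cam, D_in, H_in, W_in).npu()   # (b, n, d, h, w)
    feat = torch.rand(B, N_cam, H_in, W_in, C).npu()       # (b, n, h, w, c)

    n_points = 6
    ranks_depth = torch.randint(0, B * N_cam * D_in * H_in * W_in, (n_points,)).int().npu()
    ranks_feat = torch.randint(0, B * N_cam * H_in * W_in, (n_points,)).int().npu()
    ranks_bev = torch.tensor([0, 0, 1, 1, 2, 2]).int().npu()    # sorted by BEV cell
    interval_starts = torch.tensor([0, 2, 4]).int().npu()
    interval_lengths = torch.tensor([2, 2, 2]).int().npu()

    bev_feat_shape = (B, D_out, H_out, W_out, C)
    out = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
                      bev_feat_shape, interval_starts, interval_lengths)
    print(out.shape)                                        # (B, C, D_out, H_out, W_out)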
+ + +#include "csrc/OpApiCommon.h" +#include "functions.h" + +namespace { +constexpr int64_t N_IDX = 0; +constexpr int64_t C_IDX = 1; +constexpr int64_t N_INTERVAL_IDX = 0; + +void check_npu(const at::Tensor& feat, const at::Tensor& geom_feat, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts) +{ + TORCH_CHECK_NPU(feat); + TORCH_CHECK_NPU(geom_feat); + TORCH_CHECK_NPU(interval_lengths); + TORCH_CHECK_NPU(interval_starts); +} +} // namespace + +/** + * @brief pillar pooling, bev_pool + * @param feat: input feature, 2D tensor(n, c) + * @param geom_feat: input coords, 2D tensor(n, 4) + * @param interval_lengths: the number of points in each interval, 1D tensor(n_interval) + * @param interval_starts: starting position for pooled point, 1D tensor(n_interval) + * @param b: batch_size, int64 + * @param d: depth, int64 + * @param h: height, int64 + * @param w: width, int64 + * @return out: output feature, 5D tensor(b, d, h, w, c) + */ +at::Tensor npu_bev_pool(const at::Tensor& feat, const at::Tensor& geom_feat, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w) +{ + TORCH_CHECK(feat.dim() == 2, "feat must be 2D tensor(n, c)"); + TORCH_CHECK(geom_feat.dim() == 2, "coords must be 2D tensor(n, 4)"); + check_npu(feat, geom_feat, interval_lengths, interval_starts); + + auto n = geom_feat.size(N_IDX); + auto c = feat.size(C_IDX); + auto n_interval = interval_lengths.size(N_INTERVAL_IDX); + TORCH_CHECK( + interval_starts.size(N_INTERVAL_IDX) == n_interval, "interval_starts and interval_lengths must have same size"); + + auto out = at::zeros({b, d, h, w, c}, feat.options()); + EXEC_NPU_CMD(aclnnBEVPool, feat, geom_feat, interval_lengths, interval_starts, b, d, h, w, c, out); + return out; +} diff --git a/ads/perception/fused/ops/csrc/BEVPoolBackward.cpp b/ads/perception/fused/ops/csrc/BEVPoolBackward.cpp new file mode 100644 index 0000000000000000000000000000000000000000..262c4584a736d6aa63874faeba886d24d790756f --- /dev/null +++ b/ads/perception/fused/ops/csrc/BEVPoolBackward.cpp @@ -0,0 +1,62 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
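The interval_lengths/interval_starts arguments consumed by the npu_bev_pool binding above are the run-length bookkeeping that BEVPool.forward derives from the sorted ranks. That step is plain PyTorch and can be checked on CPU without an NPU; a small worked example with made-up ranks:

    import torch

    ranks = torch.tensor([0, 0, 0, 2, 2, 5])               # sorted BEV-cell ranks, one per point
    kept = torch.ones(ranks.shape[0], dtype=torch.bool)
    kept[1:] = ranks[1:] != ranks[:-1]                     # True where a new run of equal ranks begins
    interval_starts = torch.where(kept)[0].int()           # tensor([0, 3, 5], dtype=torch.int32)
    interval_lengths = torch.zeros_like(interval_starts)
    interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
    interval_lengths[-1] = ranks.shape[0] - interval_starts[-1]
    print(interval_starts, interval_lengths)               # [0, 3, 5] and [3, 2, 1]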
+ +#include "csrc/OpApiCommon.h" +#include "functions.h" + +namespace { +constexpr int64_t N_IDX = 0; +constexpr int64_t C_IDX = 4; +constexpr int64_t N_INTERVAL_IDX = 0; + +void check_npu(const at::Tensor& grad_out, const at::Tensor& geom_feat, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts) +{ + TORCH_CHECK_NPU(grad_out); + TORCH_CHECK_NPU(geom_feat); + TORCH_CHECK_NPU(interval_lengths); + TORCH_CHECK_NPU(interval_starts); +} +} // namespace + +/** + * @brief pillar pooling, bev_pool_backward + * @param grad_out: input grad, 5D tensor(b, d, h, w, c) + * @param geom_feat: input coords, 2D tensor(n, 4) + * @param interval_lengths: the number of points in each interval, 1D tensor(n_interval) + * @param interval_starts: starting position for pooled point, 1D tensor(n_interval) + * @param b: batch_size, int64 + * @param d: depth, int64 + * @param h: height, int64 + * @param w: width, int64 + * @return grad_feat: output grad, 2D tensor(n, c) + */ +at::Tensor npu_bev_pool_backward(const at::Tensor& grad_out, const at::Tensor& geom_feat, + const at::Tensor& interval_lengths, const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w) +{ + TORCH_CHECK(grad_out.dim() == 5, "grad_out must be 5D tensor(b, d, h, w, c)"); + TORCH_CHECK(geom_feat.dim() == 2, "coords must be 2D tensor(n, 4)"); + check_npu(grad_out, geom_feat, interval_lengths, interval_starts); + auto n = geom_feat.size(N_IDX); + auto c = grad_out.size(C_IDX); + auto n_interval = interval_lengths.size(N_INTERVAL_IDX); + TORCH_CHECK( + interval_starts.size(N_INTERVAL_IDX) == n_interval, "interval_starts and interval_lengths must have same size"); + + auto grad_feat = at::zeros({n, c}, grad_out.options()); + EXEC_NPU_CMD(aclnnBEVPoolGrad, grad_out, geom_feat, interval_lengths, interval_starts, b, d, h, w, c, grad_feat); + return grad_feat; +} diff --git a/ads/perception/fused/ops/csrc/BEVPoolV2.cpp b/ads/perception/fused/ops/csrc/BEVPoolV2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2268d7a371d6437a1417c3be3492ad70d7b01a0 --- /dev/null +++ b/ads/perception/fused/ops/csrc/BEVPoolV2.cpp @@ -0,0 +1,63 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
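The npu_bev_pool_backward binding above is what BEVPool.backward dispatches to, so gradients can be sanity-checked end to end through the Python wrapper. A sketch under the same assumptions as before (ads_c built, torch_npu available, made-up shapes); grad_feat comes back with the (n, c) shape documented above:

    import torch
    import torch_npu  # assumption: Ascend PyTorch adapter is installed
    from ads.perception.fused import bev_pool

    B, D, H, W, C, N = 1, 1, 8, 8, 32, 100
    feat = torch.rand(N, C).npu().requires_grad_()
    geom_feat = torch.stack([
        torch.randint(0, H, (N,)),
        torch.randint(0, W, (N,)),
        torch.zeros(N, dtype=torch.long),                  # d index (D == 1)
        torch.zeros(N, dtype=torch.long),                  # batch index (B == 1)
    ], dim=1).int().npu()

    out = bev_pool(feat, geom_feat, B, D, H, W)            # (B, C, D, H, W)
    out.sum().backward()
    print(feat.grad.shape)                                 # torch.Size([100, 32]), i.e. (n, c)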
+ + +#include "csrc/OpApiCommon.h" +#include "functions.h" + +namespace { +constexpr int64_t C_IDX = 4; + +void check_npu(const at::Tensor& depth, const at::Tensor& feat, const at::Tensor& ranks_depth, + const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts) +{ + TORCH_CHECK_NPU(depth); + TORCH_CHECK_NPU(feat); + TORCH_CHECK_NPU(ranks_depth); + TORCH_CHECK_NPU(ranks_feat); + TORCH_CHECK_NPU(ranks_bev); + TORCH_CHECK_NPU(interval_lengths); + TORCH_CHECK_NPU(interval_starts); +} +} // namespace + +/** + * @brief pillar pooling, bev_pool_v2_backward + * @param depth: input depth, 5D tensor(b, n, d, h, w) + * @param feat: input feature, 5D tensor(b, n, h, w, c) + * @param ranks_depth: input depth rank, 1D tensor + * @param ranks_feat: input feature rank, 1D tensor + * @param ranks_bev: input bev rank, 1D tensor + * @param interval_lengths: the number of points in each interval, 1D tensor(n_interval) + * @param interval_starts: starting position for pooled point, 1D tensor(n_interval) + * @param b: batch_size, int64 + * @param d: depth, int64 + * @param h: height, int64 + * @param w: width, int64 + * @return out: output feature, 5D tensor(b, d, h, w, c) + */ +at::Tensor npu_bev_pool_v2(const at::Tensor& depth, const at::Tensor& feat, const at::Tensor& ranks_depth, + const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w) +{ + check_npu(depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_lengths, interval_starts); + auto c = feat.size(C_IDX); + auto out = at::zeros({b, d, h, w, c}, feat.options()); + EXEC_NPU_CMD(aclnnBEVPoolV2, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_lengths, interval_starts, b, + d, h, w, c, out); + return out; +} diff --git a/ads/perception/fused/ops/csrc/BEVPoolV2Backward.cpp b/ads/perception/fused/ops/csrc/BEVPoolV2Backward.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ebeca36c0c52e16824b1007dc9f99ca6aeb5aa56 --- /dev/null +++ b/ads/perception/fused/ops/csrc/BEVPoolV2Backward.cpp @@ -0,0 +1,68 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include "csrc/OpApiCommon.h" +#include "functions.h" + +namespace { +constexpr int64_t C_IDX = 4; +void check_npu(const at::Tensor& depth, const at::Tensor& feat, const at::Tensor& ranks_depth, + const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts) +{ + TORCH_CHECK_NPU(depth); + TORCH_CHECK_NPU(feat); + TORCH_CHECK_NPU(ranks_depth); + TORCH_CHECK_NPU(ranks_feat); + TORCH_CHECK_NPU(ranks_bev); + TORCH_CHECK_NPU(interval_lengths); + TORCH_CHECK_NPU(interval_starts); +} +} // namespace + +/** + * @brief pillar pooling, bev_pool_v2_backward + * @param grad_out: input grad, 5D tensor(b, d, h, w, c) + * @param depth: input depth, 5D tensor(b, n, d, h, w) + * @param feat: input feature, 5D tensor(b, n, h, w, c) + * @param ranks_depth: input depth rank, 1D tensor + * @param ranks_feat: input feature rank, 1D tensor + * @param ranks_bev: input bev rank, 1D tensor + * @param interval_lengths: the number of points in each interval, 1D tensor(n_interval) + * @param interval_starts: starting position for pooled point, 1D tensor(n_interval) + * @param b: batch_size, int64 + * @param d: depth, int64 + * @param h: height, int64 + * @param w: width, int64 + * @return grad_depth: output grad, 5D tensor(b, n, d, h, w) + * @return grad_feat: output grad, 5D tensor(b, n, h, w, c) + */ +std::tuple npu_bev_pool_v2_backward(const at::Tensor& grad_out, const at::Tensor& depth, + const at::Tensor& feat, const at::Tensor& ranks_depth, const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, + const at::Tensor& interval_lengths, const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w) +{ + check_npu(depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_lengths, interval_starts); + auto depth_sizes = depth.sizes(); + auto feat_sizes = feat.sizes(); + auto grad_depth = at::zeros(depth_sizes, depth.options()); + auto grad_feat = at::zeros(feat_sizes, depth.options()); + auto c = feat.size(C_IDX); + + EXEC_NPU_CMD(aclnnBEVPoolV2Grad, grad_out, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_lengths, + interval_starts, b, d, h, w, c, grad_depth, grad_feat); + return std::make_tuple(grad_depth, grad_feat); +} diff --git a/ads/perception/fused/ops/csrc/functions.h b/ads/perception/fused/ops/csrc/functions.h new file mode 100644 index 0000000000000000000000000000000000000000..e8335acf2d72599bc1f23185ad5c12c151672117 --- /dev/null +++ b/ads/perception/fused/ops/csrc/functions.h @@ -0,0 +1,32 @@ +// Copyright (c) 2024, Huawei Technologies.All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
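Likewise for the V2 pair: the npu_bev_pool_v2_backward binding above returns (grad_depth, grad_feat) with the same shapes as depth and feat, which BEVPoolV2.backward relies on. A short round-trip sketch with the same toy rank/interval values as the earlier bev_pool_v2 snippet (same assumptions: ads_c built, torch_npu available, made-up shapes):

    import torch
    import torch_npu  # assumption: Ascend PyTorch adapter is installed
    from ads.perception.fused import bev_pool_v2

    B, N_cam, D_in, H_in, W_in, C = 1, 1, 4, 8, 8, 32
    depth = torch.rand(B, N_cam, D_in, H_in, W_in).npu().requires_grad_()
    feat = torch.rand(B, N_cam, H_in, W_in, C).npu().requires_grad_()
    ranks_depth = torch.randint(0, B * N_cam * D_in * H_in * W_in, (6,)).int().npu()
    ranks_feat = torch.randint(0, B * N_cam * H_in * W_in, (6,)).int().npu()
    ranks_bev = torch.tensor([0, 0, 1, 1, 2, 2]).int().npu()
    interval_starts = torch.tensor([0, 2, 4]).int().npu()
    interval_lengths = torch.tensor([2, 2, 2]).int().npu()

    out = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
                      (B, 1, 16, 16, C), interval_starts, interval_lengths)
    out.sum().backward()
    print(depth.grad.shape, feat.grad.shape)   # same shapes as depth and feat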
+ +#ifndef PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ +#define PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ +#include +#include + + +at::Tensor npu_bev_pool(const at::Tensor& feat, const at::Tensor& geom_feat, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w); +at::Tensor npu_bev_pool_backward(const at::Tensor& grad_out, const at::Tensor& geom_feat, + const at::Tensor& interval_lengths, const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w); + +at::Tensor npu_bev_pool_v2(const at::Tensor& depth, const at::Tensor& feat, const at::Tensor& ranks_depth, + const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, const at::Tensor& interval_lengths, + const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w); +std::tuple npu_bev_pool_v2_backward(const at::Tensor& grad_out, const at::Tensor& depth, + const at::Tensor& feat, const at::Tensor& ranks_depth, const at::Tensor& ranks_feat, const at::Tensor& ranks_bev, + const at::Tensor& interval_lengths, const at::Tensor& interval_starts, int64_t b, int64_t d, int64_t h, int64_t w); +#endif // PERCEPTION_FUSED_OPS_CSRC_FUNCTIONS_H_ diff --git a/ads/perception/fused/ops/csrc/pybind.cpp b/ads/perception/fused/ops/csrc/pybind.cpp index f37707a3d103f9e06d1391660b8ed59650ac9c9b..d4001c57067c9e5d83e0e3af956b5dbef8f8d043 100644 --- a/ads/perception/fused/ops/csrc/pybind.cpp +++ b/ads/perception/fused/ops/csrc/pybind.cpp @@ -1,5 +1,13 @@ -#include #include "csrc/pybind.h" -void init_perception_fused(pybind11::module& m) { +#include + +#include "functions.h" +void init_perception_fused(pybind11::module& m) +{ + // bev_pool + m.def("npu_bev_pool", &npu_bev_pool, "npu_bev_pool NPU version"); + m.def("npu_bev_pool_backward", &npu_bev_pool_backward, "npu_bev_pool_backward NPU version"); + m.def("npu_bev_pool_v2", &npu_bev_pool_v2, "npu_bev_pool_v2 NPU version"); + m.def("npu_bev_pool_v2_backward", &npu_bev_pool_v2_backward, "npu_bev_pool_v2_backward NPU version"); } diff --git a/ads/perception/fused/ops/kernels/op_host/CMakeLists.txt b/ads/perception/fused/ops/kernels/op_host/CMakeLists.txt index 75a458e050d95c942754bc7a65a55ef44c004832..7e8c1aa351dc3e9bfa77dd39afa8885c55943c2b 100644 --- a/ads/perception/fused/ops/kernels/op_host/CMakeLists.txt +++ b/ads/perception/fused/ops/kernels/op_host/CMakeLists.txt @@ -1,5 +1,5 @@ file(GLOB HOST_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -set( +set(ASCEND_HOST_SRC ${ASCEND_HOST_SRC} ${HOST_SRC} CACHE INTERNAL "") diff --git a/ads/perception/fused/ops/kernels/op_host/bev_pool.cpp b/ads/perception/fused/ops/kernels/op_host/bev_pool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b5fbc2e3d7e24dbdaaf6f4ce8246030ab5865ae3 --- /dev/null +++ b/ads/perception/fused/ops/kernels/op_host/bev_pool.cpp @@ -0,0 +1,407 @@ +#include +#include + +#include + +#include "bev_pool_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +namespace { +constexpr size_t FEAT_IDX = 0; +constexpr size_t GEOM_FEAT_IDX = 1; +constexpr size_t INTERVAL_IDX_V1 = 3; +constexpr size_t INTERVAL_IDX_V2 = 6; +constexpr size_t B_IDX = 0; +constexpr size_t D_IDX = 1; +constexpr size_t H_IDX = 2; +constexpr size_t W_IDX = 3; +constexpr size_t C_IDX = 4; + +constexpr int32_t TILING_ALIGN32B_FLAG = 1; +constexpr int32_t TILING_FP32_BIT = 1; +constexpr int32_t TILING_FP16_BIT = 2; +constexpr int32_t TILING_BF16_BIT = 3; + +int32_t GetTilingKey(const ge::DataType dtype, 
optiling::BEVPoolTilingData& tiling) +{ + auto dtypeBytes = ge::GetSizeByDataType(dtype); + int32_t cBytes = tiling.get_stride0() * dtypeBytes; + int32_t key = cBytes % 32 == 0 ? TILING_ALIGN32B_FLAG : 0; + switch (dtype) { + case ge::DT_FLOAT: + key |= 1 << TILING_FP32_BIT; + break; + case ge::DT_FLOAT16: + key |= 1 << TILING_FP16_BIT; + break; + case ge::DT_BF16: + key |= 1 << TILING_BF16_BIT; + break; + default: + break; // here, fail-safe is not a good idea + } + return key; +} + +enum Version { + V1, + V2 +}; +} // namespace + +namespace optiling { +template +static ge::graphStatus TilingForBEVPool(gert::TilingContext* context) +{ + BEVPoolTilingData tiling; + auto platform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + auto coreNum = platform.GetCoreNum(); + + auto intervalShape = + version == V1 ? context->GetInputShape(INTERVAL_IDX_V1) : context->GetInputShape(INTERVAL_IDX_V2); + int32_t nInterval = intervalShape->GetStorageShape().GetDim(0); + + int32_t usedCoreNum = std::min(static_cast(coreNum), nInterval); + tiling.set_usedCoreNum(usedCoreNum); + if (usedCoreNum == 0) { + return ge::GRAPH_FAILED; + } + auto avgTaskNum = nInterval / usedCoreNum; + auto tailTaskNum = nInterval % usedCoreNum; + tiling.set_totalTaskNum(nInterval); + tiling.set_avgTaskNum(avgTaskNum); + tiling.set_tailTaskNum(tailTaskNum); + + auto attrs = context->GetAttrs(); + if (!attrs) { + return ge::GRAPH_FAILED; + } + auto getAttr = [attrs](size_t idx) -> int32_t { + auto ptr = attrs->GetInt(idx); + if (!ptr) { + return -1; + } + return static_cast(*ptr); + }; + auto b = getAttr(B_IDX); + auto d = getAttr(D_IDX); + auto h = getAttr(H_IDX); + auto w = getAttr(W_IDX); + auto c = getAttr(C_IDX); + if (b < 0 || d < 0 || h < 0 || w < 0 || c < 0) { + return ge::GRAPH_FAILED; + } + tiling.set_stride0(c); + tiling.set_stride1(w * c); + tiling.set_stride2(h * w * c); + tiling.set_stride3(d * h * w * c); + + auto dtype = context->GetInputDesc(FEAT_IDX)->GetDataType(); + context->SetTilingKey(GetTilingKey(dtype, tiling)); + context->SetBlockDim(usedCoreNum); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static graphStatus InferShapeForBEVPool(gert::InferShapeContext* context) +{ + auto attrs = context->GetAttrs(); + auto getAttr = [attrs](size_t idx) -> int32_t { + auto ptr = attrs->GetInt(idx); + if (!ptr) { + return -1; + } + return static_cast(*ptr); + }; + auto b = getAttr(B_IDX); + auto d = getAttr(D_IDX); + auto h = getAttr(H_IDX); + auto w = getAttr(W_IDX); + auto c = getAttr(C_IDX); + if (b < 0 || d < 0 || h < 0 || w < 0 || c < 0) { + return ge::GRAPH_FAILED; + } + gert::Shape* outShape = context->GetOutputShape(0); + *outShape = {b, d, h, w, c}; + return GRAPH_SUCCESS; +} + +static graphStatus InferShapeForBEVPoolGrad(gert::InferShapeContext* context) +{ + const gert::Shape* GeomFeatShape = context->GetInputShape(GEOM_FEAT_IDX); + const auto n = static_cast(GeomFeatShape->GetDim(0)); + auto attrs = context->GetAttrs(); + auto c = static_cast(*attrs->GetInt(C_IDX)); + gert::Shape* gradfeatShape = context->GetOutputShape(0); + *gradfeatShape = {n, c}; + return GRAPH_SUCCESS; +} + +static graphStatus InferShapeForBEVPoolV2Grad(gert::InferShapeContext* context) +{ + gert::Shape* gradDepthShape = context->GetOutputShape(0); + const gert::Shape* depthShape = context->GetInputShape(1); + 
*gradDepthShape = *depthShape; + gert::Shape* gradFeatShape = context->GetOutputShape(1); + const gert::Shape* featShape = context->GetInputShape(2); + *gradFeatShape = *featShape; + return GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { +class BEVPool : public OpDef { +public: + explicit BEVPool(const char* name) : OpDef(name) + { + this->Input("feat") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("geom_feat") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_lengths") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_starts") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Output("out") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Attr("b").AttrType(REQUIRED).Int(); + this->Attr("d").AttrType(REQUIRED).Int(); + this->Attr("h").AttrType(REQUIRED).Int(); + this->Attr("w").AttrType(REQUIRED).Int(); + this->Attr("c").AttrType(REQUIRED).Int(); + + this->SetInferShape(ge::InferShapeForBEVPool); + + this->AICore().SetTiling(optiling::TilingForBEVPool); + this->AICore().AddConfig("ascend910b"); + } +}; + +/** + * @brief: BEVPoolGrad, the backward of bev_pool + * @par Inputs: + * grad_out: input grad, 5D tensor(b, d, h, w, c), dtype: float32, format: + * NDHWC, ND geom_feat: input coords, 2D tensor(n, 4), dtype: int32, format: ND + * interval_starts: starting position for pooled point, 1D tensor(n_interval), + * dtype: int32, format: ND interval_lengths: the number of points in each + * interval, 1D tensor(n_interval), dtype: int32, format: ND + * @par Outputs: + * grad_feat: output grad, 2D tensor(n, c), dtype: float32, format: ND + * @par Attributes: + * b: batch size, type: int + * d: depth, type: int + * w: width, type: int + * h: height, type: int + * n: number of points, type: int + * c: channels, type: int + **/ +class BEVPoolGrad : public OpDef { +public: + explicit BEVPoolGrad(const char* name) : OpDef(name) + { + this->Input("grad_out") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("geom_feat") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_lengths") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_starts") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + 
.Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Output("grad_feat") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Attr("b").AttrType(REQUIRED).Int(); + this->Attr("d").AttrType(REQUIRED).Int(); + this->Attr("h").AttrType(REQUIRED).Int(); + this->Attr("w").AttrType(REQUIRED).Int(); + this->Attr("c").AttrType(REQUIRED).Int(); + + this->SetInferShape(ge::InferShapeForBEVPoolGrad); + + this->AICore().SetTiling(optiling::TilingForBEVPool); + this->AICore().AddConfig("ascend910b"); + } +}; + +class BEVPoolV2 : public OpDef { +public: + explicit BEVPoolV2(const char* name) : OpDef(name) + { + this->Input("depth") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("feat") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("ranks_depth") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("ranks_feat") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("ranks_bev") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_lengths") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_starts") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Output("out") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Attr("b").AttrType(REQUIRED).Int(); + this->Attr("d").AttrType(REQUIRED).Int(); + this->Attr("h").AttrType(REQUIRED).Int(); + this->Attr("w").AttrType(REQUIRED).Int(); + this->Attr("c").AttrType(REQUIRED).Int(); + + this->SetInferShape(ge::InferShapeForBEVPool); + + this->AICore().SetTiling(optiling::TilingForBEVPool); + this->AICore().AddConfig("ascend910b"); + } +}; + +/** + * @brief: BEVPoolGrad, the backward of bev_pool + * @par Inputs: + * grad_out: input grad, 5D tensor(b, d, h, w, c), dtype: float32, format: + * NDHWC, ND geom_feat: input coords, 2D tensor(n, 4), dtype: int32, format: ND + * interval_starts: starting position for pooled point, 1D tensor(n_interval), + * dtype: int32, format: ND interval_lengths: the number of points in each + * interval, 1D tensor(n_interval), dtype: int32, format: ND + * 
@par Outputs: + * grad_feat: output grad, 2D tensor(n, c), dtype: float32, format: ND + * @par Attributes: + **/ +class BEVPoolV2Grad : public OpDef { +public: + explicit BEVPoolV2Grad(const char* name) : OpDef(name) + { + this->Input("grad_out") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("depth") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("feat") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("ranks_depth") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("ranks_feat") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("ranks_bev") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_lengths") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Input("interval_starts") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Output("grad_depth") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("grad_feat") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16, ge::DT_BF16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->Attr("b").AttrType(REQUIRED).Int(); + this->Attr("d").AttrType(REQUIRED).Int(); + this->Attr("h").AttrType(REQUIRED).Int(); + this->Attr("w").AttrType(REQUIRED).Int(); + this->Attr("c").AttrType(REQUIRED).Int(); + + this->SetInferShape(ge::InferShapeForBEVPoolV2Grad); + + this->AICore().SetTiling(optiling::TilingForBEVPool); + this->AICore().AddConfig("ascend910b"); + } +}; +OP_ADD(BEVPool); +OP_ADD(BEVPoolGrad); +OP_ADD(BEVPoolV2); +OP_ADD(BEVPoolV2Grad); +} // namespace ops diff --git a/ads/perception/fused/ops/kernels/op_host/bev_pool_tiling.h b/ads/perception/fused/ops/kernels/op_host/bev_pool_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..5f2dc06c684b70a38d39ffcf6463381e4946fd24 --- /dev/null +++ b/ads/perception/fused/ops/kernels/op_host/bev_pool_tiling.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. 
+ */ +#ifndef BEV_POOL_TILING_H +#define BEV_POOL_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(BEVPoolTilingData) +TILING_DATA_FIELD_DEF(int32_t, usedCoreNum) +TILING_DATA_FIELD_DEF(int32_t, avgTaskNum) +TILING_DATA_FIELD_DEF(int32_t, tailTaskNum) +TILING_DATA_FIELD_DEF(int32_t, totalTaskNum) +TILING_DATA_FIELD_DEF(int32_t, stride0) +TILING_DATA_FIELD_DEF(int32_t, stride1) +TILING_DATA_FIELD_DEF(int32_t, stride2) +TILING_DATA_FIELD_DEF(int32_t, stride3) +END_TILING_DATA_DEF + +REGISTER_TILING_DATA_CLASS(BEVPool, BEVPoolTilingData) +REGISTER_TILING_DATA_CLASS(BEVPoolGrad, BEVPoolTilingData) +REGISTER_TILING_DATA_CLASS(BEVPoolV2, BEVPoolTilingData) +REGISTER_TILING_DATA_CLASS(BEVPoolV2Grad, BEVPoolTilingData) +} // namespace optiling +#endif // BEV_POOL_TILING_H diff --git a/ads/perception/fused/ops/kernels/op_kernel/CMakeLists.txt b/ads/perception/fused/ops/kernels/op_kernel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c51870f18c2b530409e6df2c4529ab5a63b32953 --- /dev/null +++ b/ads/perception/fused/ops/kernels/op_kernel/CMakeLists.txt @@ -0,0 +1,4 @@ +file(GLOB KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +set(ASCEND_KERNEL_SRC + ${ASCEND_KERNEL_SRC} ${KERNEL_SRC} + CACHE INTERNAL "") diff --git a/ads/perception/fused/ops/kernels/op_kernel/bev_pool.cpp b/ads/perception/fused/ops/kernels/op_kernel/bev_pool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..930918615ccaf1d88c8186bef3fd78035e41d79f --- /dev/null +++ b/ads/perception/fused/ops/kernels/op_kernel/bev_pool.cpp @@ -0,0 +1,60 @@ +#include "bev_pool.h" +using namespace AscendC; + +namespace BEVPool { +template +__aicore__ inline void BEVPoolKernel::DoProcess() +{ + LocalTensor outT = outQue_.AllocTensor(); // wait_flag(met3, v) + Duplicate(outT, T(0.f), this->alignUpCCount_); // pipe_v + for (int32_t i = 0; i < this->length_; ++i) { + LocalTensor featT = featQue_.AllocTensor(); // wait_flag(v, mte2) + DataCopy(featT, this->fGm_[this->featOffset_], this->cpFeatParams_); // met2 + featQue_.EnQue(featT); // set_flag(mte2, v) + featT = featQue_.DeQue(); // wait_flag(mte2, v) + Add(outT, featT, outT, this->alignUpCCount_); // pipe_v + featQue_.FreeTensor(featT); // set_flag(v, mte2) + this->featOffset_ += this->stride0_; + } + outQue_.EnQue(outT); // set_flag(v, mte3) + + outT = outQue_.DeQue(); // wait_flag(v, mte3) + if (Align32B) { + DataCopy(this->oGm_[this->outOffset_], outT, this->cpFeatParams_); // mte3 + } else { + DataCopyPad(this->oGm_[this->outOffset_], outT, this->cpPadParams_); // mte3 + } + outQue_.FreeTensor(outT); // set_flag(mte3, v) +} +} // namespace BEVPool + +extern "C" __global__ __aicore__ void bev_pool(GM_ADDR feat, GM_ADDR geomFeat, GM_ADDR intervalLengths, + GM_ADDR intervalStarts, GM_ADDR out, GM_ADDR workspace, GM_ADDR tiling) +{ + GET_TILING_DATA(bevPoolTiling, tiling); + int32_t blkIdx = GetBlockIdx(); + int32_t c = bevPoolTiling.stride0; // channel +// the tiling key represented as below: +// +----+----+----+-----+ +// |bf16|fp16|fp32|align| +// +----+----+----+-----+ +#if __CCE_AICORE__ == 220 + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); +#endif + if (TILING_KEY_IS(3)) { // 1 << BEVPool::TILING_FP32_BIT | BEVPool::TILING_ALIGN32B_FLAG + const int32_t cBytes = c * sizeof(float); + const int32_t divCeilC = DivCeil(cBytes, ONE_BLK_SIZE); + const int32_t alignUpCBytes = divCeilC * ONE_BLK_SIZE; + BEVPool::BEVPoolKernel op(blkIdx, cBytes, divCeilC, alignUpCBytes, 
feat, geomFeat, intervalLengths, + intervalStarts, out, bevPoolTiling); + op.Process(); + } else if (TILING_KEY_IS(2)) { // 1 << BEVPool::TILING_FP32_BIT + const int32_t cBytes = c * sizeof(float); + const int32_t divCeilC = DivCeil(cBytes, ONE_BLK_SIZE); + const int32_t alignUpCBytes = divCeilC * ONE_BLK_SIZE; + BEVPool::BEVPoolKernel op(blkIdx, cBytes, divCeilC, alignUpCBytes, feat, geomFeat, + intervalLengths, intervalStarts, out, bevPoolTiling); + op.Process(); + } // we just support fp32 at present + PipeBarrier(); +} diff --git a/ads/perception/fused/ops/kernels/op_kernel/bev_pool.h b/ads/perception/fused/ops/kernels/op_kernel/bev_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..372a9ef81b94e275eb29160403182b0b9379961a --- /dev/null +++ b/ads/perception/fused/ops/kernels/op_kernel/bev_pool.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + */ +#ifndef BEV_POOL_H_ +#define BEV_POOL_H_ + +#include "common.h" + +namespace BEVPool { +constexpr int32_t BUFFER_NUM = 2; // double buffer + +template +class TConjungateQue : public AscendC::TQueBind { +public: + __aicore__ inline TConjungateQue() = default; +}; + +template +class BEVPoolBaseKernel { +public: + __aicore__ inline BEVPoolBaseKernel() = delete; + + __aicore__ inline BEVPoolBaseKernel(int32_t blkIdx, int32_t cBytes, int32_t divCeilC, int32_t alignUpCBytes, + GM_ADDR geomFeat, GM_ADDR intervalLengths, GM_ADDR intervalStarts, const BEVPoolTilingData& bevPoolTiling) + : blkIdx_(blkIdx), it_(blkIdx, bevPoolTiling.usedCoreNum, bevPoolTiling.avgTaskNum, bevPoolTiling.tailTaskNum, + bevPoolTiling.totalTaskNum), + cpFeatParams_(1, divCeilC, 0, 0), cpOneParams_(1, 1, 0, 0), alignUpCCount_(alignUpCBytes / sizeof(T)), + cpPadParams_(1, cBytes, 0, 0, 0) + { + stride0_ = bevPoolTiling.stride0; + stride1_ = bevPoolTiling.stride1; + stride2_ = bevPoolTiling.stride2; + stride3_ = bevPoolTiling.stride3; + + sGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(intervalStarts)); + lGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(intervalLengths)); + gGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(geomFeat)); + + pipe_.InitBuffer(geomQue_, BUFFER_NUM, AscendC::ONE_BLK_SIZE); // geom + } + +protected: + int32_t blkIdx_; + + AscendC::TPipe pipe_; + AscendC::TQue geomQue_; + + AscendC::GlobalTensor fGm_, oGm_; + AscendC::GlobalTensor sGm_, lGm_, gGm_; + + int32_t stride0_, stride1_, stride2_, stride3_; + int32_t alignUpCCount_; + TaskIterator it_; + + AscendC::DataCopyParams cpFeatParams_, cpOneParams_; + AscendC::DataCopyExtParams cpPadParams_; + int32_t length_, featOffset_, outOffset_; + + __aicore__ inline void PreProcess(int32_t idx) + { + int32_t start = sGm_.GetValue(idx); + featOffset_ = start * stride0_; + length_ = lGm_.GetValue(idx); + + AscendC::LocalTensor geomT = geomQue_.AllocTensor(); // wait_flag(v, mte2) + DataCopy(geomT, gGm_[4 * start], cpOneParams_); // pipe_v + geomQue_.EnQue(geomT); // set_flag(mte2, v) + geomT = geomQue_.DeQue(); // wait_flag(mte2, v) + outOffset_ = geomT.GetValue(1) * stride0_ + geomT.GetValue(0) * stride1_ + geomT.GetValue(2) * stride2_ + + geomT.GetValue(3) * stride3_; + geomQue_.FreeTensor(geomT); // set_flag(v, mte2) + } +}; + +template +class BEVPoolKernel : public BEVPoolBaseKernel { +public: + __aicore__ inline BEVPoolKernel() = delete; + + __aicore__ inline BEVPoolKernel(int32_t blkIdx, int32_t cBytes, int32_t divCeilC, int32_t alignUpCBytes, + GM_ADDR feat, GM_ADDR geomFeat, GM_ADDR intervalLengths, 
GM_ADDR intervalStarts, GM_ADDR out, + const BEVPoolTilingData& bevPoolTiling) + : BEVPoolBaseKernel( + blkIdx, cBytes, divCeilC, alignUpCBytes, geomFeat, intervalLengths, intervalStarts, bevPoolTiling) + + { + this->oGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(out)); + this->fGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(feat)); + + this->pipe_.InitBuffer(featQue_, BUFFER_NUM, alignUpCBytes); + this->pipe_.InitBuffer(outQue_, BUFFER_NUM, alignUpCBytes); + } + + __aicore__ inline void Process() + { + while (this->it_.HasNext()) { + const int32_t idx = this->it_.Next(); + this->PreProcess(idx); + DoProcess(); + } + } + +private: + AscendC::TQue featQue_; + AscendC::TQue outQue_; + + __aicore__ inline void DoProcess(); +}; + +template +class BEVPoolGradKernel : public BEVPoolBaseKernel { +public: + __aicore__ inline BEVPoolGradKernel() = delete; + + __aicore__ inline BEVPoolGradKernel(int32_t blkIdx, int32_t cBytes, int32_t divCeilC, int32_t alignUpCBytes, + GM_ADDR gradOut, GM_ADDR geomFeat, GM_ADDR intervalLengths, GM_ADDR intervalStarts, GM_ADDR gradFeat, + const BEVPoolTilingData& bevPoolTiling) + : BEVPoolBaseKernel( + blkIdx, cBytes, divCeilC, alignUpCBytes, geomFeat, intervalLengths, intervalStarts, bevPoolTiling) + { + this->oGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(gradOut)); + this->fGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(gradFeat)); + + this->pipe_.InitBuffer(que_, BUFFER_NUM, alignUpCBytes); + } + + __aicore__ inline void Process() + { + while (this->it_.HasNext()) { + const int32_t idx = this->it_.Next(); + this->PreProcess(idx); + DoProcess(); + } + } + +private: + TConjungateQue que_; + + __aicore__ inline void DoProcess(); +}; +} // namespace BEVPool +#endif // BEV_POOL_H_ diff --git a/ads/perception/fused/ops/kernels/op_kernel/bev_pool_grad.cpp b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_grad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ff561c6efb5d23847c66a9e35498aa89f69b5a20 --- /dev/null +++ b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_grad.cpp @@ -0,0 +1,55 @@ +#include "bev_pool.h" +using namespace AscendC; + +namespace BEVPool { +template +__aicore__ inline void BEVPoolGradKernel::DoProcess() +{ + LocalTensor gradOutT = que_.AllocTensor(); + DataCopy(gradOutT, this->oGm_[this->outOffset_], this->cpFeatParams_); + que_.EnQue(gradOutT); // set_flag(mte2, mte3) + + gradOutT = que_.DeQue(); // wait_flag(mte2, mte3) + for (int32_t i = 0; i < this->length_; ++i) { + if (Align32B) { + DataCopy(this->fGm_[this->featOffset_], gradOutT, this->cpFeatParams_); + } else { + DataCopyPad(this->fGm_[this->featOffset_], gradOutT, this->cpPadParams_); + } + this->featOffset_ += this->stride0_; + } + que_.FreeTensor(gradOutT); // set_flag(mte3, mte2) +} +} // namespace BEVPool + +extern "C" __global__ __aicore__ void bev_pool_grad(GM_ADDR gradOut, GM_ADDR geomFeat, GM_ADDR intervalLengths, + GM_ADDR intervalStarts, GM_ADDR gradFeat, GM_ADDR workspace, GM_ADDR tiling) +{ + GET_TILING_DATA(bevPoolTiling, tiling); + int32_t blkIdx = GetBlockIdx(); + int32_t c = bevPoolTiling.stride0; // channel + +#if __CCE_AICORE__ == 220 + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); +#endif + // the tiling key represented as below: + // +----+----+----+-----+ + // |bf16|fp16|fp32|align| + // +----+----+----+-----+ + if (TILING_KEY_IS(3)) { // 1 << BEVPool::TILING_FP32_BIT | BEVPool::TILING_ALIGN32B_FLAG + const int32_t cBytes = c * sizeof(float); + const int32_t divCeilC = DivCeil(cBytes, ONE_BLK_SIZE); + const int32_t alignUpCBytes 
= divCeilC * ONE_BLK_SIZE;
+        BEVPool::BEVPoolGradKernel<float, true> op(blkIdx, cBytes, divCeilC, alignUpCBytes, gradOut, geomFeat,
+            intervalLengths, intervalStarts, gradFeat, bevPoolTiling);
+        op.Process();
+    } else if (TILING_KEY_IS(2)) { // 1 << BEVPool::TILING_FP32_BIT
+        const int32_t cBytes = c * sizeof(float);
+        const int32_t divCeilC = DivCeil(cBytes, ONE_BLK_SIZE);
+        const int32_t alignUpCBytes = divCeilC * ONE_BLK_SIZE;
+        BEVPool::BEVPoolGradKernel<float, false> op(blkIdx, cBytes, divCeilC, alignUpCBytes, gradOut, geomFeat,
+            intervalLengths, intervalStarts, gradFeat, bevPoolTiling);
+        op.Process();
+    } // only fp32 is supported at present
+    PipeBarrier<PIPE_ALL>();
+}
diff --git a/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2.cpp b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a8f31a46895b36f8cfa30c101e9ba854d3b7233
--- /dev/null
+++ b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2.cpp
@@ -0,0 +1,64 @@
+#include "bev_pool_v2.h"
+using namespace AscendC;
+
+namespace BEVPoolV2 {
+template <typename T, bool Align32B>
+__aicore__ inline void BEVPoolV2Kernel<T, Align32B>::DoProcess()
+{
+    LocalTensor<T> outT = outQue_.AllocTensor<T>(); // wait_flag(mte3, v)
+    Duplicate(outT, T(0.f), this->alignUpCCount_); // pipe_v
+    this->outOffset_ = this->rBGm_.GetValue(this->start_) * this->stride0_;
+    for (int32_t i = 0; i < this->length_; ++i) {
+        this->depthOffset_ = this->rDGm_.GetValue(this->start_ + i);
+        this->featOffset_ = this->rFGm_.GetValue(this->start_ + i) * this->stride0_;
+        T depth = this->dGm_.GetValue(this->depthOffset_);
+        LocalTensor<T> featT = this->featQue_.template AllocTensor<T>(); // wait_flag(v, mte2)
+        DataCopy(featT, this->fGm_[this->featOffset_], this->cpFeatParams_); // mte2
+        this->featQue_.EnQue(featT); // set_flag(mte2, v)
+        featT = this->featQue_.template DeQue<T>(); // wait_flag(mte2, v)
+        Muls(featT, featT, depth, this->alignUpCCount_); // pipe_v
+        Add(outT, featT, outT, this->alignUpCCount_); // pipe_v
+        this->featQue_.FreeTensor(featT); // set_flag(v, mte2)
+    }
+    outQue_.EnQue(outT); // set_flag(v, mte3)
+
+    outT = outQue_.DeQue<T>(); // wait_flag(v, mte3)
+    if (Align32B) {
+        DataCopy(this->oGm_[this->outOffset_], outT, this->cpFeatParams_); // mte3
+    } else {
+        DataCopyPad(this->oGm_[this->outOffset_], outT, this->cpPadParams_); // mte3
+    }
+    outQue_.FreeTensor(outT); // set_flag(mte3, v)
+}
+} // namespace BEVPoolV2
+
+extern "C" __global__ __aicore__ void bev_pool_v2(GM_ADDR depth, GM_ADDR feat, GM_ADDR ranksDepth, GM_ADDR ranksFeat,
+    GM_ADDR ranksBev, GM_ADDR intervalLengths, GM_ADDR intervalStarts, GM_ADDR out, GM_ADDR workspace, GM_ADDR tiling)
+{
+    GET_TILING_DATA(bevPoolTiling, tiling);
+    int32_t blkIdx = GetBlockIdx();
+    int32_t c = bevPoolTiling.stride0; // channel
+// the tiling key is represented as below:
+// +----+----+----+-----+
+// |bf16|fp16|fp32|align|
+// +----+----+----+-----+
+#if __CCE_AICORE__ == 220
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+#endif
+    if (TILING_KEY_IS(3)) { // 1 << BEVPool::TILING_FP32_BIT | BEVPool::TILING_ALIGN32B_FLAG
+        const int32_t cBytes = c * sizeof(float);
+        const int32_t divCeilC = DivCeil(cBytes, ONE_BLK_SIZE);
+        const int32_t alignUpCBytes = divCeilC * ONE_BLK_SIZE;
+        BEVPoolV2::BEVPoolV2Kernel<float, true> op(blkIdx, cBytes, divCeilC, alignUpCBytes, depth, feat, ranksDepth,
+            ranksFeat, ranksBev, intervalLengths, intervalStarts, out, bevPoolTiling);
+        op.Process();
+    } else if (TILING_KEY_IS(2)) { // 1 << BEVPool::TILING_FP32_BIT
+        const int32_t cBytes = c * sizeof(float);
+        const int32_t divCeilC = 
DivCeil(cBytes, ONE_BLK_SIZE); + const int32_t alignUpCBytes = divCeilC * ONE_BLK_SIZE; + BEVPoolV2::BEVPoolV2Kernel op(blkIdx, cBytes, divCeilC, alignUpCBytes, depth, feat, ranksDepth, + ranksFeat, ranksBev, intervalLengths, intervalStarts, out, bevPoolTiling); + op.Process(); + } // we just support fp32 at present + PipeBarrier(); +} diff --git a/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2.h b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..883e58fb5ff5e36547a321cbb2f953eebbc05f59 --- /dev/null +++ b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2.h @@ -0,0 +1,136 @@ +#ifndef BEV_POOL_V2_H +#define BEV_POOL_V2_H +#include "common.h" + +namespace BEVPoolV2 { +constexpr int32_t BUFFER_NUM = 2; // double buffer +template +class BEVPoolV2BaseKernel { +public: + __aicore__ inline BEVPoolV2BaseKernel() = delete; + + __aicore__ inline BEVPoolV2BaseKernel(int32_t blkIdx, int32_t cBytes, int32_t divCeilC, int32_t alignUpCBytes, + GM_ADDR depth, GM_ADDR feat, GM_ADDR ranksDepth, GM_ADDR ranksFeat, GM_ADDR ranksBev, GM_ADDR intervalLengths, + GM_ADDR intervalStarts, const BEVPoolTilingData& bevPoolTiling) + : blkIdx_(blkIdx), it_(blkIdx, bevPoolTiling.usedCoreNum, bevPoolTiling.avgTaskNum, bevPoolTiling.tailTaskNum, + bevPoolTiling.totalTaskNum), + cpFeatParams_(1, divCeilC, 0, 0), cpOneParams_(1, 1, 0, 0), alignUpCCount_(alignUpCBytes / sizeof(T)), + cpPadParams_(1, cBytes, 0, 0, 0) + { + stride0_ = bevPoolTiling.stride0; + stride1_ = bevPoolTiling.stride1; + stride2_ = bevPoolTiling.stride2; + stride3_ = bevPoolTiling.stride3; + + dGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(depth)); + fGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(feat)); + rDGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(ranksDepth)); + rFGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(ranksFeat)); + rBGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(ranksBev)); + sGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(intervalStarts)); + lGm_.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(intervalLengths)); + + pipe_.InitBuffer(featQue_, BUFFER_NUM, alignUpCBytes); + } + +protected: + int32_t blkIdx_; + + AscendC::TPipe pipe_; + AscendC::TQue featQue_; + AscendC::GlobalTensor rDGm_, rFGm_, rBGm_, sGm_, lGm_; + AscendC::GlobalTensor dGm_, fGm_; + + int32_t stride0_, stride1_, stride2_, stride3_; + int32_t alignUpCCount_; + TaskIterator it_; + + AscendC::DataCopyParams cpFeatParams_, cpOneParams_; + AscendC::DataCopyExtParams cpPadParams_; + int32_t start_, length_, outOffset_, featOffset_, depthOffset_; + + __aicore__ inline void PreProcess(int32_t idx) + { + length_ = lGm_.GetValue(idx); + start_ = sGm_.GetValue(idx); + } +}; + +template +class BEVPoolV2Kernel : public BEVPoolV2BaseKernel { +public: + __aicore__ inline BEVPoolV2Kernel() = delete; + + __aicore__ inline BEVPoolV2Kernel(int32_t blkIdx, int32_t cBytes, int32_t divCeilC, int32_t alignUpCBytes, + GM_ADDR depth, GM_ADDR feat, GM_ADDR ranksDepth, GM_ADDR ranksFeat, GM_ADDR ranksBev, GM_ADDR intervalLengths, + GM_ADDR intervalStarts, GM_ADDR out, const BEVPoolTilingData& bevPoolTiling) + : BEVPoolV2BaseKernel(blkIdx, cBytes, divCeilC, alignUpCBytes, depth, feat, ranksDepth, ranksFeat, + ranksBev, intervalLengths, intervalStarts, bevPoolTiling) + + { + oGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(out)); + this->pipe_.InitBuffer(outQue_, BUFFER_NUM, alignUpCBytes); + } + + __aicore__ inline void Process() + { + while (this->it_.HasNext()) { + 
+            const int32_t idx = this->it_.Next();
+            this->PreProcess(idx);
+            DoProcess();
+        }
+    }
+
+private:
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQue_;
+    AscendC::GlobalTensor<T> oGm_;
+
+    __aicore__ inline void DoProcess();
+};
+
+template <typename T, bool Align32B>
+class BEVPoolV2GradKernel : public BEVPoolV2BaseKernel<T> {
+public:
+    __aicore__ inline BEVPoolV2GradKernel() = delete;
+
+    __aicore__ inline BEVPoolV2GradKernel(int32_t blkIdx, int32_t cBytes, int32_t divCeilC, int32_t alignUpCBytes,
+        GM_ADDR gradOut, GM_ADDR depth, GM_ADDR feat, GM_ADDR ranksDepth, GM_ADDR ranksFeat, GM_ADDR ranksBev,
+        GM_ADDR intervalLengths, GM_ADDR intervalStarts, GM_ADDR gradDepth, GM_ADDR gradFeat,
+        const BEVPoolTilingData& bevPoolTiling)
+        : BEVPoolV2BaseKernel<T>(blkIdx, cBytes, divCeilC, alignUpCBytes, depth, feat, ranksDepth, ranksFeat,
+              ranksBev, intervalLengths, intervalStarts, bevPoolTiling),
+          cpDepthParams_(1, sizeof(T), 0, 0, 0)
+    {
+        gOGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(gradOut));
+        gDGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(gradDepth));
+        gFGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(gradFeat));
+
+        this->pipe_.InitBuffer(gradOutQue_, BUFFER_NUM, alignUpCBytes);
+        this->pipe_.InitBuffer(gradDepthQue_, BUFFER_NUM, AscendC::ONE_BLK_SIZE);
+        this->pipe_.InitBuffer(gradFeatQue_, BUFFER_NUM, alignUpCBytes);
+        this->pipe_.InitBuffer(workBuf_, alignUpCBytes);
+        workT_ = workBuf_.Get<T>();
+    }
+
+    __aicore__ inline void Process()
+    {
+        while (this->it_.HasNext()) {
+            const int32_t idx = this->it_.Next();
+            this->PreProcess(idx);
+            DoProcess();
+        }
+    }
+
+private:
+    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> gradOutQue_;
+    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> gradDepthQue_, gradFeatQue_;
+    AscendC::TBuf<AscendC::TPosition::VECCALC> workBuf_;
+    AscendC::GlobalTensor<T> gOGm_, gDGm_, gFGm_;
+    AscendC::LocalTensor<T> workT_;
+    AscendC::DataCopyExtParams cpDepthParams_;
+
+    __aicore__ inline void DoProcess();
+};
+} // namespace BEVPoolV2
+
+#endif // BEV_POOL_V2_H
\ No newline at end of file
diff --git a/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2_grad.cpp b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2_grad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..52e86dcfd0f5398312bdf7076e458b7421ac95e4
--- /dev/null
+++ b/ads/perception/fused/ops/kernels/op_kernel/bev_pool_v2_grad.cpp
@@ -0,0 +1,82 @@
+#include "bev_pool_v2.h"
+using namespace AscendC;
+
+namespace BEVPoolV2 {
+template <typename T, bool Align32B>
+__aicore__ inline void BEVPoolV2GradKernel<T, Align32B>::DoProcess()
+{
+    LocalTensor<T> gradFeatT = gradFeatQue_.AllocTensor<T>(); // wait_flag(mte3, v)
+    Duplicate(gradFeatT, T(0.f), this->alignUpCCount_); // pipe_v
+    for (int32_t i = 0; i < this->length_; ++i) {
+        this->depthOffset_ = this->rDGm_.GetValue(this->start_ + i);
+        this->featOffset_ = this->rFGm_.GetValue(this->start_ + i) * this->stride0_;
+        this->outOffset_ = this->rBGm_.GetValue(this->start_ + i) * this->stride0_;
+        LocalTensor<T> featT = this->featQue_.template AllocTensor<T>(); // wait_flag(v, mte2)
+        LocalTensor<T> gradOutT = gradOutQue_.AllocTensor<T>(); // wait_flag(v, mte2)
+        LocalTensor<T> gradDepthT = gradDepthQue_.AllocTensor<T>(); // wait_flag(mte3, v)
+        DataCopy(featT, this->fGm_[this->featOffset_], this->cpFeatParams_); // mte2
+        DataCopy(gradOutT, this->gOGm_[this->outOffset_], this->cpFeatParams_); // mte2
+        this->featQue_.EnQue(featT); // set_flag(mte2, v)
+        gradOutQue_.EnQue(gradOutT); // set_flag(mte2, v)
+        featT = this->featQue_.template DeQue<T>(); // wait_flag(mte2, v)
+        gradOutT = gradOutQue_.DeQue<T>(); // wait_flag(mte2, v)
+
+        // calculate gradDepth, sum of feat * gradOut
+        Mul(featT, gradOutT, featT, this->alignUpCCount_); // pipe_v
+        ReduceSum(gradDepthT, featT, workT_, this->stride0_); // pipe_v
+        this->featQue_.FreeTensor(featT); // set_flag(v, mte2)
+        gradDepthQue_.EnQue(gradDepthT); // set_flag(v, mte3)
+        gradDepthT = gradDepthQue_.DeQue<T>(); // wait_flag(v, mte3)
+        DataCopyPad(gDGm_[this->depthOffset_], gradDepthT, this->cpDepthParams_); // mte3
+        gradDepthQue_.FreeTensor(gradDepthT); // set_flag(mte3, v)
+
+        // calculate gradFeat, sum of depth * gradOut
+        T depth = this->dGm_.GetValue(this->depthOffset_);
+        Muls(gradOutT, gradOutT, depth, this->alignUpCCount_); // pipe_v
+        Add(gradFeatT, gradFeatT, gradOutT, this->alignUpCCount_); // pipe_v
+        gradOutQue_.FreeTensor(gradOutT); // set_flag(v, mte2)
+    }
+    gradFeatQue_.EnQue(gradFeatT); // set_flag(v, mte3)
+
+    gradFeatT = gradFeatQue_.DeQue<T>(); // wait_flag(v, mte3)
+    int32_t featOffset = this->rFGm_.GetValue(this->start_) * this->stride0_;
+    if (Align32B) {
+        DataCopy(this->gFGm_[featOffset], gradFeatT, this->cpFeatParams_); // mte3
+    } else {
+        DataCopyPad(this->gFGm_[featOffset], gradFeatT, this->cpPadParams_); // mte3
+    }
+    gradFeatQue_.FreeTensor(gradFeatT); // set_flag(mte3, v)
+}
+} // namespace BEVPoolV2
+
+extern "C" __global__ __aicore__ void bev_pool_v2_grad(GM_ADDR gradOut, GM_ADDR depth, GM_ADDR feat, GM_ADDR ranksDepth,
+    GM_ADDR ranksFeat, GM_ADDR ranksBev, GM_ADDR intervalLengths, GM_ADDR intervalStarts, GM_ADDR gradDepth,
+    GM_ADDR gradFeat, GM_ADDR workspace, GM_ADDR tiling)
+{
+    GET_TILING_DATA(bevPoolTiling, tiling);
+    int32_t blkIdx = GetBlockIdx();
+    int32_t c = bevPoolTiling.stride0; // channel
+// the tiling key is represented as below:
+// +----+----+----+-----+
+// |bf16|fp16|fp32|align|
+// +----+----+----+-----+
+#if __CCE_AICORE__ == 220
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+#endif
+    if (TILING_KEY_IS(3)) { // 1 << BEVPool::TILING_FP32_BIT | BEVPool::TILING_ALIGN32B_FLAG
+        const int32_t cBytes = c * sizeof(float);
+        const int32_t divCeilC = DivCeil(cBytes, ONE_BLK_SIZE);
+        const int32_t alignUpCBytes = divCeilC * ONE_BLK_SIZE;
+        BEVPoolV2::BEVPoolV2GradKernel<float, true> op(blkIdx, cBytes, divCeilC, alignUpCBytes, gradOut, depth, feat,
+            ranksDepth, ranksFeat, ranksBev, intervalLengths, intervalStarts, gradDepth, gradFeat, bevPoolTiling);
+        op.Process();
+    } else if (TILING_KEY_IS(2)) { // 1 << BEVPool::TILING_FP32_BIT
+        const int32_t cBytes = c * sizeof(float);
+        const int32_t divCeilC = DivCeil(cBytes, ONE_BLK_SIZE);
+        const int32_t alignUpCBytes = divCeilC * ONE_BLK_SIZE;
+        BEVPoolV2::BEVPoolV2GradKernel<float, false> op(blkIdx, cBytes, divCeilC, alignUpCBytes, gradOut, depth, feat,
+            ranksDepth, ranksFeat, ranksBev, intervalLengths, intervalStarts, gradDepth, gradFeat, bevPoolTiling);
+        op.Process();
+    } // only fp32 is supported at present
+    PipeBarrier<PIPE_ALL>();
+}
diff --git a/ads/perception/fused/ops/kernels/op_kernel/common.h b/ads/perception/fused/ops/kernels/op_kernel/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..2041af4985be2803dae6afeae4b2c56b59f1df1c
--- /dev/null
+++ b/ads/perception/fused/ops/kernels/op_kernel/common.h
@@ -0,0 +1,46 @@
+#ifndef COMMON_H_
+#define COMMON_H_
+
+#include "kernel_operator.h"
+
+constexpr int32_t TILING_ALIGN32B_FLAG = 1;
+constexpr int32_t TILING_FP32_BIT = 1;
+constexpr int32_t TILING_FP16_BIT = 2;
+constexpr int32_t TILING_BF16_BIT = 3;
+
+class TaskIterator {
+public:
+    __aicore__ inline TaskIterator(
+        int32_t blkIdx, int32_t blkDim, int32_t avgTaskNum, int32_t tailTaskNum, int32_t totalTaskNum)
+        : blkIdx_(blkIdx), blkDim_(blkDim), totalTaskNum_(totalTaskNum)
+    {
+        nextIdx_ = blkIdx * avgTaskNum + 
(blkIdx < tailTaskNum ? blkIdx : tailTaskNum); + endIdx_ = nextIdx_ + avgTaskNum + (blkIdx < tailTaskNum ? 1 : 0); + } + + __aicore__ inline bool HasNext() const + { + return nextIdx_ < endIdx_; + } + + __aicore__ inline int32_t Next() + { + return nextIdx_++; + } + + __aicore__ inline int32_t GetNext() const + { + return nextIdx_; + } + + __aicore__ inline int32_t GetTaskNum() const + { + return totalTaskNum_; + } + +private: + int32_t blkIdx_, blkDim_; + int32_t nextIdx_, endIdx_; + int32_t totalTaskNum_; +}; +#endif // COMMON_H_ \ No newline at end of file diff --git a/bind/pybind.cpp b/bind/pybind.cpp index d0d5eba42e2617241c8092889ace8491d3175d01..7f8c8dc7a1ea151d76ce20574ad81c7d91311cdf 100644 --- a/bind/pybind.cpp +++ b/bind/pybind.cpp @@ -3,4 +3,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { init_common(m); + init_perception_fused(m); } diff --git a/include/csrc/common.h b/include/csrc/common.h index 50d3f40cfac52f7bdf2b4dac74014c62d2368103..91d75039e9da9b373515431c20a64c5125e66dcd 100644 --- a/include/csrc/common.h +++ b/include/csrc/common.h @@ -214,4 +214,15 @@ inline void format_fresh_view(at::Tensor& x, const at::Tensor& y) { x.copy_(y); } + +inline bool is_npu(const at::Tensor& tensor) +{ +#ifdef COMPILE_WITH_XLA + return tensor.device().type() == at::kXLA; +#else + return tensor.device().type() == at::kPrivateUse1; +#endif +} + +#define TORCH_CHECK_NPU(tensor) TORCH_CHECK(is_npu(tensor), #tensor " must be NPU tensor") #endif // CSRC_COMMON_H_ diff --git a/include/csrc/pybind.h b/include/csrc/pybind.h index f94419772b1c616712ee973038b31f66d19979a9..5218dc65d40e86e341c05363b5a52b8a6b90f248 100644 --- a/include/csrc/pybind.h +++ b/include/csrc/pybind.h @@ -19,7 +19,7 @@ void init_common(pybind11::module& m); void init_motion(pybind11::module& m); -void init_percention_fused(pybind11::module& m); +void init_perception_fused(pybind11::module& m); void init_perception_point(pybind11::module& m); void init_perception_vision(pybind11::module& m); #endif // CSRC_PYBIND_H_ diff --git a/tests/torch/test_bev_pool.py b/tests/torch/test_bev_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..fdad3c572a5ee40fd4ffcd9f61689fbde96a0f09 --- /dev/null +++ b/tests/torch/test_bev_pool.py @@ -0,0 +1,63 @@ +import unittest +import torch +import numpy as np +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests +from ads.perception.fused import bev_pool + +DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def golden_bev_pool(feat, geom_feat, b, d, h, w, c): + output = np.zeros((b, d, h, w, c), dtype=np.float32) + ranks = geom_feat[:, 0] * (w * d * b) + geom_feat[:, 1] * (d * b) + geom_feat[:, 2] * b + geom_feat[:, 3] + indices = np.argsort(ranks) + feat, geom_feat, ranks = feat[indices], geom_feat[indices], ranks[indices] + kept = np.ones(feat.shape[0], dtype=bool) + kept[1:] = ranks[1:] != ranks[:-1] + interval_starts = np.where(kept)[0].astype(np.int32) + interval_lengths = np.zeros_like(interval_starts, dtype=np.int32) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = feat.shape[0] - interval_starts[-1] + for (start, length) in zip(interval_starts, interval_lengths): + geom = geom_feat[start] + for i in range(length): + output[geom[3], geom[2], geom[0], geom[1], :] += feat[start + i, :] + output = np.transpose(output, (0, 4, 1, 2, 3)) + return output, interval_starts, interval_lengths + + +# pylint: 
disable=too-many-arguments,huawei-too-many-arguments +def golden_bev_pool_grad(feat, geom_feat, interval_starts, interval_lengths, grad_output, b, d, h, w, c): + grad_feat = np.zeros_like(feat) + for (start, length) in zip(interval_starts, interval_lengths): + geom = geom_feat[start] + for i in range(length): + grad_feat[start + i, :] = grad_output[geom[3], geom[2], geom[0], geom[1], :] + + return grad_feat + + +def generate_bev_pool_data(n, c): + feat = np.random.rand(n, c).astype(np.float32) + geom_feat = np.random.randint(0, 32, (n, 4)).astype(np.int32) + out_shape = (32, 32, 32, 32, c) + return feat, geom_feat, out_shape + + +class TestBEVPool(TestCase): + @unittest.skipIf(DEVICE_NAME != 'Ascend910B', + "OP `bev_pool` is only supported on 910B, skip this ut!") + def test_bev_pool(self): + feat, geom_feat, out_shape = generate_bev_pool_data(1000, 64) + (b, d, h, w, c) = out_shape + feat_npu = torch.from_numpy(feat).npu() + geom_feat_npu = torch.from_numpy(geom_feat).npu() + out_npu = bev_pool(feat_npu, geom_feat_npu, b, d, h, w) + out_cpu, interval_starts, interval_lengths = golden_bev_pool(feat, geom_feat, b, d, h, w, c) + + self.assertRtolEqual(out_cpu, out_npu.cpu().numpy()) + +if __name__ == '__main__': + run_tests() diff --git a/tests/torch/test_bev_pool_v2.py b/tests/torch/test_bev_pool_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..4fbb61922e8ddfecc49e5a7dc54ead029d79dfe3 --- /dev/null +++ b/tests/torch/test_bev_pool_v2.py @@ -0,0 +1,84 @@ +import unittest +import torch +import numpy as np +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests +from ads.perception.fused import bev_pool_v2 + +DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def golden_bev_pool_v2(feat, geom_feat, b, d, h, w, c): + output = np.zeros((b, d, h, w, c), dtype=np.float32) + ranks = geom_feat[:, 0] * (w * d * b) + geom_feat[:, 1] * (d * b) + geom_feat[:, 2] * b + geom_feat[:, 3] + indices = np.argsort(ranks) + feat, geom_feat, ranks = feat[indices], geom_feat[indices], ranks[indices] + kept = np.ones(feat.shape[0], dtype=bool) + kept[1:] = ranks[1:] != ranks[:-1] + interval_starts = np.where(kept)[0].astype(np.int32) + interval_lengths = np.zeros_like(interval_starts, dtype=np.int32) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = feat.shape[0] - interval_starts[-1] + for (start, length) in zip(interval_starts, interval_lengths): + geom = geom_feat[start] + for i in range(length): + output[geom[3], geom[2], geom[0], geom[1], :] += feat[start + i, :] + output = np.transpose(output, (0, 4, 1, 2, 3)) + return output, interval_starts, interval_lengths + + +# pylint: disable=too-many-arguments,huawei-too-many-arguments +def golden_bev_pool_grad(feat, geom_feat, interval_starts, interval_lengths, grad_output, b, d, h, w, c): + grad_feat = np.zeros_like(feat) + for (start, length) in zip(interval_starts, interval_lengths): + geom = geom_feat[start] + for i in range(length): + grad_feat[start + i, :] = grad_output[geom[3], geom[2], geom[0], geom[1], :] + + return grad_feat + + +def generate_bev_pool_data(n, c): + feat = np.random.rand(n, c).astype(np.float32) + geom_feat = np.random.randint(0, 32, (n, 4)).astype(np.int32) + out_shape = (32, 32, 32, 32, c) + return feat, geom_feat, out_shape + + +class TestBEVPoolV2(TestCase): + @unittest.skipIf(DEVICE_NAME != 'Ascend910B', + "OP `bev_pool` is only supported on 910B, skip this ut!") + def 
test_bev_pool_v2(self): + depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9]) + depth = torch.from_numpy(depth).float().npu() + depth = depth.view(1, 1, 2, 2, 2).requires_grad_() + feat = torch.ones( + size=[1, 1, 2, 2, 2], dtype=torch.float).npu() + feat.requires_grad_() + ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().npu() + ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().npu() + ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().npu() + + kept = torch.ones( + ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) + kept[1:] = ranks_bev[1:] != ranks_bev[:-1] + interval_starts = torch.where(kept)[0].int() + interval_lengths = torch.zeros_like(interval_starts) + interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] + interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] + bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, + (1, 1, 2, 2, 2), interval_starts, interval_lengths) + loss = torch.sum(bev_feat) + loss.backward() + grad_depth = np.array([2., 2., 0., 0., 2., 0., 2., 0.]) + grad_depth = torch.from_numpy(grad_depth).float() + grad_depth = grad_depth.npu().view(1, 1, 2, 2, 2) + self.assertRtolEqual(depth.grad.cpu().numpy(), grad_depth.cpu().numpy()) + grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0., 0.]) + grad_feat = torch.from_numpy(grad_feat).float().npu().view(1, 1, 2, 2, 2) + self.assertRtolEqual(feat.grad.cpu().numpy(), grad_feat.cpu().numpy()) + + +if __name__ == '__main__': + run_tests()