diff --git a/model_examples/MapTR/.gitignore b/model_examples/MapTR/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..16b77b82098599629a4433a8a8296c8a798e7bd3
--- /dev/null
+++ b/model_examples/MapTR/.gitignore
@@ -0,0 +1,147 @@
+/*.sh
+change_submit.py
+cluster_submit.yaml
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+work_dirs
+test
+val
+ckpts
+data
+.Python
+build/
+ckpts/
+data/
+ckpts
+data
+test/
+val/
+work_dirs/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+cluster.sh
diff --git a/model_examples/MapTR/LICENSE b/model_examples/MapTR/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..c0b2dc0c52bd8e21e4bd2d3504489b4b90ff113b
--- /dev/null
+++ b/model_examples/MapTR/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Hust Vision Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/model_examples/MapTR/README.md b/model_examples/MapTR/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e921a7a03d0584497571e4e1344d2f5b70b92366
--- /dev/null
+++ b/model_examples/MapTR/README.md
@@ -0,0 +1,191 @@
+# MapTR for PyTorch
+
+## 目录
+
+- [简介](#简介)
+  - [模型介绍](#模型介绍)
+  - [支持任务列表](#支持任务列表)
+  - [代码实现](#代码实现)
+- [MapTR](#MapTR)
+  - [准备训练环境](#准备训练环境)
+  - [快速开始](#快速开始)
+    - [训练任务](#训练任务)
+- [公网地址说明](#公网地址说明)
+- [变更说明](#变更说明)
+- [FAQ](#FAQ)
+
+# 简介
+
+## 模型介绍
+
+MapTR是一种高效的端到端Transformer模型，用于在线构建矢量化高清地图（HD Map）。高清地图在自动驾驶系统中是规划的基础和关键组件，提供了丰富而精确的环境信息。MapTR提出了一种统一的置换等价建模方法，将地图元素表示为等价置换组的点集，这样不仅可以准确描述地图元素的形状，还能稳定学习过程。此外，MapTR设计了一个分层查询嵌入方案，以灵活地编码结构化地图信息，并执行分层二分匹配来学习地图元素。
+
+## 支持任务列表
+
+本仓已经支持以下模型任务类型
+
+|    模型     | 任务列表 | 是否支持 |
+| :---------: | :------: | :------: |
+| MapTR |   训练   |    ✔     |
+
+## 代码实现
+
+- 参考实现：
+  
+  ```
+  url=https://github.com/hustvl/MapTR
+  commit_id=fa420a2e756c9e19b876bdf2f6d33a097d84be73
+  ```
+
+# MapTR
+
+## 准备训练环境
+
+### 安装环境
+
+**表 1**  三方库版本支持表
+
+| 三方库  | 支持版本 |
+| :-----: | :------: |
+| PyTorch |   2.1   |
+
+### 安装昇腾环境
+
+请参考昇腾社区中《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》文档搭建昇腾环境，本仓已支持表2中软件版本。
+
+**表 2**  昇腾软件版本支持表
+
+|     软件类型      | 支持版本 |
+| :---------------: | :------: |
+| FrameworkPTAdaper | 6.0.RC3  |
+|       CANN        | 8.0.RC3  |
+|    昇腾NPU固件    | 24.1.RC3 |
+|    昇腾NPU驱动    | 24.1.RC3 |
+
+- 安装mmdet3d
+  
+  - 在模型根目录下，克隆mmdet3d仓，并进入mmdetection3d目录
+    
+    ```
+    git clone -b v1.0.0rc4 https://github.com/open-mmlab/mmdetection3d.git
+    cd mmdetection3d
+    ```
+  - 在mmdetection3d目录下，修改代码
+    
+    （1）删除requirements/runtime.txt中第3行 numba==0.53.0
+    
+    （2）修改mmdet3d/____init____.py中第22行 mmcv_maximum_version = '1.7.0'为mmcv_maximum_version = '1.7.2'
+  - 安装包
+    
+    ```
+    pip install -v -e .
+    ```
+- 安装mmcv
+  
+  - 在模型根目录下，克隆mmcv仓，并进入mmcv目录安装
+    
+    ```
+    git clone -b 1.x https://github.com/open-mmlab/mmcv
+    cd mmcv
+    MMCV_WITH_OPS=1 pip install -e . -v
+
+    MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_NPU=1 python setup.py build_ext
+    MMCV_WITH_OPS=1 FORCE_NPU=1 python setup.py develop
+    ```
+- 安装mxDriving加速库，安装branch_v6.0.0-RC3分支，具体方法参考[原仓](https://gitee.com/ascend/mxDriving)。
+- 在模型根目录下执行以下命令，安装模型对应PyTorch版本需要的依赖。
+  
+  ```
+  pip install -r requirement.txt
+  ```
+- 在当前python环境下执行`pip show pip`，得到三方包安装路径Location，记作location_path，在模型根目录下执行以下命令来替换patch。
+  
+  ```
+  bash replace_patch.sh --packages_path=location_path
+  ```
+
+### 准备数据集
+
+- 根据原仓**Prepare Dataset**章节准备数据集，数据集目录及结构如下：
+
+```
+MapTR
+├── ckpts/
+│   ├── resnet50-19c8e357.pth
+├── data/
+│   ├── can_bus/
+│   ├── nuscenes/
+│   │   ├── lidarseg/
+│   │   ├── maps/
+│   │   ├── panoptic/
+│   │   ├── samples/
+│   │   ├── v1.0-test/
+|   |   ├── v1.0-trainval/
+|   |   ├── nuscenes_infos_temporal_test_mono3d.coco.json
+|   |   ├── nuscenes_infos_temporal_train_mono3d.coco.json
+|   |   ├── nuscenes_infos_temporal_val_mono3d.coco.json
+|   |   ├── nuscenes_map_anns_val.json
+|   |   ├── nuscenes_infos_temporal_test.pkl
+|   |   ├── nuscenes_infos_temporal_train.pkl
+|   |   ├── nuscenes_infos_temporal_val.pkl
+├── patch/
+├── projects/
+├── test/
+├── tools/
+```
+
+> **说明：**
+> nuscenes数据集下的文件，通过运行以下指令生成：
+```
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes --version v1.0 --canbus ./data
+```
+
+### 准备预训练权重
+
+- 在模型根目录下，执行以下指令下载预训练权重：
+```
+mkdir ckpts
+cd ckpts 
+wget https://download.pytorch.org/models/resnet50-19c8e357.pth
+```
+
+## 快速开始
+
+### 训练任务
+
+本任务主要提供单机的8卡训练脚本。
+
+#### 开始训练
+
+1. 在模型根目录下，运行训练脚本。
+   
+   该模型支持单机8卡训练。
+   
+   - 单机8卡精度训练
+   
+   ```
+   bash test/train_8p.sh
+   ```
+   
+   - 单机8卡性能训练
+   
+   ```
+   bash test/train_8p_performance.sh
+   ```
+
+#### 训练结果
+
+| 芯片          | 卡数 | global batch size | Precision | epoch |  mAP  | 性能-单步迭代耗时(ms) |
+| ------------- | :--: | :---------------: | :-------: | :---: | :----: | :-------------------: |
+| 竞品A           |  8p  |         32         |   fp32    |  24   | 48.7 |         710          |
+| Atlas 800T A2 |  8p  |         32         |   fp32    |  24   | 48.5 |         1100          |
+
+
+# 变更说明
+
+2024.11.08：首次发布
+
+
+# FAQ
+
+无
\ No newline at end of file
diff --git a/model_examples/MapTR/patch/mmcv/_functions.py b/model_examples/MapTR/patch/mmcv/_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..460c42a3f622a0340be407e04a5f1f295491b0d4
--- /dev/null
+++ b/model_examples/MapTR/patch/mmcv/_functions.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+import torch
+from torch import Tensor
+from torch.nn.parallel._functions import _get_stream
+
+
+def scatter(input: Union[List, Tensor],
+            devices: List,
+            streams: Optional[List] = None) -> Union[List, Tensor]:
+    """Scatters tensor across multiple GPUs."""
+    if streams is None:
+        streams = [None] * len(devices)
+
+    if isinstance(input, list):
+        chunk_size = (len(input) - 1) // len(devices) + 1
+        outputs = [
+            scatter(input[i], [devices[i // chunk_size]],
+                    [streams[i // chunk_size]]) for i in range(len(input))
+        ]
+        return outputs
+    elif isinstance(input, Tensor):
+        output = input.contiguous()
+        # TODO: copy to a pinned buffer first (if copying from CPU)
+        stream = streams[0] if output.numel() > 0 else None
+        if devices != [-1]:
+            with torch.cuda.device(devices[0]), torch.cuda.stream(stream):
+                output = output.cuda(devices[0], non_blocking=True)
+
+        return output
+    else:
+        raise Exception(f'Unknown type {type(input)}.')
+
+
+def synchronize_stream(output: Union[List, Tensor], devices: List,
+                       streams: List) -> None:
+    if isinstance(output, list):
+        chunk_size = len(output) // len(devices)
+        for i in range(len(devices)):
+            for j in range(chunk_size):
+                synchronize_stream(output[i * chunk_size + j], [devices[i]],
+                                   [streams[i]])
+    elif isinstance(output, Tensor):
+        if output.numel() != 0:
+            with torch.cuda.device(devices[0]):
+                main_stream = torch.cuda.current_stream()
+                main_stream.wait_stream(streams[0])
+                output.record_stream(main_stream)
+    else:
+        raise Exception(f'Unknown type {type(output)}.')
+
+
+def get_input_device(input: Union[List, Tensor]) -> int:
+    if isinstance(input, list):
+        for item in input:
+            input_device = get_input_device(item)
+            if input_device != -1:
+                return input_device
+        return -1
+    elif isinstance(input, Tensor):
+        return input.get_device() if input.is_cuda else -1
+    else:
+        raise Exception(f'Unknown type {type(input)}.')
+
+
+class Scatter:
+
+    @staticmethod
+    def forward(target_gpus: List[int], input: Union[List, Tensor]) -> tuple:
+        input_device = get_input_device(input)
+        streams = None
+        if input_device == -1 and target_gpus != [-1]:
+            # Perform CPU to GPU copies in a background stream
+            streams = [_get_stream(torch.device("cuda", device)) for device in target_gpus]
+
+        outputs = scatter(input, target_gpus, streams)
+        # Synchronize with the copy stream
+        if streams is not None:
+            synchronize_stream(outputs, target_gpus, streams)
+
+        return tuple(outputs) if isinstance(outputs, list) else (outputs, )
\ No newline at end of file
diff --git a/model_examples/MapTR/patch/mmcv/deform_conv.py b/model_examples/MapTR/patch/mmcv/deform_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..87f03a3d216015be70ba4fa096aaca5d5c6ad39a
--- /dev/null
+++ b/model_examples/MapTR/patch/mmcv/deform_conv.py
@@ -0,0 +1,500 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Tuple, Union
+
+import torch
+import torch_npu
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair, _single
+
+from mmcv.utils import IS_MLU_AVAILABLE, deprecated_api_warning
+from ..cnn import CONV_LAYERS
+from ..utils import ext_loader, print_log
+from .modulated_deform_conv import ModulatedDeformConv2dFunction
+
+ext_module = ext_loader.load_ext('_ext', [
+    'deform_conv_forward', 'deform_conv_backward_input',
+    'deform_conv_backward_parameters'
+])
+
+
+class DeformConv2dFunction(Function):
+
+    @staticmethod
+    def symbolic(g,
+                 input,
+                 offset,
+                 weight,
+                 stride,
+                 padding,
+                 dilation,
+                 groups,
+                 deform_groups,
+                 bias=False,
+                 im2col_step=32):
+        return g.op(
+            'mmcv::MMCVDeformConv2d',
+            input,
+            offset,
+            weight,
+            stride_i=stride,
+            padding_i=padding,
+            dilation_i=dilation,
+            groups_i=groups,
+            deform_groups_i=deform_groups,
+            bias_i=bias,
+            im2col_step_i=im2col_step)
+
+    @staticmethod
+    def _npu_backward(ctx, grad_output):
+        input_tensor, weight, offset_out, offset_all, sort_index_for_npu_bp = \
+            ctx.saved_tensors
+        grad_input, grad_weight, grad_offset_all, grad_bias = \
+            torch_npu.npu_deformable_conv2dbk( #3 torch.npu_deformable_conv2dbk
+                input_tensor, grad_output, offset_out, weight, offset_all,
+                kernel_size=[weight.shape[3], weight.shape[2]],
+                stride=[1, 1, ctx.stride[0], ctx.stride[1]],
+                padding=[ctx.padding[0], ctx.padding[0], ctx.padding[1],
+                         ctx.padding[1]],
+                dilation=[1, 1, ctx.dilation[0], ctx.dilation[1]],
+                groups=ctx.groups, deformable_groups=ctx.deform_groups,
+                modulated=True)
+        grad_offset = grad_offset_all.index_select(1, sort_index_for_npu_bp)
+        return grad_input, grad_offset, grad_weight, \
+            None, None, None, None, None, None, None
+
+    @staticmethod
+    def forward(ctx,
+                input: Tensor,
+                offset: Tensor,
+                weight: Tensor,
+                stride: Union[int, Tuple[int, ...]] = 1,
+                padding: Union[int, Tuple[int, ...]] = 0,
+                dilation: Union[int, Tuple[int, ...]] = 1,
+                groups: int = 1,
+                deform_groups: int = 1,
+                bias: bool = False,
+                im2col_step: int = 32) -> Tensor:
+        if input is not None and input.dim() != 4:
+            raise ValueError(
+                f'Expected 4D tensor as input, got {input.dim()}D tensor \
+                  instead.')
+        assert bias is False, 'Only support bias is False.'
+        ctx.stride = _pair(stride)
+        ctx.padding = _pair(padding)
+        ctx.dilation = _pair(dilation)
+        ctx.groups = groups
+        ctx.deform_groups = deform_groups
+        ctx.im2col_step = im2col_step
+        ctx.device = input.device.type
+
+        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
+        # amp won't cast the type of model (float32), but "offset" is cast
+        # to float16 by nn.Conv2d automatically, leading to the type
+        # mismatch with input (when it is float32) or weight.
+        # The flag for whether to use fp16 or amp is the type of "offset",
+        # we cast weight and input to temporarily support fp16 and amp
+        # whatever the pytorch version is.
+        input = input.type_as(offset)
+        weight = weight.type_as(input)
+        if ctx.device == 'npu':
+            mask_shape, _ = torch.chunk(offset, 2, dim=1)
+            mask = torch.ones_like(mask_shape).to(input.device)
+            bias = input.new_empty(0)
+            output = ModulatedDeformConv2dFunction._npu_forward(
+                ctx, input, offset, mask, weight, bias)
+            return output
+        ctx.save_for_backward(input, offset, weight)
+
+        output = input.new_empty(
+            DeformConv2dFunction._output_size(ctx, input, weight))
+
+        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones
+
+        cur_im2col_step = min(ctx.im2col_step, input.size(0))
+        assert (input.size(0) % cur_im2col_step
+                ) == 0, 'batch size must be divisible by im2col_step'
+        ext_module.deform_conv_forward(
+            input,
+            weight,
+            offset,
+            output,
+            ctx.bufs_[0],
+            ctx.bufs_[1],
+            kW=weight.size(3),
+            kH=weight.size(2),
+            dW=ctx.stride[1],
+            dH=ctx.stride[0],
+            padW=ctx.padding[1],
+            padH=ctx.padding[0],
+            dilationW=ctx.dilation[1],
+            dilationH=ctx.dilation[0],
+            group=ctx.groups,
+            deformable_group=ctx.deform_groups,
+            im2col_step=cur_im2col_step)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(
+        ctx, grad_output: Tensor
+    ) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor], None,
+               None, None, None, None, None, None]:
+        if ctx.device == 'npu':
+            return DeformConv2dFunction._npu_backward(ctx, grad_output)
+        input, offset, weight = ctx.saved_tensors
+
+        grad_input = grad_offset = grad_weight = None
+
+        cur_im2col_step = min(ctx.im2col_step, input.size(0))
+        assert (input.size(0) % cur_im2col_step
+                ) == 0, 'batch size must be divisible by im2col_step'
+
+        grad_output = grad_output.contiguous()
+        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+            grad_input = torch.zeros_like(input)
+            grad_offset = torch.zeros_like(offset)
+            ext_module.deform_conv_backward_input(
+                input,
+                offset,
+                grad_output,
+                grad_input,
+                grad_offset,
+                weight,
+                ctx.bufs_[0],
+                kW=weight.size(3),
+                kH=weight.size(2),
+                dW=ctx.stride[1],
+                dH=ctx.stride[0],
+                padW=ctx.padding[1],
+                padH=ctx.padding[0],
+                dilationW=ctx.dilation[1],
+                dilationH=ctx.dilation[0],
+                group=ctx.groups,
+                deformable_group=ctx.deform_groups,
+                im2col_step=cur_im2col_step)
+
+        if ctx.needs_input_grad[2]:
+            grad_weight = torch.zeros_like(weight)
+            ext_module.deform_conv_backward_parameters(
+                input,
+                offset,
+                grad_output,
+                grad_weight,
+                ctx.bufs_[0],
+                ctx.bufs_[1],
+                kW=weight.size(3),
+                kH=weight.size(2),
+                dW=ctx.stride[1],
+                dH=ctx.stride[0],
+                padW=ctx.padding[1],
+                padH=ctx.padding[0],
+                dilationW=ctx.dilation[1],
+                dilationH=ctx.dilation[0],
+                group=ctx.groups,
+                deformable_group=ctx.deform_groups,
+                scale=1,
+                im2col_step=cur_im2col_step)
+
+        return grad_input, grad_offset, grad_weight, \
+            None, None, None, None, None, None, None
+
+    @staticmethod
+    def _output_size(ctx, input, weight):
+        channels = weight.size(0)
+        output_size = (input.size(0), channels)
+        for d in range(input.dim() - 2):
+            in_size = input.size(d + 2)
+            pad = ctx.padding[d]
+            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
+            stride_ = ctx.stride[d]
+            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
+        if not all(map(lambda s: s > 0, output_size)):
+            raise ValueError(
+                'convolution input is too small (output would be ' +
+                'x'.join(map(str, output_size)) + ')')
+        return output_size
+
+
+deform_conv2d = DeformConv2dFunction.apply
+
+
+class DeformConv2d(nn.Module):
+    r"""Deformable 2D convolution.
+
+    Applies a deformable 2D convolution over an input signal composed of
+    several input planes. DeformConv2d was described in the paper
+    `Deformable Convolutional Networks
+    <https://arxiv.org/pdf/1703.06211.pdf>`_
+
+    Note:
+        The argument ``im2col_step`` was added in version 1.3.17, which means
+        number of samples processed by the ``im2col_cuda_kernel`` per call.
+        It enables users to define ``batch_size`` and ``im2col_step`` more
+        flexibly and solved `issue mmcv#1440
+        <https://github.com/open-mmlab/mmcv/issues/1440>`_.
+
+    Args:
+        in_channels (int): Number of channels in the input image.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size(int, tuple): Size of the convolving kernel.
+        stride(int, tuple): Stride of the convolution. Default: 1.
+        padding (int or tuple): Zero-padding added to both sides of the input.
+            Default: 0.
+        dilation (int or tuple): Spacing between kernel elements. Default: 1.
+        groups (int): Number of blocked connections from input.
+            channels to output channels. Default: 1.
+        deform_groups (int): Number of deformable group partitions.
+        bias (bool): If True, adds a learnable bias to the output.
+            Default: False.
+        im2col_step (int): Number of samples processed by im2col_cuda_kernel
+            per call. It will work when ``batch_size`` > ``im2col_step``, but
+            ``batch_size`` must be divisible by ``im2col_step``. Default: 32.
+            `New in version 1.3.17.`
+    """
+
+    @deprecated_api_warning({'deformable_groups': 'deform_groups'},
+                            cls_name='DeformConv2d')
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Tuple[int, ...]],
+                 stride: Union[int, Tuple[int, ...]] = 1,
+                 padding: Union[int, Tuple[int, ...]] = 0,
+                 dilation: Union[int, Tuple[int, ...]] = 1,
+                 groups: int = 1,
+                 deform_groups: int = 1,
+                 bias: bool = False,
+                 im2col_step: int = 32) -> None:
+        super().__init__()
+
+        assert not bias, \
+            f'bias={bias} is not supported in DeformConv2d.'
+        assert in_channels % groups == 0, \
+            f'in_channels {in_channels} cannot be divisible by groups {groups}'
+        assert out_channels % groups == 0, \
+            f'out_channels {out_channels} cannot be divisible by groups \
+              {groups}'
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        self.deform_groups = deform_groups
+        self.im2col_step = im2col_step
+        # enable compatibility with nn.Conv2d
+        self.transposed = False
+        self.output_padding = _single(0)
+
+        # only weight, no bias
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // self.groups,
+                         *self.kernel_size))
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        # switch the initialization of `self.weight` to the standard kaiming
+        # method described in `Delving deep into rectifiers: Surpassing
+        # human-level performance on ImageNet classification` - He, K. et al.
+        # (2015), using a uniform distribution
+        nn.init.kaiming_uniform_(self.weight, nonlinearity='relu')
+
+    def forward(self, x: Tensor, offset: Tensor) -> Tensor:
+        """Deformable Convolutional forward function.
+
+        Args:
+            x (Tensor): Input feature, shape (B, C_in, H_in, W_in)
+            offset (Tensor): Offset for deformable convolution, shape
+                (B, deform_groups*kernel_size[0]*kernel_size[1]*2,
+                H_out, W_out), H_out, W_out are equal to the output's.
+
+                An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
+                The spatial arrangement is like:
+
+                .. code:: text
+
+                    (x0, y0) (x1, y1) (x2, y2)
+                    (x3, y3) (x4, y4) (x5, y5)
+                    (x6, y6) (x7, y7) (x8, y8)
+
+        Returns:
+            Tensor: Output of the layer.
+        """
+        # To fix an assert error in deform_conv_cuda.cpp:128
+        # input image is smaller than kernel
+        input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) <
+                                                          self.kernel_size[1])
+        if input_pad:
+            pad_h = max(self.kernel_size[0] - x.size(2), 0)
+            pad_w = max(self.kernel_size[1] - x.size(3), 0)
+            x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()
+            offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0)
+            offset = offset.contiguous()
+        out = deform_conv2d(x, offset, self.weight, self.stride, self.padding,
+                            self.dilation, self.groups, self.deform_groups,
+                            False, self.im2col_step)
+        if input_pad:
+            out = out[:, :, :out.size(2) - pad_h, :out.size(3) -
+                      pad_w].contiguous()
+        return out
+
+    def __repr__(self):
+        s = self.__class__.__name__
+        s += f'(in_channels={self.in_channels},\n'
+        s += f'out_channels={self.out_channels},\n'
+        s += f'kernel_size={self.kernel_size},\n'
+        s += f'stride={self.stride},\n'
+        s += f'padding={self.padding},\n'
+        s += f'dilation={self.dilation},\n'
+        s += f'groups={self.groups},\n'
+        s += f'deform_groups={self.deform_groups},\n'
+        # bias is not supported in DeformConv2d.
+        s += 'bias=False)'
+        return s
+
+
+@CONV_LAYERS.register_module('DCN')
+class DeformConv2dPack(DeformConv2d):
+    """A Deformable Conv Encapsulation that acts as normal Conv layers.
+
+    The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
+    The spatial arrangement is like:
+
+    .. code:: text
+
+        (x0, y0) (x1, y1) (x2, y2)
+        (x3, y3) (x4, y4) (x5, y5)
+        (x6, y6) (x7, y7) (x8, y8)
+
+    Args:
+        in_channels (int): Same as nn.Conv2d.
+        out_channels (int): Same as nn.Conv2d.
+        kernel_size (int or tuple[int]): Same as nn.Conv2d.
+        stride (int or tuple[int]): Same as nn.Conv2d.
+        padding (int or tuple[int]): Same as nn.Conv2d.
+        dilation (int or tuple[int]): Same as nn.Conv2d.
+        groups (int): Same as nn.Conv2d.
+        bias (bool or str): If specified as `auto`, it will be decided by the
+            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
+            False.
+    """
+
+    _version = 2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.conv_offset = nn.Conv2d(
+            self.in_channels,
+            self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1],
+            kernel_size=self.kernel_size,
+            stride=_pair(self.stride),
+            padding=_pair(self.padding),
+            dilation=_pair(self.dilation),
+            bias=True)
+        self.init_offset()
+
+    def init_offset(self):
+        self.conv_offset.weight.data.zero_()
+        self.conv_offset.bias.data.zero_()
+
+    def forward(self, x: Tensor) -> Tensor:  # type: ignore
+        offset = self.conv_offset(x)
+        return deform_conv2d(x, offset, self.weight, self.stride, self.padding,
+                             self.dilation, self.groups, self.deform_groups,
+                             False, self.im2col_step)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+
+        if version is None or version < 2:
+            # the key is different in early versions
+            # In version < 2, DeformConvPack loads previous benchmark models.
+            if (prefix + 'conv_offset.weight' not in state_dict
+                    and prefix[:-1] + '_offset.weight' in state_dict):
+                state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
+                    prefix[:-1] + '_offset.weight')
+            if (prefix + 'conv_offset.bias' not in state_dict
+                    and prefix[:-1] + '_offset.bias' in state_dict):
+                state_dict[prefix +
+                           'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
+                                                                '_offset.bias')
+
+        if version is not None and version > 1:
+            print_log(
+                f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to '
+                'version 2.',
+                logger='root')
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+
+if IS_MLU_AVAILABLE:
+    import torchvision
+    from torchvision.ops import deform_conv2d as tv_deform_conv2d
+
+    from mmcv.utils import digit_version
+
+    @CONV_LAYERS.register_module('DCN', force=True)
+    class DeformConv2dPack_MLU(DeformConv2d):
+        """This class is the DCN implementation of the MLU device. The MLU
+        backend support of the operator has been implemented in torchvision.
+        The mmcv registration mechanism is used for multiplexing here. The
+        torchvision implementation of DCN is called.
+
+        Args:
+            in_channels (int): Same as nn.Conv2d.
+            out_channels (int): Same as nn.Conv2d.
+            kernel_size (int or tuple[int]): Same as nn.Conv2d.
+            stride (int): Same as nn.Conv2d, while tuple is not supported.
+            padding (int): Same as nn.Conv2d, while tuple is not supported.
+            dilation (int): Same as nn.Conv2d, while tuple is not supported.
+            groups (int): Same as nn.Conv2d.
+            bias (bool or str): If specified as `auto`, it will be decided by
+                the norm_cfg. Bias will be set as True if norm_cfg is None,
+                otherwise False.
+            im2col_step (int): Number of samples processed by
+                im2col_cuda_kernel per call. It will work when ``batch_size``
+                > ``im2col_step``, but ``batch_size`` must be divisible by
+                ``im2col_step``. Default: 32. `New in version 1.7.2.
+                Currently not supported on MLU devices.`
+        """
+
+        def __init__(self, *args, **kwargs):
+            assert digit_version(torchvision.__version__) >= digit_version(
+                '0.10.0a0'), 'the version of torchvision should be >= 0.10.0'
+            super().__init__(*args, **kwargs)
+
+            self.conv_offset = nn.Conv2d(
+                self.in_channels,
+                self.deform_groups * 2 * self.kernel_size[0] *
+                self.kernel_size[1],
+                kernel_size=self.kernel_size,
+                stride=_pair(self.stride),
+                padding=_pair(self.padding),
+                dilation=_pair(self.dilation),
+                bias=True)
+            self.init_offset()
+
+        def init_offset(self):
+            self.conv_offset.weight.data.zero_()
+            self.conv_offset.bias.data.zero_()
+
+        def forward(self, x: Tensor) -> Tensor:  # type: ignore
+            cur_im2col_step = min(self.im2col_step, x.size(0))
+            assert (x.size(0) % cur_im2col_step
+                    ) == 0, 'batch size must be divisible by im2col_step'
+            offset = self.conv_offset(x)
+            x = x.type_as(offset)
+            weight = self.weight.type_as(x)
+            return tv_deform_conv2d(x, offset, weight, None, self.stride,
+                                    self.padding, self.dilation)
diff --git a/model_examples/MapTR/patch/mmcv/distributed.py b/model_examples/MapTR/patch/mmcv/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0dfecc9c2e2cfbc8e7173c31fcce169e3e61388
--- /dev/null
+++ b/model_examples/MapTR/patch/mmcv/distributed.py
@@ -0,0 +1,166 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, List, Tuple
+
+import torch
+from torch.nn.parallel.distributed import (DistributedDataParallel,
+                                           _find_tensors)
+
+from mmcv import print_log
+from mmcv.utils import TORCH_VERSION, digit_version
+from .scatter_gather import ScatterInputs, scatter_kwargs
+
+
+class MMDistributedDataParallel(DistributedDataParallel):
+    """The DDP module that supports DataContainer.
+
+    MMDDP has two main differences with PyTorch DDP:
+
+    - It supports a custom type :class:`DataContainer` which allows more
+      flexible control of input data.
+    - It implement two APIs ``train_step()`` and ``val_step()``.
+    """
+
+    def to_kwargs(self, inputs: ScatterInputs, kwargs: ScatterInputs,
+                  device_id: int) -> Tuple[tuple, tuple]:
+        # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8
+        # to move all tensors to device_id
+        return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim)
+
+    def scatter(self, inputs: ScatterInputs, kwargs: ScatterInputs,
+                device_ids: List[int]) -> Tuple[tuple, tuple]:
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
+
+    def train_step(self, *inputs, **kwargs):
+        """train_step() API for module wrapped by DistributedDataParallel.
+
+        This method is basically the same as
+        ``DistributedDataParallel.forward()``, while replacing
+        ``self.module.forward()`` with ``self.module.train_step()``.
+        It is compatible with PyTorch 1.1 - 1.5.
+        """
+
+        # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the
+        # end of backward to the beginning of forward.
+        if ('parrots' not in TORCH_VERSION
+                and digit_version(TORCH_VERSION) >= digit_version('1.7')
+                and self.reducer._rebuild_buckets()):
+            print_log(
+                'Reducer buckets have been rebuilt in this iteration.',
+                logger='mmcv')
+
+        if ('parrots' not in TORCH_VERSION
+                and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')):
+            if self._check_sync_bufs_pre_fwd():
+                self._sync_buffers()
+        else:
+            if (getattr(self, 'require_forward_param_sync', False)
+                    and self.require_forward_param_sync):
+                self._sync_params()
+
+        if self.device_ids:
+            inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
+            if len(self.device_ids) == 1:
+                output = self.module.train_step(*inputs[0], **kwargs[0])
+            else:
+                outputs = self.parallel_apply(
+                    self._module_copies[:len(inputs)], inputs, kwargs)
+                output = self.gather(outputs, self.output_device)
+        else:
+            output = self.module.train_step(*inputs, **kwargs)
+
+        if ('parrots' not in TORCH_VERSION
+                and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')):
+            if self._check_sync_bufs_post_fwd():
+                self._sync_buffers()
+
+        if (torch.is_grad_enabled()
+                and getattr(self, 'require_backward_grad_sync', False)
+                and self.require_backward_grad_sync):
+            if self.find_unused_parameters:
+                self.reducer.prepare_for_backward(list(_find_tensors(output)))
+            else:
+                self.reducer.prepare_for_backward([])
+        else:
+            if ('parrots' not in TORCH_VERSION
+                    and digit_version(TORCH_VERSION) > digit_version('1.2')):
+                self.require_forward_param_sync = False
+        return output
+
+    def val_step(self, *inputs, **kwargs):
+        """val_step() API for module wrapped by DistributedDataParallel.
+
+        This method is basically the same as
+        ``DistributedDataParallel.forward()``, while replacing
+        ``self.module.forward()`` with ``self.module.val_step()``.
+        It is compatible with PyTorch 1.1 - 1.5.
+        """
+        # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the
+        # end of backward to the beginning of forward.
+        if ('parrots' not in TORCH_VERSION
+                and digit_version(TORCH_VERSION) >= digit_version('1.7')
+                and self.reducer._rebuild_buckets()):
+            print_log(
+                'Reducer buckets have been rebuilt in this iteration.',
+                logger='mmcv')
+
+        if ('parrots' not in TORCH_VERSION
+                and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')):
+            if self._check_sync_bufs_pre_fwd():
+                self._sync_buffers()
+        else:
+            if (getattr(self, 'require_forward_param_sync', False)
+                    and self.require_forward_param_sync):
+                self._sync_params()
+
+        if self.device_ids:
+            inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
+            if len(self.device_ids) == 1:
+                output = self.module.val_step(*inputs[0], **kwargs[0])
+            else:
+                outputs = self.parallel_apply(
+                    self._module_copies[:len(inputs)], inputs, kwargs)
+                output = self.gather(outputs, self.output_device)
+        else:
+            output = self.module.val_step(*inputs, **kwargs)
+
+        if ('parrots' not in TORCH_VERSION
+                and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')):
+            if self._check_sync_bufs_post_fwd():
+                self._sync_buffers()
+
+        if (torch.is_grad_enabled()
+                and getattr(self, 'require_backward_grad_sync', False)
+                and self.require_backward_grad_sync):
+            if self.find_unused_parameters:
+                self.reducer.prepare_for_backward(list(_find_tensors(output)))
+            else:
+                self.reducer.prepare_for_backward([])
+        else:
+            if ('parrots' not in TORCH_VERSION
+                    and digit_version(TORCH_VERSION) > digit_version('1.2')):
+                self.require_forward_param_sync = False
+        return output
+
+    def _run_ddp_forward(self, *inputs, **kwargs) -> Any:
+        """Processes inputs and runs ``self.module.forward``.
+
+        Pytorch 1.12.0 performs ``self.module.forward`` in ``_run_ddp_forward``
+        and deprecates using ``DistributedDataParallel.to_kwargs`` to
+        process inputs, which leads to inputs cannot be processed by
+        :meth:`MMDistributedDataParallel.to_kwargs` anymore. Therefore,
+        ``MMDistributedDataParallel`` overrides this method to call
+        :meth:`to_kwargs` explicitly.
+
+        See more information in `<https://github.com/open-mmlab/mmsegmentation/issues/1742>`_.  # noqa: E501
+
+        Returns:
+            Any: Forward result of :attr:`module`.
+        """
+        module_to_run = self.module
+
+        if self.device_ids:
+            inputs, kwargs = self.to_kwargs(  # type: ignore
+                inputs, kwargs, self.device_ids[0])
+            return module_to_run(*inputs[0], **kwargs[0])  # type: ignore
+        else:
+            return module_to_run(*inputs, **kwargs)
diff --git a/model_examples/MapTR/patch/mmdet/__init__.py b/model_examples/MapTR/patch/mmdet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d0ed2efe47b5b370f6b664936052ab663fd6449
--- /dev/null
+++ b/model_examples/MapTR/patch/mmdet/__init__.py
@@ -0,0 +1,28 @@
+import mmcv
+
+from .version import __version__, short_version
+
+
+def digit_version(version_str):
+    digit_version = []
+    for x in version_str.split('.'):
+        if x.isdigit():
+            digit_version.append(int(x))
+        elif x.find('rc') != -1:
+            patch_version = x.split('rc')
+            digit_version.append(int(patch_version[0]) - 1)
+            digit_version.append(int(patch_version[1]))
+    return digit_version
+
+
+mmcv_minimum_version = '1.3.8'
+mmcv_maximum_version = '1.7.2'
+mmcv_version = digit_version(mmcv.__version__)
+
+
+assert (mmcv_version >= digit_version(mmcv_minimum_version)
+        and mmcv_version <= digit_version(mmcv_maximum_version)), \
+    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+    f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
+
+__all__ = ['__version__', 'short_version']
diff --git a/model_examples/MapTR/patch/mmdet/resnet.py b/model_examples/MapTR/patch/mmdet/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e2da57a4786bef50afcf257b1aeea91cf3448c3
--- /dev/null
+++ b/model_examples/MapTR/patch/mmdet/resnet.py
@@ -0,0 +1,675 @@
+import warnings
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer
+from mmcv.runner import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..builder import BACKBONES
+from ..utils import ResLayer
+
+from mx_driving.fused import npu_add_relu, npu_max_pool2d #3
+
+class BasicBlock(BaseModule):
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        super(BasicBlock, self).__init__(init_cfg)
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            3,
+            stride=stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        self.conv2 = build_conv_layer(
+            conv_cfg, planes, planes, 3, padding=1, bias=False)
+        self.add_module(self.norm2_name, norm2)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.with_cp = with_cp
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name)
+
+    def forward(self, x):
+        """Forward function."""
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            # out += identity #3
+            out = npu_add_relu(out, identity) #3
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        # out = self.relu(out) #3
+
+        return out
+
+
+class Bottleneck(BaseModule):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 dcn=None,
+                 plugins=None,
+                 init_cfg=None):
+        """Bottleneck block for ResNet.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+        it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__(init_cfg)
+        assert style in ['pytorch', 'caffe']
+        assert dcn is None or isinstance(dcn, dict)
+        assert plugins is None or isinstance(plugins, list)
+        if plugins is not None:
+            allowed_position = ['after_conv1', 'after_conv2', 'after_conv3']
+            assert all(p['position'] in allowed_position for p in plugins)
+
+        self.inplanes = inplanes
+        self.planes = planes
+        self.stride = stride
+        self.dilation = dilation
+        self.style = style
+        self.with_cp = with_cp
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.dcn = dcn
+        self.with_dcn = dcn is not None
+        self.plugins = plugins
+        self.with_plugins = plugins is not None
+
+        if self.with_plugins:
+            # collect plugins for conv1/conv2/conv3
+            self.after_conv1_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv1'
+            ]
+            self.after_conv2_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv2'
+            ]
+            self.after_conv3_plugins = [
+                plugin['cfg'] for plugin in plugins
+                if plugin['position'] == 'after_conv3'
+            ]
+
+        if self.style == 'pytorch':
+            self.conv1_stride = 1
+            self.conv2_stride = stride
+        else:
+            self.conv1_stride = stride
+            self.conv2_stride = 1
+
+        self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+        self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+        self.norm3_name, norm3 = build_norm_layer(
+            norm_cfg, planes * self.expansion, postfix=3)
+
+        self.conv1 = build_conv_layer(
+            conv_cfg,
+            inplanes,
+            planes,
+            kernel_size=1,
+            stride=self.conv1_stride,
+            bias=False)
+        self.add_module(self.norm1_name, norm1)
+        fallback_on_stride = False
+        if self.with_dcn:
+            fallback_on_stride = dcn.pop('fallback_on_stride', False)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = build_conv_layer(
+                conv_cfg,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=dilation,
+                dilation=dilation,
+                bias=False)
+        else:
+            assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+            self.conv2 = build_conv_layer(
+                dcn,
+                planes,
+                planes,
+                kernel_size=3,
+                stride=self.conv2_stride,
+                padding=dilation,
+                dilation=dilation,
+                bias=False)
+
+        self.add_module(self.norm2_name, norm2)
+        self.conv3 = build_conv_layer(
+            conv_cfg,
+            planes,
+            planes * self.expansion,
+            kernel_size=1,
+            bias=False)
+        self.add_module(self.norm3_name, norm3)
+
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+
+        if self.with_plugins:
+            self.after_conv1_plugin_names = self.make_block_plugins(
+                planes, self.after_conv1_plugins)
+            self.after_conv2_plugin_names = self.make_block_plugins(
+                planes, self.after_conv2_plugins)
+            self.after_conv3_plugin_names = self.make_block_plugins(
+                planes * self.expansion, self.after_conv3_plugins)
+
+    def make_block_plugins(self, in_channels, plugins):
+        """make plugins for block.
+
+        Args:
+            in_channels (int): Input channels of plugin.
+            plugins (list[dict]): List of plugins cfg to build.
+
+        Returns:
+            list[str]: List of the names of plugin.
+        """
+        assert isinstance(plugins, list)
+        plugin_names = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            name, layer = build_plugin_layer(
+                plugin,
+                in_channels=in_channels,
+                postfix=plugin.pop('postfix', ''))
+            assert not hasattr(self, name), f'duplicate plugin {name}'
+            self.add_module(name, layer)
+            plugin_names.append(name)
+        return plugin_names
+
+    def forward_plugin(self, x, plugin_names):
+        out = x
+        for name in plugin_names:
+            out = getattr(self, name)(x)
+        return out
+
+    @property
+    def norm1(self):
+        """nn.Module: normalization layer after the first convolution layer"""
+        return getattr(self, self.norm1_name)
+
+    @property
+    def norm2(self):
+        """nn.Module: normalization layer after the second convolution layer"""
+        return getattr(self, self.norm2_name)
+
+    @property
+    def norm3(self):
+        """nn.Module: normalization layer after the third convolution layer"""
+        return getattr(self, self.norm3_name)
+
+    def forward(self, x):
+        """Forward function."""
+
+        def _inner_forward(x):
+            identity = x
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            if self.with_plugins:
+                out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            # out += identity #3
+            out = npu_add_relu(out, identity) #3
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        # out = self.relu(out) #3
+
+        return out
+
+
+@BACKBONES.register_module()
+class ResNet(BaseModule):
+    """ResNet backbone.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        stem_channels (int | None): Number of stem channels. If not specified,
+            it will be the same as `base_channels`. Default: None.
+        base_channels (int): Number of base channels of res layer. Default: 64.
+        in_channels (int): Number of input image channels. Default: 3.
+        num_stages (int): Resnet stages. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        plugins (list[dict]): List of plugins for stages, each dict contains:
+
+            - cfg (dict, required): Cfg dict to build plugin.
+            - position (str, required): Position inside block to insert
+              plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'.
+            - stages (tuple[bool], optional): Stages to apply plugin, length
+              should be same as 'num_stages'.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Example:
+        >>> from mmdet.models import ResNet
+        >>> import torch
+        >>> self = ResNet(depth=18)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 64, 8, 8)
+        (1, 128, 4, 4)
+        (1, 256, 2, 2)
+        (1, 512, 1, 1)
+    """
+
+    arch_settings = {
+        18: (BasicBlock, (2, 2, 2, 2)),
+        34: (BasicBlock, (3, 4, 6, 3)),
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 depth,
+                 in_channels=3,
+                 stem_channels=None,
+                 base_channels=64,
+                 num_stages=4,
+                 strides=(1, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 style='pytorch',
+                 deep_stem=False,
+                 avg_down=False,
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 dcn=None,
+                 stage_with_dcn=(False, False, False, False),
+                 plugins=None,
+                 with_cp=False,
+                 zero_init_residual=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super(ResNet, self).__init__(init_cfg)
+        self.zero_init_residual = zero_init_residual
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for resnet')
+
+        block_init_cfg = None
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be setting at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+                block = self.arch_settings[depth][0]
+                if self.zero_init_residual:
+                    if block is BasicBlock:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm2'))
+                    elif block is Bottleneck:
+                        block_init_cfg = dict(
+                            type='Constant',
+                            val=0,
+                            override=dict(name='norm3'))
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        self.depth = depth
+        if stem_channels is None:
+            stem_channels = base_channels
+        self.stem_channels = stem_channels
+        self.base_channels = base_channels
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < num_stages
+        self.style = style
+        self.deep_stem = deep_stem
+        self.avg_down = avg_down
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        if dcn is not None:
+            assert len(stage_with_dcn) == num_stages
+        self.plugins = plugins
+        self.block, stage_blocks = self.arch_settings[depth]
+        self.stage_blocks = stage_blocks[:num_stages]
+        self.inplanes = stem_channels
+
+        self._make_stem_layer(in_channels, stem_channels)
+
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = strides[i]
+            dilation = dilations[i]
+            dcn = self.dcn if self.stage_with_dcn[i] else None
+            if plugins is not None:
+                stage_plugins = self.make_stage_plugins(plugins, i)
+            else:
+                stage_plugins = None
+            planes = base_channels * 2**i
+            res_layer = self.make_res_layer(
+                block=self.block,
+                inplanes=self.inplanes,
+                planes=planes,
+                num_blocks=num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=with_cp,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                dcn=dcn,
+                plugins=stage_plugins,
+                init_cfg=block_init_cfg)
+            self.inplanes = planes * self.block.expansion
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        self.feat_dim = self.block.expansion * base_channels * 2**(
+            len(self.stage_blocks) - 1)
+
+    def make_stage_plugins(self, plugins, stage_idx):
+        """Make plugins for ResNet ``stage_idx`` th stage.
+
+        Currently we support to insert ``context_block``,
+        ``empirical_attention_block``, ``nonlocal_block`` into the backbone
+        like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of
+        Bottleneck.
+
+        An example of plugins format could be:
+
+        Examples:
+            >>> plugins=[
+            ...     dict(cfg=dict(type='xxx', arg1='xxx'),
+            ...          stages=(False, True, True, True),
+            ...          position='after_conv2'),
+            ...     dict(cfg=dict(type='yyy'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='1'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3'),
+            ...     dict(cfg=dict(type='zzz', postfix='2'),
+            ...          stages=(True, True, True, True),
+            ...          position='after_conv3')
+            ... ]
+            >>> self = ResNet(depth=18)
+            >>> stage_plugins = self.make_stage_plugins(plugins, 0)
+            >>> assert len(stage_plugins) == 3
+
+        Suppose ``stage_idx=0``, the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->conv3->yyy->zzz1->zzz2
+
+        Suppose 'stage_idx=1', the structure of blocks in the stage would be:
+
+        .. code-block:: none
+
+            conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2
+
+        If stages is missing, the plugin would be applied to all stages.
+
+        Args:
+            plugins (list[dict]): List of plugins cfg to build. The postfix is
+                required if multiple same type plugins are inserted.
+            stage_idx (int): Index of stage to build
+
+        Returns:
+            list[dict]: Plugins for current stage
+        """
+        stage_plugins = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            stages = plugin.pop('stages', None)
+            assert stages is None or len(stages) == self.num_stages
+            # whether to insert plugin into current stage
+            if stages is None or stages[stage_idx]:
+                stage_plugins.append(plugin)
+
+        return stage_plugins
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``."""
+        return ResLayer(**kwargs)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    def _make_stem_layer(self, in_channels, stem_channels):
+        if self.deep_stem:
+            self.stem = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    in_channels,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels)[1],
+                nn.ReLU(inplace=True))
+        else:
+            self.conv1 = build_conv_layer(
+                self.conv_cfg,
+                in_channels,
+                stem_channels,
+                kernel_size=7,
+                stride=2,
+                padding=3,
+                bias=False)
+            self.norm1_name, norm1 = build_norm_layer(
+                self.norm_cfg, stem_channels, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+            self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            if self.deep_stem:
+                self.stem.eval()
+                for param in self.stem.parameters():
+                    param.requires_grad = False
+            else:
+                self.norm1.eval()
+                for m in [self.conv1, self.norm1]:
+                    for param in m.parameters():
+                        param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'layer{i}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def forward(self, x):
+        """Forward function."""
+        if self.deep_stem:
+            x = self.stem(x)
+        else:
+            x = self.conv1(x)
+            x = self.norm1(x)
+            x = self.relu(x)
+        # x = self.maxpool(x) #3
+        x = npu_max_pool2d(x, 3, 2, 1) #3
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        freezed."""
+        super(ResNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+
+@BACKBONES.register_module()
+class ResNetV1d(ResNet):
+    r"""ResNetV1d variant described in `Bag of Tricks
+    <https://arxiv.org/pdf/1812.01187.pdf>`_.
+
+    Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in
+    the input stem with three 3x3 convs. And in the downsampling block, a 2x2
+    avg_pool with stride 2 is added before conv, whose stride is changed to 1.
+    """
+
+    def __init__(self, **kwargs):
+        super(ResNetV1d, self).__init__(
+            deep_stem=True, avg_down=True, **kwargs)
diff --git a/model_examples/MapTR/patch/mmdet3d/__init__.py b/model_examples/MapTR/patch/mmdet3d/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..870d7db0544318c6e47a511d914e0795a7554f5d
--- /dev/null
+++ b/model_examples/MapTR/patch/mmdet3d/__init__.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+
+import mmdet
+import mmseg
+from .version import __version__, short_version
+
+
+def digit_version(version_str):
+    digit_version = []
+    for x in version_str.split('.'):
+        if x.isdigit():
+            digit_version.append(int(x))
+        elif x.find('rc') != -1:
+            patch_version = x.split('rc')
+            digit_version.append(int(patch_version[0]) - 1)
+            digit_version.append(int(patch_version[1]))
+    return digit_version
+
+
+mmcv_minimum_version = '1.5.2'
+mmcv_maximum_version = '1.7.2'
+mmcv_version = digit_version(mmcv.__version__)
+
+
+assert (mmcv_version >= digit_version(mmcv_minimum_version)
+        and mmcv_version <= digit_version(mmcv_maximum_version)), \
+    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+    f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
+
+mmdet_minimum_version = '2.14.0'
+mmdet_maximum_version = '3.0.0'
+mmdet_version = digit_version(mmdet.__version__)
+assert (mmdet_version >= digit_version(mmdet_minimum_version)
+        and mmdet_version <= digit_version(mmdet_maximum_version)), \
+    f'MMDET=={mmdet.__version__} is used but incompatible. ' \
+    f'Please install mmdet>={mmdet_minimum_version}, ' \
+    f'<={mmdet_maximum_version}.'
+
+mmseg_minimum_version = '0.14.1'
+mmseg_maximum_version = '1.0.0'
+mmseg_version = digit_version(mmseg.__version__)
+assert (mmseg_version >= digit_version(mmseg_minimum_version)
+        and mmseg_version <= digit_version(mmseg_maximum_version)), \
+    f'MMSEG=={mmseg.__version__} is used but incompatible. ' \
+    f'Please install mmseg>={mmseg_minimum_version}, ' \
+    f'<={mmseg_maximum_version}.'
+
+__all__ = ['__version__', 'short_version']
diff --git a/model_examples/MapTR/patch/mmseg/__init__.py b/model_examples/MapTR/patch/mmseg/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ed005aab6048cc8aad4ffdf7df0be4797253b6d
--- /dev/null
+++ b/model_examples/MapTR/patch/mmseg/__init__.py
@@ -0,0 +1,30 @@
+import mmcv
+
+from .version import __version__, version_info
+
+MMCV_MIN = '1.3.1'
+MMCV_MAX = '1.7.2'
+
+
+def digit_version(version_str):
+    digit_version = []
+    for x in version_str.split('.'):
+        if x.isdigit():
+            digit_version.append(int(x))
+        elif x.find('rc') != -1:
+            patch_version = x.split('rc')
+            digit_version.append(int(patch_version[0]) - 1)
+            digit_version.append(int(patch_version[1]))
+    return digit_version
+
+
+mmcv_min_version = digit_version(MMCV_MIN)
+mmcv_max_version = digit_version(MMCV_MAX)
+mmcv_version = digit_version(mmcv.__version__)
+
+
+assert (mmcv_min_version <= mmcv_version <= mmcv_max_version), \
+    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+    f'Please install mmcv>={mmcv_min_version}, <={mmcv_max_version}.'
+
+__all__ = ['__version__', 'version_info']
diff --git a/model_examples/MapTR/patch/nuscenes/data_classes.py b/model_examples/MapTR/patch/nuscenes/data_classes.py
new file mode 100644
index 0000000000000000000000000000000000000000..b95d6ce2738dca7ea37c99b08c48eb289e3c9844
--- /dev/null
+++ b/model_examples/MapTR/patch/nuscenes/data_classes.py
@@ -0,0 +1,425 @@
+# nuScenes dev-kit.
+# Code written by Oscar Beijbom, 2019.
+
+from collections import defaultdict
+from typing import List, Dict, Tuple
+
+import numpy as np
+
+from nuscenes.eval.common.data_classes import MetricData, EvalBox
+from nuscenes.eval.common.utils import center_distance
+from nuscenes.eval.detection.constants import DETECTION_NAMES, ATTRIBUTE_NAMES, TP_METRICS
+
+
+class DetectionConfig:
+    """ Data class that specifies the detection evaluation settings. """
+
+    def __init__(self,
+                 class_range: Dict[str, int],
+                 dist_fcn: str,
+                 dist_ths: List[float],
+                 dist_th_tp: float,
+                 min_recall: float,
+                 min_precision: float,
+                 max_boxes_per_sample: int,
+                 mean_ap_weight: int):
+
+        assert set(class_range.keys()) == set(DETECTION_NAMES), "Class count mismatch."
+        assert dist_th_tp in dist_ths, "dist_th_tp must be in set of dist_ths."
+
+        self.class_range = class_range
+        self.dist_fcn = dist_fcn
+        self.dist_ths = dist_ths
+        self.dist_th_tp = dist_th_tp
+        self.min_recall = min_recall
+        self.min_precision = min_precision
+        self.max_boxes_per_sample = max_boxes_per_sample
+        self.mean_ap_weight = mean_ap_weight
+
+        self.class_names = self.class_range.keys()
+
+    def __eq__(self, other):
+        eq = True
+        for key in self.serialize().keys():
+            eq = eq and np.array_equal(getattr(self, key), getattr(other, key))
+        return eq
+
+    def serialize(self) -> dict:
+        """ Serialize instance into json-friendly format. """
+        return {
+            'class_range': self.class_range,
+            'dist_fcn': self.dist_fcn,
+            'dist_ths': self.dist_ths,
+            'dist_th_tp': self.dist_th_tp,
+            'min_recall': self.min_recall,
+            'min_precision': self.min_precision,
+            'max_boxes_per_sample': self.max_boxes_per_sample,
+            'mean_ap_weight': self.mean_ap_weight
+        }
+
+    @classmethod
+    def deserialize(cls, content: dict):
+        """ Initialize from serialized dictionary. """
+        return cls(content['class_range'],
+                   content['dist_fcn'],
+                   content['dist_ths'],
+                   content['dist_th_tp'],
+                   content['min_recall'],
+                   content['min_precision'],
+                   content['max_boxes_per_sample'],
+                   content['mean_ap_weight'])
+
+    @property
+    def dist_fcn_callable(self):
+        """ Return the distance function corresponding to the dist_fcn string. """
+        if self.dist_fcn == 'center_distance':
+            return center_distance
+        else:
+            raise Exception('Error: Unknown distance function %s!' % self.dist_fcn)
+
+
+class DetectionMetricData(MetricData):
+    """ This class holds accumulated and interpolated data required to calculate the detection metrics. """
+
+    nelem = 101
+
+    def __init__(self,
+                 recall: np.array,
+                 precision: np.array,
+                 confidence: np.array,
+                 trans_err: np.array,
+                 vel_err: np.array,
+                 scale_err: np.array,
+                 orient_err: np.array,
+                 attr_err: np.array):
+
+        # Assert lengths.
+        assert len(recall) == self.nelem
+        assert len(precision) == self.nelem
+        assert len(confidence) == self.nelem
+        assert len(trans_err) == self.nelem
+        assert len(vel_err) == self.nelem
+        assert len(scale_err) == self.nelem
+        assert len(orient_err) == self.nelem
+        assert len(attr_err) == self.nelem
+
+        # Assert ordering.
+        assert all(confidence == sorted(confidence, reverse=True))  # Confidences should be descending.
+        assert all(recall == sorted(recall))  # Recalls should be ascending.
+
+        # Set attributes explicitly to help IDEs figure out what is going on.
+        self.recall = recall
+        self.precision = precision
+        self.confidence = confidence
+        self.trans_err = trans_err
+        self.vel_err = vel_err
+        self.scale_err = scale_err
+        self.orient_err = orient_err
+        self.attr_err = attr_err
+
+    def __eq__(self, other):
+        eq = True
+        for key in self.serialize().keys():
+            eq = eq and np.array_equal(getattr(self, key), getattr(other, key))
+        return eq
+
+    @property
+    def max_recall_ind(self):
+        """ Returns index of max recall achieved. """
+
+        # Last instance of confidence > 0 is index of max achieved recall.
+        non_zero = np.nonzero(self.confidence)[0]
+        if len(non_zero) == 0:  # If there are no matches, all the confidence values will be zero.
+            max_recall_ind = 0
+        else:
+            max_recall_ind = non_zero[-1]
+
+        return max_recall_ind
+
+    @property
+    def max_recall(self):
+        """ Returns max recall achieved. """
+
+        return self.recall[self.max_recall_ind]
+
+    def serialize(self):
+        """ Serialize instance into json-friendly format. """
+        return {
+            'recall': self.recall.tolist(),
+            'precision': self.precision.tolist(),
+            'confidence': self.confidence.tolist(),
+            'trans_err': self.trans_err.tolist(),
+            'vel_err': self.vel_err.tolist(),
+            'scale_err': self.scale_err.tolist(),
+            'orient_err': self.orient_err.tolist(),
+            'attr_err': self.attr_err.tolist(),
+        }
+
+    @classmethod
+    def deserialize(cls, content: dict):
+        """ Initialize from serialized content. """
+        return cls(recall=np.array(content['recall']),
+                   precision=np.array(content['precision']),
+                   confidence=np.array(content['confidence']),
+                   trans_err=np.array(content['trans_err']),
+                   vel_err=np.array(content['vel_err']),
+                   scale_err=np.array(content['scale_err']),
+                   orient_err=np.array(content['orient_err']),
+                   attr_err=np.array(content['attr_err']))
+
+    @classmethod
+    def no_predictions(cls):
+        """ Returns a md instance corresponding to having no predictions. """
+        return cls(recall=np.linspace(0, 1, cls.nelem),
+                   precision=np.zeros(cls.nelem),
+                   confidence=np.zeros(cls.nelem),
+                   trans_err=np.ones(cls.nelem),
+                   vel_err=np.ones(cls.nelem),
+                   scale_err=np.ones(cls.nelem),
+                   orient_err=np.ones(cls.nelem),
+                   attr_err=np.ones(cls.nelem))
+
+    @classmethod
+    def random_md(cls):
+        """ Returns an md instance corresponding to a random results. """
+        return cls(recall=np.linspace(0, 1, cls.nelem),
+                   precision=np.random.random(cls.nelem),
+                   confidence=np.linspace(0, 1, cls.nelem)[::-1],
+                   trans_err=np.random.random(cls.nelem),
+                   vel_err=np.random.random(cls.nelem),
+                   scale_err=np.random.random(cls.nelem),
+                   orient_err=np.random.random(cls.nelem),
+                   attr_err=np.random.random(cls.nelem))
+
+
+class DetectionMetrics:
+    """ Stores average precision and true positive metric results. Provides properties to summarize. """
+
+    def __init__(self, cfg: DetectionConfig):
+
+        self.cfg = cfg
+        self._label_aps = defaultdict(lambda: defaultdict(float))
+        self._label_tp_errors = defaultdict(lambda: defaultdict(float))
+        self.eval_time = None
+
+    def add_label_ap(self, detection_name: str, dist_th: float, ap: float) -> None:
+        self._label_aps[detection_name][dist_th] = ap
+
+    def get_label_ap(self, detection_name: str, dist_th: float) -> float:
+        return self._label_aps[detection_name][dist_th]
+
+    def add_label_tp(self, detection_name: str, metric_name: str, tp: float):
+        self._label_tp_errors[detection_name][metric_name] = tp
+
+    def get_label_tp(self, detection_name: str, metric_name: str) -> float:
+        return self._label_tp_errors[detection_name][metric_name]
+
+    def add_runtime(self, eval_time: float) -> None:
+        self.eval_time = eval_time
+
+    @property
+    def mean_dist_aps(self) -> Dict[str, float]:
+        """ Calculates the mean over distance thresholds for each label. """
+        return {class_name: np.mean(list(d.values())) for class_name, d in self._label_aps.items()}
+
+    @property
+    def mean_ap(self) -> float:
+        """ Calculates the mean AP by averaging over distance thresholds and classes. """
+        return float(np.mean(list(self.mean_dist_aps.values())))
+
+    @property
+    def tp_errors(self) -> Dict[str, float]:
+        """ Calculates the mean true positive error across all classes for each metric. """
+        errors = {}
+        for metric_name in TP_METRICS:
+            class_errors = []
+            for detection_name in self.cfg.class_names:
+                class_errors.append(self.get_label_tp(detection_name, metric_name))
+
+            errors[metric_name] = float(np.nanmean(class_errors))
+
+        return errors
+
+    @property
+    def tp_scores(self) -> Dict[str, float]:
+        scores = {}
+        tp_errors = self.tp_errors
+        for metric_name in TP_METRICS:
+
+            # We convert the true positive errors to "scores" by 1-error.
+            score = 1.0 - tp_errors[metric_name]
+
+            # Some of the true positive errors are unbounded, so we bound the scores to min 0.
+            score = max(0.0, score)
+
+            scores[metric_name] = score
+
+        return scores
+
+    @property
+    def nd_score(self) -> float:
+        """
+        Compute the nuScenes detection score (NDS, weighted sum of the individual scores).
+        :return: The NDS.
+        """
+        # Summarize.
+        total = float(self.cfg.mean_ap_weight * self.mean_ap + np.sum(list(self.tp_scores.values())))
+
+        # Normalize.
+        total = total / float(self.cfg.mean_ap_weight + len(self.tp_scores.keys()))
+
+        return total
+
+    def serialize(self):
+        return {
+            'label_aps': self._label_aps,
+            'mean_dist_aps': self.mean_dist_aps,
+            'mean_ap': self.mean_ap,
+            'label_tp_errors': self._label_tp_errors,
+            'tp_errors': self.tp_errors,
+            'tp_scores': self.tp_scores,
+            'nd_score': self.nd_score,
+            'eval_time': self.eval_time,
+            'cfg': self.cfg.serialize()
+        }
+
+    @classmethod
+    def deserialize(cls, content: dict):
+        """ Initialize from serialized dictionary. """
+
+        cfg = DetectionConfig.deserialize(content['cfg'])
+
+        metrics = cls(cfg=cfg)
+        metrics.add_runtime(content['eval_time'])
+
+        for detection_name, label_aps in content['label_aps'].items():
+            for dist_th, ap in label_aps.items():
+                metrics.add_label_ap(detection_name=detection_name, dist_th=float(dist_th), ap=float(ap))
+
+        for detection_name, label_tps in content['label_tp_errors'].items():
+            for metric_name, tp in label_tps.items():
+                metrics.add_label_tp(detection_name=detection_name, metric_name=metric_name, tp=float(tp))
+
+        return metrics
+
+    def __eq__(self, other):
+        eq = True
+        eq = eq and self._label_aps == other._label_aps
+        eq = eq and self._label_tp_errors == other._label_tp_errors
+        eq = eq and self.eval_time == other.eval_time
+        eq = eq and self.cfg == other.cfg
+
+        return eq
+
+
+class DetectionBox(EvalBox):
+    """ Data class used during detection evaluation. Can be a prediction or ground truth."""
+
+    def __init__(self,
+                 sample_token: str = "",
+                 translation: Tuple[float, float, float] = (0, 0, 0),
+                 size: Tuple[float, float, float] = (0, 0, 0),
+                 rotation: Tuple[float, float, float, float] = (0, 0, 0, 0),
+                 velocity: Tuple[float, float] = (0, 0),
+                 ego_translation: [float, float, float] = (0, 0, 0),  # Translation to ego vehicle in meters.
+                 num_pts: int = -1,  # Nbr. LIDAR or RADAR inside the box. Only for gt boxes.
+                 detection_name: str = 'car',  # The class name used in the detection challenge.
+                 detection_score: float = -1.0,  # GT samples do not have a score.
+                 attribute_name: str = ''):  # Box attribute. Each box can have at most 1 attribute.
+
+        super().__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts)
+
+        assert detection_name is not None, 'Error: detection_name cannot be empty!'
+        assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name
+
+        assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \
+            'Error: Unknown attribute_name %s' % attribute_name
+
+        assert isinstance(detection_score, float), 'Error: detection_score must be a float!'
+        assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!'
+
+        # Assign.
+        self.detection_name = detection_name
+        self.detection_score = detection_score
+        self.attribute_name = attribute_name
+
+    def __eq__(self, other):
+        return (self.sample_token == other.sample_token and
+                self.translation == other.translation and
+                self.size == other.size and
+                self.rotation == other.rotation and
+                self.velocity == other.velocity and
+                self.ego_translation == other.ego_translation and
+                self.num_pts == other.num_pts and
+                self.detection_name == other.detection_name and
+                self.detection_score == other.detection_score and
+                self.attribute_name == other.attribute_name)
+
+    def serialize(self) -> dict:
+        """ Serialize instance into json-friendly format. """
+        return {
+            'sample_token': self.sample_token,
+            'translation': self.translation,
+            'size': self.size,
+            'rotation': self.rotation,
+            'velocity': self.velocity,
+            'ego_translation': self.ego_translation,
+            'num_pts': self.num_pts,
+            'detection_name': self.detection_name,
+            'detection_score': self.detection_score,
+            'attribute_name': self.attribute_name
+        }
+
+    @classmethod
+    def deserialize(cls, content: dict):
+        """ Initialize from serialized content. """
+        return cls(sample_token=content['sample_token'],
+                   translation=tuple(content['translation']),
+                   size=tuple(content['size']),
+                   rotation=tuple(content['rotation']),
+                   velocity=tuple(content['velocity']),
+                   ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+                   else tuple(content['ego_translation']),
+                   num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+                   detection_name=content['detection_name'],
+                   detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+                   attribute_name=content['attribute_name'])
+
+
+class DetectionMetricDataList:
+    """ This stores a set of MetricData in a dict indexed by (name, match-distance). """
+
+    def __init__(self):
+        self.md = {}
+
+    def __getitem__(self, key):
+        return self.md[key]
+
+    def __eq__(self, other):
+        eq = True
+        for key in self.md.keys():
+            eq = eq and self[key] == other[key]
+        return eq
+
+    def get_class_data(self, detection_name: str) -> List[Tuple[DetectionMetricData, float]]:
+        """ Get all the MetricData entries for a certain detection_name. """
+        return [(md, dist_th) for (name, dist_th), md in self.md.items() if name == detection_name]
+
+    def get_dist_data(self, dist_th: float) -> List[Tuple[DetectionMetricData, str]]:
+        """ Get all the MetricData entries for a certain match_distance. """
+        return [(md, detection_name) for (detection_name, dist), md in self.md.items() if dist == dist_th]
+
+    def set(self, detection_name: str, match_distance: float, data: DetectionMetricData):
+        """ Sets the MetricData entry for a certain detection_name and match_distance. """
+        self.md[(detection_name, match_distance)] = data
+
+    def serialize(self) -> dict:
+        return {key[0] + ':' + str(key[1]): value.serialize() for key, value in self.md.items()}
+
+    @classmethod
+    def deserialize(cls, content: dict):
+        mdl = cls()
+        for key, md in content.items():
+            name, distance = key.split(':')
+            mdl.set(name, float(distance), DetectionMetricData.deserialize(md))
+        return mdl
diff --git a/model_examples/MapTR/projects/__init__.py b/model_examples/MapTR/projects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/coco_instance.py b/model_examples/MapTR/projects/configs/_base_/datasets/coco_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6ea4f4562a8118275a444879a884717b55caa15
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/coco_instance.py
@@ -0,0 +1,48 @@
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-3class.py b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..1822af4209432eb45e105112a165668fac87b6c5
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-3class.py
@@ -0,0 +1,140 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Pedestrian', 'Cyclist', 'Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6))
+
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        file_client_args=file_client_args),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=6,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'kitti_infos_train.pkl',
+            split='training',
+            pts_prefix='velodyne_reduced',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            classes=class_names,
+            test_mode=False,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR')),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'kitti_infos_val.pkl',
+        split='training',
+        pts_prefix='velodyne_reduced',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'kitti_infos_val.pkl',
+        split='training',
+        pts_prefix='velodyne_reduced',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'))
+
+evaluation = dict(interval=1, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-car.py b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e81226e2dfdb0e4e802daa8bf0c9f9d19adb125
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/kitti-3d-car.py
@@ -0,0 +1,138 @@
+# dataset settings
+dataset_type = 'KittiDataset'
+data_root = 'data/kitti/'
+class_names = ['Car']
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'kitti_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15))
+
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        file_client_args=file_client_args),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='ObjectNoise',
+        num_try=100,
+        translation_std=[1.0, 1.0, 0.5],
+        global_rot_range=[0.0, 0.0],
+        rot_range=[-0.78539816, 0.78539816]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=4,
+        use_dim=4,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=6,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'kitti_infos_train.pkl',
+            split='training',
+            pts_prefix='velodyne_reduced',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            classes=class_names,
+            test_mode=False,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR')),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'kitti_infos_val.pkl',
+        split='training',
+        pts_prefix='velodyne_reduced',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'kitti_infos_val.pkl',
+        split='training',
+        pts_prefix='velodyne_reduced',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'))
+
+evaluation = dict(interval=1, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/lyft-3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..71baff04c5b5345ab3d7340607c3496a8befc5fa
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+    'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=True,
+    use_camera=False,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/lyft/': 's3://lyft/lyft/',
+#         'data/lyft/': 's3://lyft/lyft/'
+#    }))
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_test.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True))
+# For Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/nuim_instance.py b/model_examples/MapTR/projects/configs/_base_/datasets/nuim_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..82fce56bf6f2ad2578a0426e71fc13c2feb8bf97
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/nuim_instance.py
@@ -0,0 +1,59 @@
+dataset_type = 'CocoDataset'
+data_root = 'data/nuimages/'
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+    dict(
+        type='Resize',
+        img_scale=[(1280, 720), (1920, 1080)],
+        multiscale_mode='range',
+        keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1600, 900),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-train.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline))
+evaluation = dict(metric=['bbox', 'segm'])
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/nus-3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..154817175df8de5768c1d56bc35efaa0da99415c
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/nus-3d.py
@@ -0,0 +1,142 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+dataset_type = 'NuScenesDataset'
+data_root = 'data/nuscenes/'
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=True,
+    use_camera=False,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
+#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
+#     }))
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True,
+        box_type_3d='LiDAR'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True,
+        box_type_3d='LiDAR'))
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/nus-mono3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/nus-mono3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..1363a94ce4fbb3b1014e61dd52bc36408f119ce1
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/nus-mono3d.py
@@ -0,0 +1,100 @@
+dataset_type = 'CustomNuScenesMonoDataset'
+data_root = 'data/nuscenes/'
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D'),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D',
+        keys=[
+            'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers2d', 'depths'
+        ]),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFileMono3D'),
+    dict(
+        type='MultiScaleFlipAug',
+        scale_factor=1.0,
+        flip=False,
+        transforms=[
+            dict(type='RandomFlip3D'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['img']),
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(type='LoadImageFromFileMono3D'),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['img'])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        box_type_3d='Camera'),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        box_type_3d='Camera'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json',
+        img_prefix=data_root,
+        classes=class_names,
+        pipeline=test_pipeline,
+        modality=input_modality,
+        test_mode=True,
+        box_type_3d='Camera'))
+evaluation = dict(interval=2)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/range100_lyft-3d.py b/model_examples/MapTR/projects/configs/_base_/datasets/range100_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..efa63ea3f0d351198d609785d971c19d96532844
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/range100_lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-100, -100, -5, 100, 100, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+    'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'LyftDataset'
+data_root = 'data/lyft/'
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=True,
+    use_camera=False,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/lyft/': 's3://lyft/lyft/',
+#         'data/lyft/': 's3://lyft/lyft/'
+#    }))
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_test.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True))
+# For Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/s3dis-3d-5class.py b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis-3d-5class.py
new file mode 100644
index 0000000000000000000000000000000000000000..2422766fa351ee5cf7f0cd5ee5ab61b88e1d0300
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis-3d-5class.py
@@ -0,0 +1,114 @@
+# dataset settings
+dataset_type = 'S3DISDataset'
+data_root = './data/s3dis/'
+class_names = ('table', 'chair', 'sofa', 'bookcase', 'board')
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(type='PointSample', num_points=40000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        # following ScanNet dataset the rotation range is 5 degrees
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0],
+        shift_height=True),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=40000),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type='ConcatDataset',
+            datasets=[
+                dict(
+                    type=dataset_type,
+                    data_root=data_root,
+                    ann_file=data_root + f's3dis_infos_Area_{i}.pkl',
+                    pipeline=train_pipeline,
+                    filter_empty_gt=False,
+                    classes=class_names,
+                    box_type_3d='Depth') for i in train_area
+            ],
+            separate_eval=False)),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='Depth'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py
new file mode 100644
index 0000000000000000000000000000000000000000..39bf5568e01d1a781c1b712e7c20b823e7c90141
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py
@@ -0,0 +1,139 @@
+# dataset settings
+dataset_type = 'S3DISSegDataset'
+data_root = './data/s3dis/'
+class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door',
+               'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter')
+num_points = 4096
+train_area = [1, 2, 3, 4, 6]
+test_area = 5
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping',
+        valid_cat_ids=tuple(range(len(class_names))),
+        max_cat_id=13),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.0,
+        ignore_index=len(class_names),
+        use_normalized_coord=True,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        # a wrapper in order to successfully call test function
+        # actually we don't perform test-time-aug
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.0,
+                flip_ratio_bev_vertical=0.0),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping',
+        valid_cat_ids=tuple(range(len(class_names))),
+        max_cat_id=13),
+    dict(
+        type='DefaultFormatBundle3D',
+        with_label=False,
+        class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    # train on area 1, 2, 3, 4, 6
+    # test on area 5
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=[
+            data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area
+        ],
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        ignore_index=len(class_names),
+        scene_idxs=[
+            data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy'
+            for i in train_area
+        ]),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        ignore_index=len(class_names),
+        scene_idxs=data_root +
+        f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        ignore_index=len(class_names)))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/scannet-3d-18class.py b/model_examples/MapTR/projects/configs/_base_/datasets/scannet-3d-18class.py
new file mode 100644
index 0000000000000000000000000000000000000000..93da1e5870561363fb3686e8288ccf561ca72cd2
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/scannet-3d-18class.py
@@ -0,0 +1,128 @@
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='PointSegClassMapping',
+        valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
+                       36, 39),
+        max_cat_id=40),
+    dict(type='PointSample', num_points=40000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0],
+        shift_height=True),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=40000),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            classes=class_names,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth')),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='Depth'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/scannet_seg-3d-20class.py b/model_examples/MapTR/projects/configs/_base_/datasets/scannet_seg-3d-20class.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf73b09c8afa9317fa7077f5f67b1fae3306c1b7
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/scannet_seg-3d-20class.py
@@ -0,0 +1,132 @@
+# dataset settings
+dataset_type = 'ScanNetSegDataset'
+data_root = './data/scannet/'
+class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table',
+               'door', 'window', 'bookshelf', 'picture', 'counter', 'desk',
+               'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink',
+               'bathtub', 'otherfurniture')
+num_points = 8192
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping',
+        valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
+                       33, 34, 36, 39),
+        max_cat_id=40),
+    dict(
+        type='IndoorPatchPointSample',
+        num_points=num_points,
+        block_size=1.5,
+        ignore_index=len(class_names),
+        use_normalized_coord=False,
+        enlarge_size=0.2,
+        min_unique_num=None),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(type='NormalizePointsColor', color_mean=None),
+    dict(
+        # a wrapper in order to successfully call test function
+        # actually we don't perform test-time-aug
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.0,
+                flip_ratio_bev_vertical=0.0),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+# we need to load gt seg_mask!
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        use_color=True,
+        load_dim=6,
+        use_dim=[0, 1, 2, 3, 4, 5]),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=False,
+        with_label_3d=False,
+        with_mask_3d=False,
+        with_seg_3d=True),
+    dict(
+        type='PointSegClassMapping',
+        valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
+                       33, 34, 36, 39),
+        max_cat_id=40),
+    dict(
+        type='DefaultFormatBundle3D',
+        with_label=False,
+        class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'pts_semantic_mask'])
+]
+
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'scannet_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        ignore_index=len(class_names),
+        scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        ignore_index=len(class_names)),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        ignore_index=len(class_names)))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/sunrgbd-3d-10class.py b/model_examples/MapTR/projects/configs/_base_/datasets/sunrgbd-3d-10class.py
new file mode 100644
index 0000000000000000000000000000000000000000..7121b75bbf0679c55f706ed07294eb2fa3495cc0
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/sunrgbd-3d-10class.py
@@ -0,0 +1,107 @@
+dataset_type = 'SUNRGBDDataset'
+data_root = 'data/sunrgbd/'
+class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+               'night_stand', 'bookshelf', 'bathtub')
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(type='LoadAnnotations3D'),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(type='PointSample', num_points=20000),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+            ),
+            dict(type='PointSample', num_points=20000),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=False,
+        load_dim=6,
+        use_dim=[0, 1, 2]),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'sunrgbd_infos_train.pkl',
+            pipeline=train_pipeline,
+            classes=class_names,
+            filter_empty_gt=False,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth')),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'sunrgbd_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='Depth'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'sunrgbd_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='Depth'))
+
+evaluation = dict(pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-3class.py b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-3class.py
new file mode 100644
index 0000000000000000000000000000000000000000..920ac154d68cb07669642300fafd52d179be5392
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-3class.py
@@ -0,0 +1,145 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'LidarWaymoDataset'
+data_root = 'data/waymo-full/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        file_client_args=file_client_args))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        file_client_args=file_client_args),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'waymo_infos_train.pkl',
+            split='training',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            classes=class_names,
+            test_mode=False,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5)),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'waymo_infos_val.pkl',
+        split='training',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'waymo_infos_val.pkl',
+        split='training',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-car.py b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-car.py
new file mode 100644
index 0000000000000000000000000000000000000000..02e262721b29ede7e29d0d0046eba243f2c82249
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/datasets/waymoD5-3d-car.py
@@ -0,0 +1,143 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'WaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+class_names = ['Car']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=True, use_camera=False)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
+    classes=class_names,
+    sample_groups=dict(Car=15),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        file_client_args=file_client_args))
+
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        file_client_args=file_client_args),
+    dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.78539816, 0.78539816],
+        scale_ratio_range=[0.95, 1.05]),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=6,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'waymo_infos_train.pkl',
+            split='training',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            classes=class_names,
+            test_mode=False,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5)),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'waymo_infos_val.pkl',
+        split='training',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'waymo_infos_val.pkl',
+        split='training',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/_base_/default_runtime.py b/model_examples/MapTR/projects/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e85b69abed5f51238da4f183163066073664350
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/default_runtime.py
@@ -0,0 +1,18 @@
+checkpoint_config = dict(interval=1)
+# yapf:disable push
+# By default we use textlogger hook and tensorboard
+# For more loggers see
+# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = None
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/model_examples/MapTR/projects/configs/_base_/models/3dssd.py b/model_examples/MapTR/projects/configs/_base_/models/3dssd.py
new file mode 100644
index 0000000000000000000000000000000000000000..55344c7ddff660dc0306542d94260efad39f8df7
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/3dssd.py
@@ -0,0 +1,77 @@
+model = dict(
+    type='SSD3DNet',
+    backbone=dict(
+        type='PointNet2SAMSG',
+        in_channels=4,
+        num_points=(4096, 512, (256, 256)),
+        radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
+        num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)),
+        sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)),
+                     ((64, 64, 128), (64, 64, 128), (64, 96, 128)),
+                     ((128, 128, 256), (128, 192, 256), (128, 256, 256))),
+        aggregation_channels=(64, 128, 256),
+        fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
+        fps_sample_range_lists=((-1), (-1), (512, -1)),
+        norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+        sa_cfg=dict(
+            type='PointSAModuleMSG',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False)),
+    bbox_head=dict(
+        type='SSD3DHead',
+        in_channels=256,
+        vote_module_cfg=dict(
+            in_channels=256,
+            num_points=256,
+            gt_per_seed=1,
+            conv_channels=(128, ),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+            with_res_feat=False,
+            vote_xyz_range=(3.0, 3.0, 2.0)),
+        vote_aggregation_cfg=dict(
+            type='PointSAModuleMSG',
+            num_point=256,
+            radii=(4.8, 6.4),
+            sample_nums=(16, 32),
+            mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)),
+            norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
+            use_xyz=True,
+            normalize_xyz=False,
+            bias=True),
+        pred_layer_cfg=dict(
+            in_channels=1536,
+            shared_conv_channels=(512, 128),
+            cls_conv_channels=(128, ),
+            reg_conv_channels=(128, ),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+            bias=True),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
+        objectness_loss=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=True,
+            reduction='sum',
+            loss_weight=1.0),
+        center_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        dir_class_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        corner_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
+        vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05),
+    test_cfg=dict(
+        nms_cfg=dict(type='nms', iou_thr=0.1),
+        sample_mod='spec',
+        score_thr=0.0,
+        per_class_proposal=True,
+        max_output_num=100))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/model_examples/MapTR/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb9e0a8f06d3f597e90156efc9f30264c678fe85
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
@@ -0,0 +1,200 @@
+# model settings
+model = dict(
+    type='CascadeRCNN',
+    pretrained='torchvision://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
+    roi_head=dict(
+        type='CascadeRoIHead',
+        num_stages=3,
+        stage_loss_weights=[1, 0.5, 0.25],
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=[
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.1, 0.1, 0.2, 0.2]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.05, 0.05, 0.1, 0.1]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                               loss_weight=1.0)),
+            dict(
+                type='Shared2FCBBoxHead',
+                in_channels=256,
+                fc_out_channels=1024,
+                roi_feat_size=7,
+                num_classes=80,
+                bbox_coder=dict(
+                    type='DeltaXYWHBBoxCoder',
+                    target_means=[0., 0., 0., 0.],
+                    target_stds=[0.033, 0.033, 0.067, 0.067]),
+                reg_class_agnostic=True,
+                loss_cls=dict(
+                    type='CrossEntropyLoss',
+                    use_sigmoid=False,
+                    loss_weight=1.0),
+                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
+        ],
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_across_levels=False,
+            nms_pre=2000,
+            nms_post=2000,
+            max_num=2000,
+            nms_thr=0.7,
+            min_bbox_size=0),
+        rcnn=[
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.5,
+                    min_pos_iou=0.5,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.6,
+                    min_pos_iou=0.6,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False),
+            dict(
+                assigner=dict(
+                    type='MaxIoUAssigner',
+                    pos_iou_thr=0.7,
+                    neg_iou_thr=0.7,
+                    min_pos_iou=0.7,
+                    match_low_quality=False,
+                    ignore_iof_thr=-1),
+                sampler=dict(
+                    type='RandomSampler',
+                    num=512,
+                    pos_fraction=0.25,
+                    neg_pos_ub=-1,
+                    add_gt_as_proposals=True),
+                mask_size=28,
+                pos_weight=-1,
+                debug=False)
+        ]),
+    test_cfg=dict(
+        rpn=dict(
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_post=1000,
+            max_num=1000,
+            nms_thr=0.7,
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..efdce59c6d59c6564c6558a7a800852fe14314d7
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py
@@ -0,0 +1,83 @@
+voxel_size = [0.1, 0.1, 0.2]
+model = dict(
+    type='CenterPoint',
+    pts_voxel_layer=dict(
+        max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)),
+    pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+    pts_middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=5,
+        sparse_shape=[41, 1024, 1024],
+        output_channels=128,
+        order=('conv', 'norm', 'act'),
+        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                      128)),
+        encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
+        block_type='basicblock'),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        out_channels=[128, 256],
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        conv_cfg=dict(type='Conv2d', bias=False)),
+    pts_neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        out_channels=[256, 256],
+        upsample_strides=[1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True),
+    pts_bbox_head=dict(
+        type='CenterHead',
+        in_channels=sum([256, 256]),
+        tasks=[
+            dict(num_class=1, class_names=['car']),
+            dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+            dict(num_class=2, class_names=['bus', 'trailer']),
+            dict(num_class=1, class_names=['barrier']),
+            dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+            dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type='CenterPointBBoxCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.1,
+            out_size_factor=8,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[1024, 1024, 40],
+            voxel_size=voxel_size,
+            out_size_factor=8,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            out_size_factor=8,
+            voxel_size=voxel_size[:2],
+            nms_type='rotate',
+            pre_max_size=1000,
+            post_max_size=83,
+            nms_thr=0.2)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..311d76373bd261ed8827409be68db0e577b38327
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py
@@ -0,0 +1,83 @@
+voxel_size = [0.2, 0.2, 8]
+model = dict(
+    type='CenterPoint',
+    pts_voxel_layer=dict(
+        max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)),
+    pts_voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=5,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=(0.2, 0.2, 8),
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+        legacy=False),
+    pts_middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        out_channels=[64, 128, 256],
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        conv_cfg=dict(type='Conv2d', bias=False)),
+    pts_neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        out_channels=[128, 128, 128],
+        upsample_strides=[0.5, 1, 2],
+        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
+        upsample_cfg=dict(type='deconv', bias=False),
+        use_conv_for_no_stride=True),
+    pts_bbox_head=dict(
+        type='CenterHead',
+        in_channels=sum([128, 128, 128]),
+        tasks=[
+            dict(num_class=1, class_names=['car']),
+            dict(num_class=2, class_names=['truck', 'construction_vehicle']),
+            dict(num_class=2, class_names=['bus', 'trailer']),
+            dict(num_class=1, class_names=['barrier']),
+            dict(num_class=2, class_names=['motorcycle', 'bicycle']),
+            dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type='CenterPointBBoxCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.1,
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            grid_size=[512, 512, 1],
+            voxel_size=voxel_size,
+            out_size_factor=4,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            pc_range=[-51.2, -51.2],
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            nms_type='rotate',
+            pre_max_size=1000,
+            post_max_size=83,
+            nms_thr=0.2)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/fcos3d.py b/model_examples/MapTR/projects/configs/_base_/models/fcos3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ea90760519d6205d75af6a39f927503de89aad
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/fcos3d.py
@@ -0,0 +1,74 @@
+model = dict(
+    type='FCOSMono3D',
+    pretrained='open-mmlab://detectron2/resnet101_caffe',
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs='on_output',
+        num_outs=5,
+        relu_before_extra_convs=True),
+    bbox_head=dict(
+        type='FCOSMono3DHead',
+        num_classes=10,
+        in_channels=256,
+        stacked_convs=2,
+        feat_channels=256,
+        use_direction_classifier=True,
+        diff_rad_by_sin=True,
+        pred_attrs=True,
+        pred_velo=True,
+        dir_offset=0.7854,  # pi/4
+        strides=[8, 16, 32, 64, 128],
+        group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
+        cls_branch=(256, ),
+        reg_branch=(
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
+            ()  # velo
+        ),
+        dir_branch=(256, ),
+        attr_branch=(256, ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_attr=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_centerness=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        norm_on_bbox=True,
+        centerness_on_reg=True,
+        center_sampling=True,
+        conv_bias=True,
+        dcn_on_last_conv=True),
+    train_cfg=dict(
+        allowed_border=0,
+        code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05],
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_pre=1000,
+        nms_thr=0.8,
+        score_thr=0.05,
+        min_bbox_size=0,
+        max_per_img=200))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/groupfree3d.py b/model_examples/MapTR/projects/configs/_base_/models/groupfree3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..077d049662fe16b91639af4a5923a4e8e540148d
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/groupfree3d.py
@@ -0,0 +1,71 @@
+model = dict(
+    type='GroupFree3DNet',
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=3,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                     (128, 128, 256)),
+        fp_channels=((256, 256), (256, 288)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        type='GroupFree3DHead',
+        in_channels=288,
+        num_decoder_layers=6,
+        num_proposal=256,
+        transformerlayers=dict(
+            type='BaseTransformerLayer',
+            attn_cfgs=dict(
+                type='GroupFree3DMHA',
+                embed_dims=288,
+                num_heads=8,
+                attn_drop=0.1,
+                dropout_layer=dict(type='Dropout', drop_prob=0.1)),
+            ffn_cfgs=dict(
+                embed_dims=288,
+                feedforward_channels=2048,
+                ffn_drop=0.1,
+                act_cfg=dict(type='ReLU', inplace=True)),
+            operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn',
+                             'norm')),
+        pred_layer_cfg=dict(
+            in_channels=288, shared_conv_channels=(288, 288), bias=True),
+        sampling_objectness_loss=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        dir_class_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0),
+        semantic_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(sample_mod='kps'),
+    test_cfg=dict(
+        sample_mod='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last'))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/h3dnet.py b/model_examples/MapTR/projects/configs/_base_/models/h3dnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..760566744f6484cde261f87f0d95a1182786779c
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/h3dnet.py
@@ -0,0 +1,341 @@
+primitive_z_cfg = dict(
+    type='PrimitiveHead',
+    num_dims=2,
+    num_classes=18,
+    primitive_mode='z',
+    upper_thresh=100.0,
+    surface_thresh=0.5,
+    vote_module_cfg=dict(
+        in_channels=256,
+        vote_per_seed=1,
+        gt_per_seed=1,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        norm_feats=True,
+        vote_loss=dict(
+            type='ChamferDistance',
+            mode='l1',
+            reduction='none',
+            loss_dst_weight=10.0)),
+    vote_aggregation_cfg=dict(
+        type='PointSAModule',
+        num_point=1024,
+        radius=0.3,
+        num_sample=16,
+        mlp_channels=[256, 128, 128, 128],
+        use_xyz=True,
+        normalize_xyz=True),
+    feat_channels=(128, 128),
+    conv_cfg=dict(type='Conv1d'),
+    norm_cfg=dict(type='BN1d'),
+    objectness_loss=dict(
+        type='CrossEntropyLoss',
+        class_weight=[0.4, 0.6],
+        reduction='mean',
+        loss_weight=30.0),
+    center_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_reg_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_cls_loss=dict(
+        type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+    train_cfg=dict(
+        dist_thresh=0.2,
+        var_thresh=1e-2,
+        lower_thresh=1e-6,
+        num_point=100,
+        num_point_line=10,
+        line_thresh=0.2))
+
+primitive_xy_cfg = dict(
+    type='PrimitiveHead',
+    num_dims=1,
+    num_classes=18,
+    primitive_mode='xy',
+    upper_thresh=100.0,
+    surface_thresh=0.5,
+    vote_module_cfg=dict(
+        in_channels=256,
+        vote_per_seed=1,
+        gt_per_seed=1,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        norm_feats=True,
+        vote_loss=dict(
+            type='ChamferDistance',
+            mode='l1',
+            reduction='none',
+            loss_dst_weight=10.0)),
+    vote_aggregation_cfg=dict(
+        type='PointSAModule',
+        num_point=1024,
+        radius=0.3,
+        num_sample=16,
+        mlp_channels=[256, 128, 128, 128],
+        use_xyz=True,
+        normalize_xyz=True),
+    feat_channels=(128, 128),
+    conv_cfg=dict(type='Conv1d'),
+    norm_cfg=dict(type='BN1d'),
+    objectness_loss=dict(
+        type='CrossEntropyLoss',
+        class_weight=[0.4, 0.6],
+        reduction='mean',
+        loss_weight=30.0),
+    center_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_reg_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=0.5,
+        loss_dst_weight=0.5),
+    semantic_cls_loss=dict(
+        type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+    train_cfg=dict(
+        dist_thresh=0.2,
+        var_thresh=1e-2,
+        lower_thresh=1e-6,
+        num_point=100,
+        num_point_line=10,
+        line_thresh=0.2))
+
+primitive_line_cfg = dict(
+    type='PrimitiveHead',
+    num_dims=0,
+    num_classes=18,
+    primitive_mode='line',
+    upper_thresh=100.0,
+    surface_thresh=0.5,
+    vote_module_cfg=dict(
+        in_channels=256,
+        vote_per_seed=1,
+        gt_per_seed=1,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        norm_feats=True,
+        vote_loss=dict(
+            type='ChamferDistance',
+            mode='l1',
+            reduction='none',
+            loss_dst_weight=10.0)),
+    vote_aggregation_cfg=dict(
+        type='PointSAModule',
+        num_point=1024,
+        radius=0.3,
+        num_sample=16,
+        mlp_channels=[256, 128, 128, 128],
+        use_xyz=True,
+        normalize_xyz=True),
+    feat_channels=(128, 128),
+    conv_cfg=dict(type='Conv1d'),
+    norm_cfg=dict(type='BN1d'),
+    objectness_loss=dict(
+        type='CrossEntropyLoss',
+        class_weight=[0.4, 0.6],
+        reduction='mean',
+        loss_weight=30.0),
+    center_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=1.0,
+        loss_dst_weight=1.0),
+    semantic_reg_loss=dict(
+        type='ChamferDistance',
+        mode='l1',
+        reduction='sum',
+        loss_src_weight=1.0,
+        loss_dst_weight=1.0),
+    semantic_cls_loss=dict(
+        type='CrossEntropyLoss', reduction='sum', loss_weight=2.0),
+    train_cfg=dict(
+        dist_thresh=0.2,
+        var_thresh=1e-2,
+        lower_thresh=1e-6,
+        num_point=100,
+        num_point_line=10,
+        line_thresh=0.2))
+
+model = dict(
+    type='H3DNet',
+    backbone=dict(
+        type='MultiBackbone',
+        num_streams=4,
+        suffixes=['net0', 'net1', 'net2', 'net3'],
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01),
+        act_cfg=dict(type='ReLU'),
+        backbones=dict(
+            type='PointNet2SASSG',
+            in_channels=4,
+            num_points=(2048, 1024, 512, 256),
+            radius=(0.2, 0.4, 0.8, 1.2),
+            num_samples=(64, 32, 16, 16),
+            sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                         (128, 128, 256)),
+            fp_channels=((256, 256), (256, 256)),
+            norm_cfg=dict(type='BN2d'),
+            sa_cfg=dict(
+                type='PointSAModule',
+                pool_mod='max',
+                use_xyz=True,
+                normalize_xyz=True))),
+    rpn_head=dict(
+        type='VoteHead',
+        vote_module_cfg=dict(
+            in_channels=256,
+            vote_per_seed=1,
+            gt_per_seed=3,
+            conv_channels=(256, 256),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d'),
+            norm_feats=True,
+            vote_loss=dict(
+                type='ChamferDistance',
+                mode='l1',
+                reduction='none',
+                loss_dst_weight=10.0)),
+        vote_aggregation_cfg=dict(
+            type='PointSAModule',
+            num_point=256,
+            radius=0.3,
+            num_sample=16,
+            mlp_channels=[256, 128, 128, 128],
+            use_xyz=True,
+            normalize_xyz=True),
+        pred_layer_cfg=dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        objectness_loss=dict(
+            type='CrossEntropyLoss',
+            class_weight=[0.2, 0.8],
+            reduction='sum',
+            loss_weight=5.0),
+        center_loss=dict(
+            type='ChamferDistance',
+            mode='l2',
+            reduction='sum',
+            loss_src_weight=10.0,
+            loss_dst_weight=10.0),
+        dir_class_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        semantic_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    roi_head=dict(
+        type='H3DRoIHead',
+        primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg],
+        bbox_head=dict(
+            type='H3DBboxHead',
+            gt_per_seed=3,
+            num_proposal=256,
+            suface_matching_cfg=dict(
+                type='PointSAModule',
+                num_point=256 * 6,
+                radius=0.5,
+                num_sample=32,
+                mlp_channels=[128 + 6, 128, 64, 32],
+                use_xyz=True,
+                normalize_xyz=True),
+            line_matching_cfg=dict(
+                type='PointSAModule',
+                num_point=256 * 12,
+                radius=0.5,
+                num_sample=32,
+                mlp_channels=[128 + 12, 128, 64, 32],
+                use_xyz=True,
+                normalize_xyz=True),
+            feat_channels=(128, 128),
+            primitive_refine_channels=[128, 128, 128],
+            upper_thresh=100.0,
+            surface_thresh=0.5,
+            line_thresh=0.5,
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d'),
+            objectness_loss=dict(
+                type='CrossEntropyLoss',
+                class_weight=[0.2, 0.8],
+                reduction='sum',
+                loss_weight=5.0),
+            center_loss=dict(
+                type='ChamferDistance',
+                mode='l2',
+                reduction='sum',
+                loss_src_weight=10.0,
+                loss_dst_weight=10.0),
+            dir_class_loss=dict(
+                type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+            dir_res_loss=dict(
+                type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+            size_class_loss=dict(
+                type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+            size_res_loss=dict(
+                type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+            semantic_loss=dict(
+                type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+            cues_objectness_loss=dict(
+                type='CrossEntropyLoss',
+                class_weight=[0.3, 0.7],
+                reduction='mean',
+                loss_weight=5.0),
+            cues_semantic_loss=dict(
+                type='CrossEntropyLoss',
+                class_weight=[0.3, 0.7],
+                reduction='mean',
+                loss_weight=5.0),
+            proposal_objectness_loss=dict(
+                type='CrossEntropyLoss',
+                class_weight=[0.2, 0.8],
+                reduction='none',
+                loss_weight=5.0),
+            primitive_center_loss=dict(
+                type='MSELoss', reduction='none', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
+        rpn_proposal=dict(use_nms=False),
+        rcnn=dict(
+            pos_distance_thr=0.3,
+            neg_distance_thr=0.6,
+            sample_mod='vote',
+            far_threshold=0.6,
+            near_threshold=0.3,
+            mask_surface_threshold=0.3,
+            label_surface_threshold=0.3,
+            mask_line_threshold=0.3,
+            label_line_threshold=0.3)),
+    test_cfg=dict(
+        rpn=dict(
+            sample_mod='seed',
+            nms_thr=0.25,
+            score_thr=0.05,
+            per_class_proposal=True,
+            use_nms=False),
+        rcnn=dict(
+            sample_mod='seed',
+            nms_thr=0.25,
+            score_thr=0.05,
+            per_class_proposal=True)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py
new file mode 100644
index 0000000000000000000000000000000000000000..87c7fe0c6145f0cceadafd7f51c98f209538796d
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py
@@ -0,0 +1,22 @@
+_base_ = './hv_pointpillars_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+model = dict(
+    pts_voxel_layer=dict(
+        max_num_points=20,
+        point_cloud_range=[-80, -80, -5, 80, 80, 3],
+        max_voxels=(60000, 60000)),
+    pts_voxel_encoder=dict(
+        feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]),
+    pts_middle_encoder=dict(output_shape=[640, 640]),
+    pts_bbox_head=dict(
+        num_classes=9,
+        anchor_generator=dict(
+            ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]),
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+    # model training settings (based on nuScenes model settings)
+    train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py
new file mode 100644
index 0000000000000000000000000000000000000000..e153f6c6e69171d29f79b627dd6d152a842d0db2
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py
@@ -0,0 +1,96 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.25, 0.25, 8]
+model = dict(
+    type='MVXFasterRCNN',
+    pts_voxel_layer=dict(
+        max_num_points=64,
+        point_cloud_range=[-50, -50, -5, 50, 50, 3],
+        voxel_size=voxel_size,
+        max_voxels=(30000, 40000)),
+    pts_voxel_encoder=dict(
+        type='HardVFE',
+        in_channels=4,
+        feat_channels=[64, 64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        with_cluster_center=True,
+        with_voxel_center=True,
+        point_cloud_range=[-50, -50, -5, 50, 50, 3],
+        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+    pts_middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    pts_neck=dict(
+        type='FPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        act_cfg=dict(type='ReLU'),
+        in_channels=[64, 128, 256],
+        out_channels=256,
+        start_level=0,
+        num_outs=3),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=10,
+        in_channels=256,
+        feat_channels=256,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
+            scales=[1, 2, 4],
+            sizes=[
+                [0.8660, 2.5981, 1.],  # 1.5/sqrt(3)
+                [0.5774, 1.7321, 1.],  # 1/sqrt(3)
+                [1., 1., 1.],
+                [0.4, 0.4, 1],
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=True),
+        assigner_per_size=False,
+        diff_rad_by_sin=True,
+        dir_offset=0.7854,  # pi/4
+        dir_limit_offset=0,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_thr=0.2,
+            score_thr=0.05,
+            min_bbox_size=0,
+            max_num=500)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cd200f3e4c0dfb7da1823263b22bbcd63d77d63
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py
@@ -0,0 +1,22 @@
+_base_ = './hv_pointpillars_fpn_nus.py'
+
+# model settings (based on nuScenes model settings)
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+model = dict(
+    pts_voxel_layer=dict(
+        max_num_points=20,
+        point_cloud_range=[-100, -100, -5, 100, 100, 3],
+        max_voxels=(60000, 60000)),
+    pts_voxel_encoder=dict(
+        feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]),
+    pts_middle_encoder=dict(output_shape=[800, 800]),
+    pts_bbox_head=dict(
+        num_classes=9,
+        anchor_generator=dict(
+            ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]),
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
+    # model training settings (based on nuScenes model settings)
+    train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..85076d0798bc49e1564d6eabe177d1ae92be0aef
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py
@@ -0,0 +1,93 @@
+voxel_size = [0.16, 0.16, 4]
+
+model = dict(
+    type='VoxelNet',
+    voxel_layer=dict(
+        max_num_points=32,  # max_points_per_voxel
+        point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
+        voxel_size=voxel_size,
+        max_voxels=(16000, 40000)  # (training, testing) max_voxels
+    ),
+    voxel_encoder=dict(
+        type='PillarFeatureNet',
+        in_channels=4,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]),
+    middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
+    backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        layer_nums=[3, 5, 5],
+        layer_strides=[2, 2, 2],
+        out_channels=[64, 128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[
+                [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+                [0, -39.68, -0.6, 70.4, 39.68, -0.6],
+                [0, -39.68, -1.78, 70.4, 39.68, -1.78],
+            ],
+            sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.35,
+                min_pos_iou=0.35,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py
new file mode 100644
index 0000000000000000000000000000000000000000..14873ead474761d96b8487d48765bf2486277bed
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py
@@ -0,0 +1,108 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.32, 0.32, 6]
+model = dict(
+    type='MVXFasterRCNN',
+    pts_voxel_layer=dict(
+        max_num_points=20,
+        point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+        voxel_size=voxel_size,
+        max_voxels=(32000, 32000)),
+    pts_voxel_encoder=dict(
+        type='HardVFE',
+        in_channels=5,
+        feat_channels=[64],
+        with_distance=False,
+        voxel_size=voxel_size,
+        with_cluster_center=True,
+        with_voxel_center=True,
+        point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
+        norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
+    pts_middle_encoder=dict(
+        type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]),
+    pts_backbone=dict(
+        type='SECOND',
+        in_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        layer_nums=[3, 5, 5],
+        layer_strides=[1, 2, 2],
+        out_channels=[64, 128, 256]),
+    pts_neck=dict(
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[64, 128, 256],
+        upsample_strides=[1, 2, 4],
+        out_channels=[128, 128, 128]),
+    pts_bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=384,
+        feat_channels=384,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345],
+                    [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188],
+                    [-74.88, -74.88, 0, 74.88, 74.88, 0]],
+            sizes=[
+                [2.08, 4.73, 1.77],  # car
+                [0.84, 1.81, 1.77],  # cyclist
+                [0.84, 0.91, 1.74]  # pedestrian
+            ],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        dir_offset=0.7854,  # pi/4
+        dir_limit_offset=0,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            assigner=[
+                dict(  # car
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.4,
+                    min_pos_iou=0.4,
+                    ignore_iof_thr=-1),
+                dict(  # cyclist
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+                dict(  # pedestrian
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.3,
+                    min_pos_iou=0.3,
+                    ignore_iof_thr=-1),
+            ],
+            allowed_border=0,
+            code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        pts=dict(
+            use_rotate_nms=True,
+            nms_across_levels=False,
+            nms_pre=4096,
+            nms_thr=0.25,
+            score_thr=0.1,
+            min_bbox_size=0,
+            max_num=500)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_kitti.py b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_kitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bf18abe1df08680cc2bb86dfb7b445af4d63ec8
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_kitti.py
@@ -0,0 +1,89 @@
+voxel_size = [0.05, 0.05, 0.1]
+
+model = dict(
+    type='VoxelNet',
+    voxel_layer=dict(
+        max_num_points=5,
+        point_cloud_range=[0, -40, -3, 70.4, 40, 1],
+        voxel_size=voxel_size,
+        max_voxels=(16000, 40000)),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                [0, -40.0, -1.78, 70.4, 40.0, -1.78],
+            ],
+            sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # for Pedestrian
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.35,
+                neg_iou_thr=0.2,
+                min_pos_iou=0.2,
+                ignore_iof_thr=-1),
+            dict(  # for Cyclist
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.35,
+                neg_iou_thr=0.2,
+                min_pos_iou=0.2,
+                ignore_iof_thr=-1),
+            dict(  # for Car
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.6,
+                neg_iou_thr=0.45,
+                min_pos_iou=0.45,
+                ignore_iof_thr=-1),
+        ],
+        allowed_border=0,
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_thr=0.01,
+        score_thr=0.1,
+        min_bbox_size=0,
+        nms_pre=100,
+        max_num=50))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_waymo.py b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_waymo.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb9bd3ae5cd6c94e56aa9d88765746853ca58f3e
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/hv_second_secfpn_waymo.py
@@ -0,0 +1,100 @@
+# model settings
+# Voxel size for voxel encoder
+# Usually voxel size is changed consistently with the point cloud range
+# If point cloud range is modified, do remember to change all related
+# keys in the config.
+voxel_size = [0.08, 0.08, 0.1]
+model = dict(
+    type='VoxelNet',
+    voxel_layer=dict(
+        max_num_points=10,
+        point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4],
+        voxel_size=voxel_size,
+        max_voxels=(80000, 90000)),
+    voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
+    middle_encoder=dict(
+        type='SparseEncoder',
+        in_channels=5,
+        sparse_shape=[61, 1280, 1920],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=384,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    bbox_head=dict(
+        type='Anchor3DHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345],
+                    [-76.8, -51.2, 0, 76.8, 51.2, 0],
+                    [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]],
+            sizes=[
+                [2.08, 4.73, 1.77],  # car
+                [0.84, 0.91, 1.74],  # pedestrian
+                [0.84, 1.81, 1.77]  # cyclist
+            ],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        dir_offset=0.7854,  # pi/4
+        dir_limit_offset=0,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        assigner=[
+            dict(  # car
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.55,
+                neg_iou_thr=0.4,
+                min_pos_iou=0.4,
+                ignore_iof_thr=-1),
+            dict(  # pedestrian
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1),
+            dict(  # cyclist
+                type='MaxIoUAssigner',
+                iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                ignore_iof_thr=-1)
+        ],
+        allowed_border=0,
+        code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        pos_weight=-1,
+        debug=False),
+    test_cfg=dict(
+        use_rotate_nms=True,
+        nms_across_levels=False,
+        nms_pre=4096,
+        nms_thr=0.25,
+        score_thr=0.1,
+        min_bbox_size=0,
+        max_num=500))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/imvotenet_image.py b/model_examples/MapTR/projects/configs/_base_/models/imvotenet_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..981f8bc9be90a3c2d0ff1edfef3cb3ce91d20d41
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/imvotenet_image.py
@@ -0,0 +1,108 @@
+model = dict(
+    type='ImVoteNet',
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    img_rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    img_roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=10,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+
+    # model training and testing settings
+    train_cfg=dict(
+        img_rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        img_rpn_proposal=dict(
+            nms_across_levels=False,
+            nms_pre=2000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        img_rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        img_rpn=dict(
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_post=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        img_rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/mask_rcnn_r50_fpn.py b/model_examples/MapTR/projects/configs/_base_/models/mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5d5e32b0427cf29b7240b26c7f506c283ae6c04
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/mask_rcnn_r50_fpn.py
@@ -0,0 +1,124 @@
+# model settings
+model = dict(
+    type='MaskRCNN',
+    pretrained='torchvision://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+        mask_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        mask_head=dict(
+            type='FCNMaskHead',
+            num_convs=4,
+            in_channels=256,
+            conv_out_channels=256,
+            num_classes=80,
+            loss_mask=dict(
+                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_across_levels=False,
+            nms_pre=2000,
+            nms_post=1000,
+            max_num=1000,
+            nms_thr=0.7,
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            mask_size=28,
+            pos_weight=-1,
+            debug=False)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_across_levels=False,
+            nms_pre=1000,
+            nms_post=1000,
+            max_num=1000,
+            nms_thr=0.7,
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100,
+            mask_thr_binary=0.5)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/paconv_cuda_ssg.py b/model_examples/MapTR/projects/configs/_base_/models/paconv_cuda_ssg.py
new file mode 100644
index 0000000000000000000000000000000000000000..f513bd4a2f94964f70dba926ef03b427a795e417
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/paconv_cuda_ssg.py
@@ -0,0 +1,7 @@
+_base_ = './paconv_ssg.py'
+
+model = dict(
+    backbone=dict(
+        sa_cfg=dict(
+            type='PAConvCUDASAModule',
+            scorenet_cfg=dict(mlp_channels=[8, 16, 16]))))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/paconv_ssg.py b/model_examples/MapTR/projects/configs/_base_/models/paconv_ssg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4f1ed39373b40e0871bc97dafaf664ff68594d
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/paconv_ssg.py
@@ -0,0 +1,49 @@
+# model settings
+model = dict(
+    type='EncoderDecoder3D',
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=9,  # [xyz, rgb, normalized_xyz]
+        num_points=(1024, 256, 64, 16),
+        radius=(None, None, None, None),  # use kNN instead of ball query
+        num_samples=(32, 32, 32, 32),
+        sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+                                                                    512)),
+        fp_channels=(),
+        norm_cfg=dict(type='BN2d', momentum=0.1),
+        sa_cfg=dict(
+            type='PAConvSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False,
+            paconv_num_kernels=[16, 16, 16],
+            paconv_kernel_input='w_neighbor',
+            scorenet_input='w_neighbor_dist',
+            scorenet_cfg=dict(
+                mlp_channels=[16, 16, 16],
+                score_norm='softmax',
+                temp_factor=1.0,
+                last_bn=False))),
+    decode_head=dict(
+        type='PAConvHead',
+        # PAConv model's decoder takes skip connections from beckbone
+        # different from PointNet++, it also concats input features in the last
+        # level of decoder, leading to `128 + 6` as the channel number
+        fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+                     (128 + 6, 128, 128, 128)),
+        channels=128,
+        dropout_ratio=0.5,
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='ReLU'),
+        loss_decode=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,  # should be modified with dataset
+            loss_weight=1.0)),
+    # correlation loss to regularize PAConv's kernel weights
+    loss_regularization=dict(
+        type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide'))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/parta2.py b/model_examples/MapTR/projects/configs/_base_/models/parta2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c5ae9a66372c404923b21f5ee37dfcacd7347ec
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/parta2.py
@@ -0,0 +1,201 @@
+# model settings
+voxel_size = [0.05, 0.05, 0.1]
+point_cloud_range = [0, -40, -3, 70.4, 40, 1]
+
+model = dict(
+    type='PartA2',
+    voxel_layer=dict(
+        max_num_points=5,  # max_points_per_voxel
+        point_cloud_range=point_cloud_range,
+        voxel_size=voxel_size,
+        max_voxels=(16000, 40000)  # (training, testing) max_voxels
+    ),
+    voxel_encoder=dict(type='HardSimpleVFE'),
+    middle_encoder=dict(
+        type='SparseUNet',
+        in_channels=4,
+        sparse_shape=[41, 1600, 1408],
+        order=('conv', 'norm', 'act')),
+    backbone=dict(
+        type='SECOND',
+        in_channels=256,
+        layer_nums=[5, 5],
+        layer_strides=[1, 2],
+        out_channels=[128, 256]),
+    neck=dict(
+        type='SECONDFPN',
+        in_channels=[128, 256],
+        upsample_strides=[1, 2],
+        out_channels=[256, 256]),
+    rpn_head=dict(
+        type='PartA2RPNHead',
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        use_direction_classifier=True,
+        anchor_generator=dict(
+            type='Anchor3DRangeGenerator',
+            ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -0.6, 70.4, 40.0, -0.6],
+                    [0, -40.0, -1.78, 70.4, 40.0, -1.78]],
+            sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
+            rotations=[0, 1.57],
+            reshape_out=False),
+        diff_rad_by_sin=True,
+        assigner_per_size=True,
+        assign_per_class=True,
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
+        loss_dir=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
+    roi_head=dict(
+        type='PartAggregationROIHead',
+        num_classes=3,
+        semantic_head=dict(
+            type='PointwiseSemanticHead',
+            in_channels=16,
+            extra_width=0.2,
+            seg_score_thr=0.3,
+            num_classes=3,
+            loss_seg=dict(
+                type='FocalLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_part=dict(
+                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+        seg_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='max')),
+        part_roi_extractor=dict(
+            type='Single3DRoIAwareExtractor',
+            roi_layer=dict(
+                type='RoIAwarePool3d',
+                out_size=14,
+                max_pts_per_voxel=128,
+                mode='avg')),
+        bbox_head=dict(
+            type='PartA2BboxHead',
+            num_classes=3,
+            seg_in_channels=16,
+            part_in_channels=4,
+            seg_conv_channels=[64, 64],
+            part_conv_channels=[64, 64],
+            merge_conv_channels=[128, 128],
+            down_conv_channels=[128, 256],
+            bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
+            shared_fc_channels=[256, 512, 512, 512],
+            cls_channels=[256, 256],
+            reg_channels=[256, 256],
+            dropout_ratio=0.1,
+            roi_feat_size=14,
+            with_corner_loss=True,
+            loss_bbox=dict(
+                type='SmoothL1Loss',
+                beta=1.0 / 9.0,
+                reduction='sum',
+                loss_weight=1.0),
+            loss_cls=dict(
+                type='CrossEntropyLoss',
+                use_sigmoid=True,
+                reduction='sum',
+                loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.5,
+                    neg_iou_thr=0.35,
+                    min_pos_iou=0.35,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(type='BboxOverlapsNearest3D'),
+                    pos_iou_thr=0.6,
+                    neg_iou_thr=0.45,
+                    min_pos_iou=0.45,
+                    ignore_iof_thr=-1)
+            ],
+            allowed_border=0,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=9000,
+            nms_post=512,
+            max_num=512,
+            nms_thr=0.8,
+            score_thr=0,
+            use_rotate_nms=False),
+        rcnn=dict(
+            assigner=[
+                dict(  # for Pedestrian
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Cyclist
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1),
+                dict(  # for Car
+                    type='MaxIoUAssigner',
+                    iou_calculator=dict(
+                        type='BboxOverlaps3D', coordinate='lidar'),
+                    pos_iou_thr=0.55,
+                    neg_iou_thr=0.55,
+                    min_pos_iou=0.55,
+                    ignore_iof_thr=-1)
+            ],
+            sampler=dict(
+                type='IoUNegPiecewiseSampler',
+                num=128,
+                pos_fraction=0.55,
+                neg_piece_fractions=[0.8, 0.2],
+                neg_iou_piece_thrs=[0.55, 0.1],
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False,
+                return_iou=True),
+            cls_pos_thr=0.75,
+            cls_neg_thr=0.25)),
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1024,
+            nms_post=100,
+            max_num=100,
+            nms_thr=0.7,
+            score_thr=0,
+            use_rotate_nms=True),
+        rcnn=dict(
+            use_rotate_nms=True,
+            use_raw_score=True,
+            nms_thr=0.01,
+            score_thr=0.1)))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/pointnet2_msg.py b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_msg.py
new file mode 100644
index 0000000000000000000000000000000000000000..222ab885557984125eb52a934f443870e6c6918d
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_msg.py
@@ -0,0 +1,28 @@
+_base_ = './pointnet2_ssg.py'
+
+# model settings
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='PointNet2SAMSG',
+        in_channels=6,  # [xyz, rgb], should be modified with dataset
+        num_points=(1024, 256, 64, 16),
+        radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)),
+        num_samples=((16, 32), (16, 32), (16, 32), (16, 32)),
+        sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96,
+                                                                    128)),
+                     ((128, 196, 256), (128, 196, 256)), ((256, 256, 512),
+                                                          (256, 384, 512))),
+        aggregation_channels=(None, None, None, None),
+        fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')),
+        fps_sample_range_lists=((-1), (-1), (-1), (-1)),
+        dilated_group=(False, False, False, False),
+        out_indices=(0, 1, 2, 3),
+        sa_cfg=dict(
+            type='PointSAModuleMSG',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False)),
+    decode_head=dict(
+        fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128),
+                     (128, 128, 128, 128))))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/pointnet2_ssg.py b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_ssg.py
new file mode 100644
index 0000000000000000000000000000000000000000..58b4c243ded042612abb1c15c9c175f5e932af38
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/pointnet2_ssg.py
@@ -0,0 +1,35 @@
+# model settings
+model = dict(
+    type='EncoderDecoder3D',
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=6,  # [xyz, rgb], should be modified with dataset
+        num_points=(1024, 256, 64, 16),
+        radius=(0.1, 0.2, 0.4, 0.8),
+        num_samples=(32, 32, 32, 32),
+        sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256,
+                                                                    512)),
+        fp_channels=(),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=False)),
+    decode_head=dict(
+        type='PointNet2Head',
+        fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128),
+                     (128, 128, 128, 128)),
+        channels=128,
+        dropout_ratio=0.5,
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='ReLU'),
+        loss_decode=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            class_weight=None,  # should be modified with dataset
+            loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide'))
diff --git a/model_examples/MapTR/projects/configs/_base_/models/votenet.py b/model_examples/MapTR/projects/configs/_base_/models/votenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..129339dc9eaa3f74c0547a39fa527c14be03743c
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/models/votenet.py
@@ -0,0 +1,73 @@
+model = dict(
+    type='VoteNet',
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=4,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                     (128, 128, 256)),
+        fp_channels=((256, 256), (256, 256)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        type='VoteHead',
+        vote_module_cfg=dict(
+            in_channels=256,
+            vote_per_seed=1,
+            gt_per_seed=3,
+            conv_channels=(256, 256),
+            conv_cfg=dict(type='Conv1d'),
+            norm_cfg=dict(type='BN1d'),
+            norm_feats=True,
+            vote_loss=dict(
+                type='ChamferDistance',
+                mode='l1',
+                reduction='none',
+                loss_dst_weight=10.0)),
+        vote_aggregation_cfg=dict(
+            type='PointSAModule',
+            num_point=256,
+            radius=0.3,
+            num_sample=16,
+            mlp_channels=[256, 128, 128, 128],
+            use_xyz=True,
+            normalize_xyz=True),
+        pred_layer_cfg=dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        objectness_loss=dict(
+            type='CrossEntropyLoss',
+            class_weight=[0.2, 0.8],
+            reduction='sum',
+            loss_weight=5.0),
+        center_loss=dict(
+            type='ChamferDistance',
+            mode='l2',
+            reduction='sum',
+            loss_src_weight=10.0,
+            loss_dst_weight=10.0),
+        dir_class_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
+        semantic_loss=dict(
+            type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(
+        pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
+    test_cfg=dict(
+        sample_mod='seed',
+        nms_thr=0.25,
+        score_thr=0.05,
+        per_class_proposal=True))
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/cosine.py b/model_examples/MapTR/projects/configs/_base_/schedules/cosine.py
new file mode 100644
index 0000000000000000000000000000000000000000..69cb7df87d23846ea7b64fb6d882679e315e55cf
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/cosine.py
@@ -0,0 +1,20 @@
+# This schedule is mainly used by models with dynamic voxelization
+# optimizer
+lr = 0.003  # max learning rate
+optimizer = dict(
+    type='AdamW',
+    lr=lr,
+    betas=(0.95, 0.99),  # the momentum is change during training
+    weight_decay=0.001)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 10,
+    min_lr_ratio=1e-5)
+
+momentum_config = None
+
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_20e.py b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_20e.py
new file mode 100644
index 0000000000000000000000000000000000000000..704740ee5676515213fd30839f5e116c0b4ebfc7
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_20e.py
@@ -0,0 +1,24 @@
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 20. Please change the interval accordingly if you do not
+# use a default schedule.
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='cyclic',
+    target_ratio=(10, 1e-4),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+momentum_config = dict(
+    policy='cyclic',
+    target_ratio=(0.85 / 0.95, 1),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_40e.py b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_40e.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a711acf4f31cca94ea7a10d035282a45f648c9c
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/cyclic_40e.py
@@ -0,0 +1,31 @@
+# The schedule is usually used by models trained on KITTI dataset
+
+# The learning rate set in the cyclic schedule is the initial learning rate
+# rather than the max learning rate. Since the target_ratio is (10, 1e-4),
+# the learning rate will change from 0.0018 to 0.018, than go to 0.0018*1e-4
+lr = 0.0018
+# The optimizer follows the setting in SECOND.Pytorch, but here we use
+# the offcial AdamW optimizer implemented by PyTorch.
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+# We use cyclic learning rate and momentum schedule following SECOND.Pytorch
+# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69  # noqa
+# We implement them in mmcv, for more details, please refer to
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327  # noqa
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130  # noqa
+lr_config = dict(
+    policy='cyclic',
+    target_ratio=(10, 1e-4),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+momentum_config = dict(
+    policy='cyclic',
+    target_ratio=(0.85 / 0.95, 1),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+# Although the max_epochs is 40, this schedule is usually used we
+# RepeatDataset with repeat ratio N, thus the actual max epoch
+# number could be Nx40
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/mmdet_schedule_1x.py b/model_examples/MapTR/projects/configs/_base_/schedules/mmdet_schedule_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..13b3783cbbe93b6c32bc415dc50f633dffa4aec7
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/mmdet_schedule_1x.py
@@ -0,0 +1,11 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/schedule_2x.py b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..afde799d9de1e9c03587b54458938b63b1f7de41
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,14 @@
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[20, 23])
+momentum_config = None
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/schedule_3x.py b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000000000000000000000000000000000000..115cd26b760e749b3ccdd50a6f4d201ea38f824e
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used by models on indoor dataset,
+# e.g., VoteNet on SUNRGBD and ScanNet
+lr = 0.008  # max learning rate
+optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+lr_config = dict(policy='step', warmup=None, step=[24, 32])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py
new file mode 100644
index 0000000000000000000000000000000000000000..04b44e51de071dc9158e31fe7c51420326f0493c
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_150e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=150)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a49484c8b37d3c44b7a2979a3173af6a407b967
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_200e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on ScanNet dataset in segmentation task
+optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=200)
diff --git a/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py
new file mode 100644
index 0000000000000000000000000000000000000000..975a8f9ff8e5140b0f1707490c282998666c71ef
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/_base_/schedules/seg_cosine_50e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001)
+optimizer_config = dict(grad_clip=None)
+lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
+momentum_config = None
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=50)
diff --git a/model_examples/MapTR/projects/configs/bevformer/bevformer_base.py b/model_examples/MapTR/projects/configs/bevformer/bevformer_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..78873d446f8e5f957f89126a2426c5508c7b38a7
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/bevformer/bevformer_base.py
@@ -0,0 +1,257 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 4
+bev_h_ = 200
+bev_w_ = 200
+queue_length = 4 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='BEVFormer',
+    use_grid_mask=True,
+    video_test_mode=True,
+    img_backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN2d', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict
+        stage_with_dcn=(False, False, True, True)),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[512, 1024, 2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=4,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='BEVFormerHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_classes=10,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        transformer=dict(
+            type='PerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=6,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='SpatialCrossAttention',
+                            pc_range=point_cloud_range,
+                            deformable_attention=dict(
+                                type='MSDeformableAttention3D',
+                                embed_dims=_dim_,
+                                num_points=8,
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='DetectionTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='HungarianAssigner3D',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=2e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/bevformer/bevformer_small.py b/model_examples/MapTR/projects/configs/bevformer/bevformer_small.py
new file mode 100644
index 0000000000000000000000000000000000000000..6856e7cd46f3c22fbcb2d9e6364ffe1288b8ff3f
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/bevformer/bevformer_small.py
@@ -0,0 +1,268 @@
+# BEvFormer-small consumes at lease 10500M GPU memory
+# compared to bevformer_base, bevformer_small has
+# smaller BEV: 200*200 -> 150*150
+# less encoder layers: 6 -> 3
+# smaller input size: 1600*900 -> (1600*900)*0.8
+# multi-scale feautres -> single scale features (C5)
+# with_cp of backbone = True
+
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+bev_h_ = 150
+bev_w_ = 150
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='BEVFormer',
+    use_grid_mask=True,
+    video_test_mode=True,
+    img_backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN2d', requires_grad=False),
+        norm_eval=True,
+        style='caffe',
+        with_cp=True, # using checkpoint to save GPU memory
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict
+        stage_with_dcn=(False, False, True, True)),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='BEVFormerHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_classes=10,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        transformer=dict(
+            type='PerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=3,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='SpatialCrossAttention',
+                            pc_range=point_cloud_range,
+                            deformable_attention=dict(
+                                type='MSDeformableAttention3D',
+                                embed_dims=_dim_,
+                                num_points=8,
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='DetectionTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='HungarianAssigner3D',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    # dict(type='PadMultiViewImage', size_divisor=32),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.8]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=2e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth'
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/bevformer/bevformer_tiny.py b/model_examples/MapTR/projects/configs/bevformer/bevformer_tiny.py
new file mode 100644
index 0000000000000000000000000000000000000000..f56d1b2f4dcb6e80440d5308b52047109959ccaa
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/bevformer/bevformer_tiny.py
@@ -0,0 +1,270 @@
+# BEvFormer-tiny consumes at lease 6700M GPU memory
+# compared to bevformer_base, bevformer_tiny has
+# smaller backbone: R101-DCN -> R50
+# smaller BEV: 200*200 -> 50*50
+# less encoder layers: 6 -> 3
+# smaller input size: 1600*900 -> 800*450
+# multi-scale feautres -> single scale features (C5)
+
+
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+bev_h_ = 50
+bev_w_ = 50
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='BEVFormer',
+    use_grid_mask=True,
+    video_test_mode=True,
+    pretrained=dict(img='torchvision://resnet50'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='BEVFormerHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_classes=10,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        transformer=dict(
+            type='PerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=3,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='SpatialCrossAttention',
+                            pc_range=point_cloud_range,
+                            deformable_attention=dict(
+                                type='MSDeformableAttention3D',
+                                embed_dims=_dim_,
+                                num_points=8,
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='DetectionTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='HungarianAssigner3D',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=1,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=2e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py b/model_examples/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..af0eb269048ba786cddb18f8d01a188681ac09c4
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/bevformer_fp16/bevformer_tiny_fp16.py
@@ -0,0 +1,272 @@
+# BEvFormer-tiny consumes at lease 6700M GPU memory
+# compared to bevformer_base, bevformer_tiny has
+# smaller backbone: R101-DCN -> R50
+# smaller BEV: 200*200 -> 50*50
+# less encoder layers: 6 -> 3
+# smaller input size: 1600*900 -> 800*450
+# multi-scale feautres -> single scale features (C5)
+
+
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+voxel_size = [0.2, 0.2, 8]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+bev_h_ = 50
+bev_w_ = 50
+queue_length = 3 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='BEVFormer_fp16',
+    use_grid_mask=True,
+    video_test_mode=True,
+    pretrained=dict(img='torchvision://resnet50'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='BEVFormerHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_classes=10,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        transformer=dict(
+            type='PerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=3,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='SpatialCrossAttention',
+                            pc_range=point_cloud_range,
+                            deformable_attention=dict(
+                                type='MSDeformableAttention3D',
+                                embed_dims=_dim_,
+                                num_points=8,
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='DetectionTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='HungarianAssigner3D',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=8,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=2.8e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+evaluation = dict(interval=1, pipeline=test_pipeline)
+
+runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
+custom_hooks = [dict(type='TransferWeight',priority='LOWEST')]
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/configs/datasets/custom_lyft-3d.py b/model_examples/MapTR/projects/configs/datasets/custom_lyft-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a95d898c91e463b731a08f7c52b8186e99da83a
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/datasets/custom_lyft-3d.py
@@ -0,0 +1,136 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-80, -80, -5, 80, 80, 3]
+# For Lyft we usually do 9-class detection
+class_names = [
+    'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
+    'bicycle', 'pedestrian', 'animal'
+]
+dataset_type = 'CustomLyftDataset'
+data_root = 'data/lyft/'
+# Input modality for Lyft dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=True,
+    use_camera=False,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/lyft/': 's3://lyft/lyft/',
+#         'data/lyft/': 's3://lyft/lyft/'
+#    }))
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'lyft_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True))
+# For Lyft dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/configs/datasets/custom_nus-3d.py b/model_examples/MapTR/projects/configs/datasets/custom_nus-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..af81f9b20d182222d0b69fc26fe32c1e66905a16
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/datasets/custom_nus-3d.py
@@ -0,0 +1,141 @@
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+dataset_type = 'NuScenesDataset_eval_modified'
+data_root = 'data/nuscenes/'
+# Input modality for nuScenes dataset, this is consistent with the submission
+# format which requires the information in input_modality.
+input_modality = dict(
+    use_lidar=True,
+    use_camera=False,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/nuscenes/': 's3://nuscenes/nuscenes/',
+#         'data/nuscenes/': 's3://nuscenes/nuscenes/'
+#     }))
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(type='RandomFlip3D'),
+            dict(
+                type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points'])
+        ])
+]
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+eval_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False),
+    dict(type='Collect3D', keys=['points'])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True,
+        box_type_3d='LiDAR'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_val.pkl',
+        pipeline=test_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=True,
+        box_type_3d='LiDAR'))
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained by 24 epochs by default, we set evaluation
+# interval to be 24. Please change the interval accordingly if you do not
+# use a default schedule.
+evaluation = dict(interval=24, pipeline=eval_pipeline)
diff --git a/model_examples/MapTR/projects/configs/datasets/custom_waymo-3d.py b/model_examples/MapTR/projects/configs/datasets/custom_waymo-3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..4100e13546badb06e69fd0b1ed20158de8acf893
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/datasets/custom_waymo-3d.py
@@ -0,0 +1,112 @@
+# dataset settings
+# D5 in the config name means the whole dataset is divided into 5 folds
+# We only use one fold for efficient experiments
+dataset_type = 'CustomWaymoDataset'
+data_root = 'data/waymo/kitti_format/'
+file_client_args = dict(backend='disk')
+# Uncomment the following if use ceph or other file clients.
+# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
+# for more details.
+# file_client_args = dict(
+#     backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
+
+img_norm_cfg = dict(
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+class_names = ['Car', 'Pedestrian', 'Cyclist']
+point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
+input_modality = dict(use_lidar=False, use_camera=True)
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'waymo_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
+    classes=class_names,
+    sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],
+        file_client_args=file_client_args))
+
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1920, 1280),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+
+# construct a pipeline for data and gt loading in show function
+# please keep its loading function consistent with test_pipeline (e.g. client)
+
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=2,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file=data_root + 'waymo_infos_train.pkl',
+            split='training',
+            pipeline=train_pipeline,
+            modality=input_modality,
+            classes=class_names,
+            test_mode=False,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='LiDAR',
+            # load one frame every five frames
+            load_interval=5)),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'waymo_infos_val.pkl',
+        split='training',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'waymo_infos_val.pkl',
+        split='training',
+        pipeline=test_pipeline,
+        modality=input_modality,
+        classes=class_names,
+        test_mode=True,
+        box_type_3d='LiDAR'))
+
+evaluation = dict(interval=24, pipeline=test_pipeline)
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py b/model_examples/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py
new file mode 100644
index 0000000000000000000000000000000000000000..aba45e30a9a2f40de1ca6137e8f845dd87edaa3e
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_nano_r18_110e.py
@@ -0,0 +1,312 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 80
+bev_w_ = 40
+queue_length = 1 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet18-f37072fd.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=18,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=-1,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[512],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=100,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='GeometrySptialCrossAttention',
+                            pc_range=point_cloud_range,
+                            attention=dict(
+                                type='GeometryKernelAttention',
+                                embed_dims=_dim_,
+                                num_heads=4,
+                                dilation=1,
+                                kernel_size=(3,5),
+                                num_levels=_num_levels_,
+                                im2col_step=192),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=2,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=4,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1,
+                            im2col_step=192),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.2]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.2]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=24,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=4e-3,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=50, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 110
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=5)
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1fdb8c6e20f3b91ce1eaf65edd348a7df16f492
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_fusion_24e.py
@@ -0,0 +1,342 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+lidar_point_cloud_range = [-15.0, -30.0, -5.0, 15.0, 30.0, 3.0]
+voxel_size = [0.1, 0.1, 0.2]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=True,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 200
+bev_w_ = 100
+queue_length = 1 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    modality='fusion',
+    lidar_encoder=dict(
+        voxelize=dict(max_num_points=10,point_cloud_range=lidar_point_cloud_range,voxel_size=voxel_size,max_voxels=[90000, 120000]),
+        backbone=dict(
+            type='SparseEncoder',
+            in_channels=5,
+            sparse_shape=[300, 600, 41],
+            output_channels=128,
+            order=('conv', 'norm', 'act'),
+            encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
+                                                                        128)),
+            encoder_paddings=([0, 0, 1], [0, 0, 1], [0, 0, [1, 1, 0]], [0, 0]),
+            block_type='basicblock'
+        ),
+    ),
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            modality='fusion',
+            fuser=dict(
+                type='ConvFuser',
+                in_channels=[_dim_, 256],
+                out_channels=_dim_,
+            ),
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='GeometrySptialCrossAttention',
+                            pc_range=point_cloud_range,
+                            attention=dict(
+                                type='GeometryKernelAttention',
+                                embed_dims=_dim_,
+                                num_heads=4,
+                                dilation=1,
+                                kernel_size=(3,5),
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+reduce_beams=32
+load_dim=5
+use_dim=5
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='CustomLoadPointsFromFile', coord_type='LIDAR', load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams),
+    dict(type='CustomLoadPointsFromMultiSweeps', sweeps_num=9, load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams, pad_empty_sweeps=True, remove_close=True),
+    dict(type='CustomPointsRangeFilter', point_cloud_range=lidar_point_cloud_range),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'points'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='CustomLoadPointsFromFile', coord_type='LIDAR', load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams),
+    dict(type='CustomLoadPointsFromMultiSweeps', sweeps_num=9, load_dim=load_dim, use_dim=use_dim, reduce_beams=reduce_beams, pad_empty_sweeps=True, remove_close=True),
+    dict(type='CustomPointsRangeFilter', point_cloud_range=lidar_point_cloud_range),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img', 'points'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
+find_unused_parameters=True
+
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py
new file mode 100644
index 0000000000000000000000000000000000000000..9457c677aca7d378d089f04c3003658ff2af08d6
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_110e.py
@@ -0,0 +1,310 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 200
+bev_w_ = 100
+queue_length = 1 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='GeometrySptialCrossAttention',
+                            pc_range=point_cloud_range,
+                            attention=dict(
+                                type='GeometryKernelAttention',
+                                embed_dims=_dim_,
+                                num_heads=4,
+                                dilation=1,
+                                kernel_size=(3,5),
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 110
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=5)
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fcded904a2d7b72e7d9e283f6cd6d2e39efa591
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e.py
@@ -0,0 +1,310 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 200
+bev_w_ = 100
+queue_length = 1 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='GeometrySptialCrossAttention',
+                            pc_range=point_cloud_range,
+                            attention=dict(
+                                type='GeometryKernelAttention',
+                                embed_dims=_dim_,
+                                num_heads=4,
+                                dilation=1,
+                                kernel_size=(3,5),
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..12976bdb5a2c1118f89fb2a31d22806dc3c1a16a
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py
@@ -0,0 +1,308 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 200
+bev_w_ = 100
+queue_length = 1 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v5', #3
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='SpatialCrossAttention',
+                            pc_range=point_cloud_range,
+                            deformable_attention=dict(
+                                type='MSDeformableAttention3D',
+                                embed_dims=_dim_,
+                                num_points=8,
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer_t4.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer_t4.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa65a4cbba579ac465473169b6e442cdd97b32b5
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevformer_t4.py
@@ -0,0 +1,309 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 200
+bev_w_ = 100
+queue_length = 4 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            len_can_bus=6,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='SpatialCrossAttention',
+                            pc_range=point_cloud_range,
+                            deformable_attention=dict(
+                                type='MSDeformableAttention3D',
+                                embed_dims=_dim_,
+                                num_points=8,
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py
new file mode 100644
index 0000000000000000000000000000000000000000..189d67f06f7aaddc3c7523324b2f4f4ff8e52017
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_bevpool.py
@@ -0,0 +1,290 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0,-10.0, 15.0, 30.0, 10.0]
+voxel_size = [0.15, 0.15, 20.0]
+dbound=[1.0, 35.0, 0.5]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 200
+bev_w_ = 100
+queue_length = 1 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='LSSTransform',
+                in_channels=_dim_,
+                out_channels=_dim_,
+                feat_down_sample=32,
+                pc_range=point_cloud_range,
+                voxel_size=voxel_size,
+                dbound=dbound,
+                downsample=2),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_t4.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_t4.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd92fc86b26bfe4c08a77b69ff66c0ef93b818bc
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_24e_t4.py
@@ -0,0 +1,308 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 200
+bev_w_ = 100
+queue_length = 4 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            len_can_bus=6,
+            embed_dims=_dim_,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='GeometrySptialCrossAttention',
+                            pc_range=point_cloud_range,
+                            attention=dict(
+                                type='GeometryKernelAttention',
+                                embed_dims=_dim_,
+                                num_heads=4,
+                                dilation=1,
+                                kernel_size=(3,5),
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomNuScenesLocalMapDataset'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])
+]
+
+test_pipeline = [
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1600, 900),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='RandomScaleImageMultiViewImage', scales=[0.5]),
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_temporal_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+             map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'nuscenes_infos_temporal_val.pkl',
+              map_ann_file=data_root + 'nuscenes_map_anns_val.json',
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+# fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bb0958574d19aacd6232a60433cb68197eb9853
--- /dev/null
+++ b/model_examples/MapTR/projects/configs/maptr/maptr_tiny_r50_av2_24e.py
@@ -0,0 +1,315 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-30.0, -15.0, -2.0, 30.0, 15.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+
+
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing','boundary']
+# fixed_ptsnum_per_line = 20
+# map_classes = ['divider',]
+fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0
+fixed_ptsnum_per_pred_line = 20
+eval_use_same_gt_sample_num_flag=True
+num_map_classes = len(map_classes)
+
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+_dim_ = 256
+_pos_dim_ = _dim_//2
+_ffn_dim_ = _dim_*2
+_num_levels_ = 1
+# bev_h_ = 50
+# bev_w_ = 50
+bev_h_ = 100
+bev_w_ = 200
+queue_length = 1 # each sequence contains `queue_length` frames.
+
+model = dict(
+    type='MapTR',
+    use_grid_mask=True,
+    video_test_mode=False,
+    pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(3,),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='FPN',
+        in_channels=[2048],
+        out_channels=_dim_,
+        start_level=0,
+        add_extra_convs='on_output',
+        num_outs=_num_levels_,
+        relu_before_extra_convs=True),
+    pts_bbox_head=dict(
+        type='MapTRHead',
+        bev_h=bev_h_,
+        bev_w=bev_w_,
+        num_query=900,
+        num_vec=50,
+        num_pts_per_vec=fixed_ptsnum_per_pred_line, # one bbox
+        num_pts_per_gt_vec=fixed_ptsnum_per_gt_line,
+        dir_interval=1,
+        query_embed_type='instance_pts',
+        transform_method='minmax',
+        gt_shift_pts_pattern='v2',
+        num_classes=num_map_classes,
+        in_channels=_dim_,
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        code_size=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0],
+        transformer=dict(
+            type='MapTRPerceptionTransformer',
+            rotate_prev_bev=True,
+            use_shift=True,
+            use_can_bus=True,
+            embed_dims=_dim_,
+            num_cams=7,
+            encoder=dict(
+                type='BEVFormerEncoder',
+                num_layers=1,
+                pc_range=point_cloud_range,
+                num_points_in_pillar=4,
+                return_intermediate=False,
+                transformerlayers=dict(
+                    type='BEVFormerLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='TemporalSelfAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                        dict(
+                            type='GeometrySptialCrossAttention',
+                            pc_range=point_cloud_range,
+                            num_cams=7,
+                            attention=dict(
+                                type='GeometryKernelAttention',
+                                embed_dims=_dim_,
+                                num_heads=4,
+                                dilation=1,
+                                kernel_size=(3,5),
+                                num_levels=_num_levels_),
+                            embed_dims=_dim_,
+                        )
+                    ],
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm'))),
+            decoder=dict(
+                type='MapTRDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=_dim_,
+                            num_heads=8,
+                            dropout=0.1),
+                         dict(
+                            type='CustomMSDeformableAttention',
+                            embed_dims=_dim_,
+                            num_levels=1),
+                    ],
+
+                    feedforward_channels=_ffn_dim_,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='MapTRNMSFreeCoder',
+            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            post_center_range=[-35, -20, -35, -20, 35, 20, 35, 20],
+            pc_range=point_cloud_range,
+            max_num=50,
+            voxel_size=voxel_size,
+            num_classes=num_map_classes),
+        positional_encoding=dict(
+            type='LearnedPositionalEncoding',
+            num_feats=_pos_dim_,
+            row_num_embed=bev_h_,
+            col_num_embed=bev_w_,
+            ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0),
+        loss_pts=dict(type='PtsL1Loss', 
+                      loss_weight=5.0),
+        loss_dir=dict(type='PtsDirCosLoss', loss_weight=0.005)),
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='MapTRAssigner',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),
+            reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'),
+            # reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
+            # iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
+            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0),
+            pts_cost=dict(type='OrderedPtsL1Cost', 
+                      weight=5),
+            pc_range=point_cloud_range))))
+
+dataset_type = 'CustomAV2LocalMapDataset'
+data_root = 'data/argoverse2/sensor/'
+file_client_args = dict(backend='disk')
+
+
+train_pipeline = [
+    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True, padding=True),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='CustomCollect3D', keys=['img'])
+]
+
+test_pipeline = [
+    dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True, padding=True),
+    dict(type='RandomScaleImageMultiViewImage', scales=[0.3]),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+   
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(2048, 2048), # 2048*0.3, 2048*0.3
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(type='PadMultiViewImage', size_divisor=32),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='CustomCollect3D', keys=['img'])
+        ])
+]
+
+data = dict(
+    samples_per_gpu=6,
+    workers_per_gpu=8,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'av2_map_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        bev_size=(bev_h_, bev_w_),
+        pc_range=point_cloud_range,
+        fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+        padding_value=-10000,
+        map_classes=map_classes,
+        queue_length=queue_length,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(type=dataset_type,
+             data_root=data_root,
+             ann_file=data_root + 'av2_map_infos_val.pkl',
+            #  ann_file=data_root + 'av2_map_infos_train.pkl',
+             map_ann_file=data_root + 'av2_map_anns_val.json',
+             load_interval=4, # av2 uses 10 Hz, set to 5, 2HZ the same as nuscenes,
+            #  load_interval=1, # TODO debug
+             pipeline=test_pipeline,  bev_size=(bev_h_, bev_w_),
+             pc_range=point_cloud_range,
+             fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+             eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+             padding_value=-10000,
+             map_classes=map_classes,
+             classes=class_names, modality=input_modality, samples_per_gpu=1),
+    test=dict(type=dataset_type,
+              data_root=data_root,
+              ann_file=data_root + 'av2_map_infos_val.pkl',
+            #   ann_file=data_root + 'av2_map_infos_train.pkl',
+              map_ann_file=data_root + 'av2_map_anns_val.json',
+              load_interval=4, # av2 uses 10 Hz, set to 5, 2HZ the same as nuscenes,
+            #   load_interval=1, # TODO debug
+              pipeline=test_pipeline, bev_size=(bev_h_, bev_w_),
+              pc_range=point_cloud_range,
+              fixed_ptsnum_per_line=fixed_ptsnum_per_gt_line,
+              eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,
+              padding_value=-10000,
+              map_classes=map_classes,
+              classes=class_names, modality=input_modality),
+    shuffler_sampler=dict(type='DistributedGroupSampler'),
+    nonshuffler_sampler=dict(type='DistributedSampler')
+)
+
+optimizer = dict(
+    type='AdamW',
+    lr=6e-4,
+    paramwise_cfg=dict(
+        custom_keys={
+            'img_backbone': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.01)
+
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24
+# total_epochs = 50
+# evaluation = dict(interval=1, pipeline=test_pipeline)
+evaluation = dict(interval=2, pipeline=test_pipeline, metric='chamfer')
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+fp16 = dict(loss_scale=512.)
+checkpoint_config = dict(interval=1)
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..53e32add3335e5f1f7e007457ff740bfcf1c91d5
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/__init__.py
@@ -0,0 +1,13 @@
+from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D
+from .core.bbox.coders.nms_free_coder import NMSFreeCoder
+from .core.bbox.match_costs import BBox3DL1Cost
+from .core.evaluation.eval_hooks import CustomDistEvalHook
+from .datasets.pipelines import (
+  PhotoMetricDistortionMultiViewImage, PadMultiViewImage, 
+  NormalizeMultiviewImage,  CustomCollect3D)
+from .models.backbones.vovnet import VoVNet
+from .models.utils import *
+from .models.opt.adamw import AdamW2
+from .bevformer import *
+from .maptr import *
+from .models.backbones.efficientnet import EfficientNet
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..27df18f846ee77954d3aa11c4c4613aabeea06d4
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py
@@ -0,0 +1,6 @@
+
+from .dense_heads import *
+from .detectors import *
+from .modules import *
+from .runner import *
+from .hooks import *
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..15dff22b7478a0f30151d376d41f3dc46e88ba7d
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py
@@ -0,0 +1,3 @@
+from .train import custom_train_model
+from .mmdet_train import custom_train_detector
+# from .test import custom_multi_gpu_test
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..e57bd225dc33d631849a3aef8db2bae217520658
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
@@ -0,0 +1,200 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+import random
+import warnings
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner,
+                         Fp16OptimizerHook, OptimizerHook, build_optimizer,
+                         build_runner, get_dist_info)
+from mmcv.utils import build_from_cfg
+
+from mmdet.core import EvalHook
+
+from mmdet.datasets import (build_dataset,
+                            replace_ImageToTensor)
+from mmdet.utils import get_root_logger
+import time
+import os.path as osp
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook
+from projects.mmdet3d_plugin.datasets import custom_build_dataset
+def custom_train_detector(model,
+                   dataset,
+                   cfg,
+                   distributed=False,
+                   validate=False,
+                   timestamp=None,
+                   eval_model=None,
+                   meta=None):
+    logger = get_root_logger(cfg.log_level)
+
+    # prepare data loaders
+   
+    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+    #assert len(dataset)==1s
+    if 'imgs_per_gpu' in cfg.data:
+        logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. '
+                       'Please use "samples_per_gpu" instead')
+        if 'samples_per_gpu' in cfg.data:
+            logger.warning(
+                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
+                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
+                f'={cfg.data.imgs_per_gpu} is used in this experiments')
+        else:
+            logger.warning(
+                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
+                f'{cfg.data.imgs_per_gpu} in this experiments')
+        cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu
+
+    data_loaders = [
+        build_dataloader(
+            ds,
+            cfg.data.samples_per_gpu,
+            cfg.data.workers_per_gpu,
+            # cfg.gpus will be ignored if distributed
+            len(cfg.gpu_ids),
+            dist=distributed,
+            seed=cfg.seed,
+            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
+            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
+        ) for ds in dataset
+    ]
+
+    # put model on gpus
+    if distributed:
+        find_unused_parameters = cfg.get('find_unused_parameters', False)
+        # Sets the `find_unused_parameters` parameter in
+        # torch.nn.parallel.DistributedDataParallel
+        model = MMDistributedDataParallel(
+            model.cuda(),
+            device_ids=[torch.cuda.current_device()],
+            broadcast_buffers=False,
+            find_unused_parameters=find_unused_parameters)
+        if eval_model is not None:
+            eval_model = MMDistributedDataParallel(
+                eval_model.cuda(),
+                device_ids=[torch.cuda.current_device()],
+                broadcast_buffers=False,
+                find_unused_parameters=find_unused_parameters)
+    else:
+        model = MMDataParallel(
+            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+        if eval_model is not None:
+            eval_model = MMDataParallel(
+                eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
+
+
+    # build runner
+    optimizer = build_optimizer(model, cfg.optimizer)
+
+    if 'runner' not in cfg:
+        cfg.runner = {
+            'type': 'EpochBasedRunner',
+            'max_epochs': cfg.total_epochs
+        }
+        warnings.warn(
+            'config is now expected to have a `runner` section, '
+            'please set `runner` in your config.', UserWarning)
+    else:
+        if 'total_epochs' in cfg:
+            assert cfg.total_epochs == cfg.runner.max_epochs
+    if eval_model is not None:
+        runner = build_runner(
+            cfg.runner,
+            default_args=dict(
+                model=model,
+                eval_model=eval_model,
+                optimizer=optimizer,
+                work_dir=cfg.work_dir,
+                logger=logger,
+                meta=meta))
+    else:
+        runner = build_runner(
+            cfg.runner,
+            default_args=dict(
+                model=model,
+                optimizer=optimizer,
+                work_dir=cfg.work_dir,
+                logger=logger,
+                meta=meta))
+
+    # an ugly workaround to make .log and .log.json filenames the same
+    runner.timestamp = timestamp
+
+    # fp16 setting
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        optimizer_config = Fp16OptimizerHook(
+            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
+    elif distributed and 'type' not in cfg.optimizer_config:
+        optimizer_config = OptimizerHook(**cfg.optimizer_config)
+    else:
+        optimizer_config = cfg.optimizer_config
+
+    # register hooks
+    runner.register_training_hooks(cfg.lr_config, optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config,
+                                   cfg.get('momentum_config', None))
+    
+    # register profiler hook
+    #trace_config = dict(type='tb_trace', dir_name='work_dir')
+    #profiler_config = dict(on_trace_ready=trace_config)
+    #runner.register_profiler_hook(profiler_config)
+    
+    if distributed:
+        if isinstance(runner, EpochBasedRunner):
+            runner.register_hook(DistSamplerSeedHook())
+
+    # register eval hooks
+    if validate:
+        # Support batch_size > 1 in validation
+        val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1)
+        if val_samples_per_gpu > 1:
+            assert False
+            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+            cfg.data.val.pipeline = replace_ImageToTensor(
+                cfg.data.val.pipeline)
+        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
+
+        val_dataloader = build_dataloader(
+            val_dataset,
+            samples_per_gpu=val_samples_per_gpu,
+            workers_per_gpu=cfg.data.workers_per_gpu,
+            dist=distributed,
+            shuffle=False,
+            shuffler_sampler=cfg.data.shuffler_sampler,  # dict(type='DistributedGroupSampler'),
+            nonshuffler_sampler=cfg.data.nonshuffler_sampler,  # dict(type='DistributedSampler'),
+        )
+        eval_cfg = cfg.get('evaluation', {})
+        eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+        eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_'))
+        eval_hook = CustomDistEvalHook if distributed else EvalHook
+        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
+
+    # user-defined hooks
+    if cfg.get('custom_hooks', None):
+        custom_hooks = cfg.custom_hooks
+        assert isinstance(custom_hooks, list), \
+            f'custom_hooks expect list type, but got {type(custom_hooks)}'
+        for hook_cfg in cfg.custom_hooks:
+            assert isinstance(hook_cfg, dict), \
+                'Each item in custom_hooks expects dict type, but got ' \
+                f'{type(hook_cfg)}'
+            hook_cfg = hook_cfg.copy()
+            priority = hook_cfg.pop('priority', 'NORMAL')
+            hook = build_from_cfg(hook_cfg, HOOKS)
+            runner.register_hook(hook, priority=priority)
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow)
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..03acb14afe79cbec4f0463cd4865fcc7dd07a50f
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py
@@ -0,0 +1,164 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import mmcv
+import torch
+import torch.distributed as dist
+from mmcv.image import tensor2imgs
+from mmcv.runner import get_dist_info
+
+from mmdet.core import encode_mask_results
+
+
+import mmcv
+import numpy as np
+import pycocotools.mask as mask_util
+
+def custom_encode_mask_results(mask_results):
+    """Encode bitmap mask to RLE code. Semantic Masks only
+    Args:
+        mask_results (list | tuple[list]): bitmap mask results.
+            In mask scoring rcnn, mask_results is a tuple of (segm_results,
+            segm_cls_score).
+    Returns:
+        list | tuple: RLE encoded mask.
+    """
+    cls_segms = mask_results
+    num_classes = len(cls_segms)
+    encoded_mask_results = []
+    for i in range(len(cls_segms)):
+        encoded_mask_results.append(
+            mask_util.encode(
+                np.array(
+                    cls_segms[i][:, :, np.newaxis], order='F',
+                        dtype='uint8'))[0])  # encoded with RLE
+    return [encoded_mask_results]
+
+def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+    """Test model with multiple gpus.
+    This method tests model with multiple gpus and collects the results
+    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
+    it encodes results to gpu tensors and use gpu communication for results
+    collection. On cpu mode it saves the results on different gpus to 'tmpdir'
+    and collects them by the rank 0 worker.
+    Args:
+        model (nn.Module): Model to be tested.
+        data_loader (nn.Dataloader): Pytorch data loader.
+        tmpdir (str): Path of directory to save the temporary results from
+            different gpus under cpu mode.
+        gpu_collect (bool): Option to use either gpu or cpu to collect results.
+    Returns:
+        list: The prediction results.
+    """
+    model.eval()
+    bbox_results = []
+    mask_results = []
+    dataset = data_loader.dataset
+    rank, world_size = get_dist_info()
+    if rank == 0:
+        prog_bar = mmcv.ProgressBar(len(dataset))
+    time.sleep(2)  # This line can prevent deadlock problem in some cases.
+    have_mask = False
+    for i, data in enumerate(data_loader):
+        with torch.no_grad():
+            result = model(return_loss=False, rescale=True, **data)
+            # encode mask results
+            if isinstance(result, dict):
+                if 'bbox_results' in result.keys():
+                    bbox_result = result['bbox_results']
+                    batch_size = len(result['bbox_results'])
+                    bbox_results.extend(bbox_result)
+                if 'mask_results' in result.keys() and result['mask_results'] is not None:
+                    mask_result = custom_encode_mask_results(result['mask_results'])
+                    mask_results.extend(mask_result)
+                    have_mask = True
+            else:
+                batch_size = len(result)
+                bbox_results.extend(result)
+
+            #if isinstance(result[0], tuple):
+            #    assert False, 'this code is for instance segmentation, which our code will not utilize.'
+            #    result = [(bbox_results, encode_mask_results(mask_results))
+            #              for bbox_results, mask_results in result]
+        if rank == 0:
+            
+            for _ in range(batch_size * world_size):
+                prog_bar.update()
+
+    # collect results from all ranks
+    if gpu_collect:
+        bbox_results = collect_results_gpu(bbox_results, len(dataset))
+        if have_mask:
+            mask_results = collect_results_gpu(mask_results, len(dataset))
+        else:
+            mask_results = None
+    else:
+        bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
+        tmpdir = tmpdir+'_mask' if tmpdir is not None else None
+        if have_mask:
+            mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir)
+        else:
+            mask_results = None
+
+    if mask_results is None:
+        return bbox_results
+    return {'bbox_results': bbox_results, 'mask_results': mask_results}
+
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+    rank, world_size = get_dist_info()
+    # create a tmp dir if it is not specified
+    if tmpdir is None:
+        MAX_LEN = 512
+        # 32 is whitespace
+        dir_tensor = torch.full((MAX_LEN, ),
+                                32,
+                                dtype=torch.uint8,
+                                device='cuda')
+        if rank == 0:
+            mmcv.mkdir_or_exist('.dist_test')
+            tmpdir = tempfile.mkdtemp(dir='.dist_test')
+            tmpdir = torch.tensor(
+                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+            dir_tensor[:len(tmpdir)] = tmpdir
+        dist.broadcast(dir_tensor, 0)
+        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+    else:
+        mmcv.mkdir_or_exist(tmpdir)
+    # dump the part result to the dir
+    mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+    dist.barrier()
+    # collect all parts
+    if rank != 0:
+        return None
+    else:
+        # load results of all parts from tmp dir
+        part_list = []
+        for i in range(world_size):
+            part_file = osp.join(tmpdir, f'part_{i}.pkl')
+            part_list.append(mmcv.load(part_file))
+        # sort the results
+        ordered_results = []
+        '''
+        bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample,
+        '''
+        #for res in zip(*part_list):
+        for res in part_list:  
+            ordered_results.extend(list(res))
+        # the dataloader may pad some samples
+        ordered_results = ordered_results[:size]
+        # remove tmp dir
+        shutil.rmtree(tmpdir)
+        return ordered_results
+
+
+def collect_results_gpu(result_part, size):
+    collect_results_cpu(result_part, size)
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9391e606f29961875b48eebe36d3b9d415b6290
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py
@@ -0,0 +1,67 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+from .mmdet_train import custom_train_detector
+from mmseg.apis import train_segmentor
+from mmdet.apis import train_detector
+
+def custom_train_model(model,
+                dataset,
+                cfg,
+                distributed=False,
+                validate=False,
+                timestamp=None,
+                eval_model=None,
+                meta=None):
+    """A function wrapper for launching model training according to cfg.
+
+    Because we need different eval_hook in runner. Should be deprecated in the
+    future.
+    """
+    if cfg.model.type in ['EncoderDecoder3D']:
+        assert False
+    else:
+        custom_train_detector(
+            model,
+            dataset,
+            cfg,
+            distributed=distributed,
+            validate=validate,
+            timestamp=timestamp,
+            eval_model=eval_model,
+            meta=meta)
+
+
+def train_model(model,
+                dataset,
+                cfg,
+                distributed=False,
+                validate=False,
+                timestamp=None,
+                meta=None):
+    """A function wrapper for launching model training according to cfg.
+
+    Because we need different eval_hook in runner. Should be deprecated in the
+    future.
+    """
+    if cfg.model.type in ['EncoderDecoder3D']:
+        train_segmentor(
+            model,
+            dataset,
+            cfg,
+            distributed=distributed,
+            validate=validate,
+            timestamp=timestamp,
+            meta=meta)
+    else:
+        train_detector(
+            model,
+            dataset,
+            cfg,
+            distributed=distributed,
+            validate=validate,
+            timestamp=timestamp,
+            meta=meta)
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6823adfb593d67f27af4af2207a515af4cbab6f5
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
@@ -0,0 +1 @@
+from .bevformer_head import BEVFormerHead
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..15691fd9040c07fad2c04991d398ff586996597c
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
@@ -0,0 +1,523 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import Linear, bias_init_with_prob
+from mmcv.utils import TORCH_VERSION, digit_version
+
+from mmdet.core import (multi_apply, multi_apply, reduce_mean)
+from mmdet.models.utils.transformer import inverse_sigmoid
+from mmdet.models import HEADS
+from mmdet.models.dense_heads import DETRHead
+from mmdet3d.core.bbox.coders import build_bbox_coder
+from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
+from mmcv.cnn.bricks.transformer import build_positional_encoding
+from mmcv.runner import force_fp32, auto_fp16
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+import numpy as np
+import mmcv
+import cv2 as cv
+from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+
+
+@HEADS.register_module()
+class BEVFormerHead(DETRHead):
+    """Head of Detr3D.
+    Args:
+        with_box_refine (bool): Whether to refine the reference points
+            in the decoder. Defaults to False.
+        as_two_stage (bool) : Whether to generate the proposal from
+            the outputs of encoder.
+        transformer (obj:`ConfigDict`): ConfigDict is used for building
+            the Encoder and Decoder.
+        bev_h, bev_w (int): spatial shape of BEV queries.
+    """
+
+    def __init__(self,
+                 *args,
+                 with_box_refine=False,
+                 as_two_stage=False,
+                 transformer=None,
+                 bbox_coder=None,
+                 num_cls_fcs=2,
+                 code_weights=None,
+                 bev_h=30,
+                 bev_w=30,
+                 **kwargs):
+
+        self.bev_h = bev_h
+        self.bev_w = bev_w
+        self.fp16_enabled = False
+
+        self.with_box_refine = with_box_refine
+        self.as_two_stage = as_two_stage
+        if self.as_two_stage:
+            transformer['as_two_stage'] = self.as_two_stage
+        if 'code_size' in kwargs:
+            self.code_size = kwargs['code_size']
+        else:
+            self.code_size = 10
+        if code_weights is not None:
+            self.code_weights = code_weights
+        else:
+            self.code_weights = [1.0, 1.0, 1.0,
+                                 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.pc_range = self.bbox_coder.pc_range
+        self.real_w = self.pc_range[3] - self.pc_range[0]
+        self.real_h = self.pc_range[4] - self.pc_range[1]
+        self.num_cls_fcs = num_cls_fcs - 1
+        super(BEVFormerHead, self).__init__(
+            *args, transformer=transformer, **kwargs)
+        self.code_weights = nn.Parameter(torch.tensor(
+            self.code_weights, requires_grad=False), requires_grad=False)
+
+    def _init_layers(self):
+        """Initialize classification branch and regression branch of head."""
+        cls_branch = []
+        for _ in range(self.num_reg_fcs):
+            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
+            cls_branch.append(nn.LayerNorm(self.embed_dims))
+            cls_branch.append(nn.ReLU(inplace=True))
+        cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
+        fc_cls = nn.Sequential(*cls_branch)
+
+        reg_branch = []
+        for _ in range(self.num_reg_fcs):
+            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+            reg_branch.append(nn.ReLU())
+        reg_branch.append(Linear(self.embed_dims, self.code_size))
+        reg_branch = nn.Sequential(*reg_branch)
+
+        def _get_clones(module, N):
+            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+        # last reg_branch is used to generate proposal from
+        # encode feature map when as_two_stage is True.
+        num_pred = (self.transformer.decoder.num_layers + 1) if \
+            self.as_two_stage else self.transformer.decoder.num_layers
+
+        if self.with_box_refine:
+            self.cls_branches = _get_clones(fc_cls, num_pred)
+            self.reg_branches = _get_clones(reg_branch, num_pred)
+        else:
+            self.cls_branches = nn.ModuleList(
+                [fc_cls for _ in range(num_pred)])
+            self.reg_branches = nn.ModuleList(
+                [reg_branch for _ in range(num_pred)])
+
+        if not self.as_two_stage:
+            self.bev_embedding = nn.Embedding(
+                self.bev_h * self.bev_w, self.embed_dims)
+            self.query_embedding = nn.Embedding(self.num_query,
+                                                self.embed_dims * 2)
+
+    def init_weights(self):
+        """Initialize weights of the DeformDETR head."""
+        self.transformer.init_weights()
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            for m in self.cls_branches:
+                nn.init.constant_(m[-1].bias, bias_init)
+
+    @auto_fp16(apply_to=('mlvl_feats'))
+    def forward(self, mlvl_feats, img_metas, prev_bev=None,  only_bev=False):
+        """Forward function.
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 5D-tensor with shape
+                (B, N, C, H, W).
+            prev_bev: previous bev featues
+            only_bev: only compute BEV features with encoder. 
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        """
+
+        bs, num_cam, _, _, _ = mlvl_feats[0].shape
+        dtype = mlvl_feats[0].dtype
+        object_query_embeds = self.query_embedding.weight.to(dtype)
+        bev_queries = self.bev_embedding.weight.to(dtype)
+
+        bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
+                               device=bev_queries.device).to(dtype)
+        bev_pos = self.positional_encoding(bev_mask).to(dtype)
+
+        if only_bev:  # only use encoder to obtain BEV features, TODO: refine the workaround
+            return self.transformer.get_bev_features(
+                mlvl_feats,
+                bev_queries,
+                self.bev_h,
+                self.bev_w,
+                grid_length=(self.real_h / self.bev_h,
+                             self.real_w / self.bev_w),
+                bev_pos=bev_pos,
+                img_metas=img_metas,
+                prev_bev=prev_bev,
+            )
+        else:
+            outputs = self.transformer(
+                mlvl_feats,
+                bev_queries,
+                object_query_embeds,
+                self.bev_h,
+                self.bev_w,
+                grid_length=(self.real_h / self.bev_h,
+                             self.real_w / self.bev_w),
+                bev_pos=bev_pos,
+                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
+                cls_branches=self.cls_branches if self.as_two_stage else None,
+                img_metas=img_metas,
+                prev_bev=prev_bev
+        )
+
+        bev_embed, hs, init_reference, inter_references = outputs
+        hs = hs.permute(0, 2, 1, 3)
+        outputs_classes = []
+        outputs_coords = []
+        for lvl in range(hs.shape[0]):
+            if lvl == 0:
+                reference = init_reference
+            else:
+                reference = inter_references[lvl - 1]
+            reference = inverse_sigmoid(reference)
+            outputs_class = self.cls_branches[lvl](hs[lvl])
+            tmp = self.reg_branches[lvl](hs[lvl])
+
+            # TODO: check the shape of reference
+            assert reference.shape[-1] == 3
+            tmp[..., 0:2] += reference[..., 0:2]
+            tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
+            tmp[..., 4:5] += reference[..., 2:3]
+            tmp[..., 4:5] = tmp[..., 4:5].sigmoid()
+            tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] -
+                             self.pc_range[0]) + self.pc_range[0])
+            tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] -
+                             self.pc_range[1]) + self.pc_range[1])
+            tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] -
+                             self.pc_range[2]) + self.pc_range[2])
+
+            # TODO: check if using sigmoid
+            outputs_coord = tmp
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+
+        outputs_classes = torch.stack(outputs_classes)
+        outputs_coords = torch.stack(outputs_coords)
+
+        outs = {
+            'bev_embed': bev_embed,
+            'all_cls_scores': outputs_classes,
+            'all_bbox_preds': outputs_coords,
+            'enc_cls_scores': None,
+            'enc_bbox_preds': None,
+        }
+
+        return outs
+
+    def _get_target_single(self,
+                           cls_score,
+                           bbox_pred,
+                           gt_labels,
+                           gt_bboxes,
+                           gt_bboxes_ignore=None):
+        """"Compute regression and classification targets for one image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_score (Tensor): Box score logits from a single decoder layer
+                for one image. Shape [num_query, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+                for one image, with normalized coordinate (cx, cy, w, h) and
+                shape [num_query, 4].
+            gt_bboxes (Tensor): Ground truth bboxes for one image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth class indices for one image
+                with shape (num_gts, ).
+            gt_bboxes_ignore (Tensor, optional): Bounding boxes
+                which can be ignored. Default None.
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+                - labels (Tensor): Labels of each image.
+                - label_weights (Tensor]): Label weights of each image.
+                - bbox_targets (Tensor): BBox targets of each image.
+                - bbox_weights (Tensor): BBox weights of each image.
+                - pos_inds (Tensor): Sampled positive indices for each image.
+                - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+
+        num_bboxes = bbox_pred.size(0)
+        # assigner and sampler
+        gt_c = gt_bboxes.shape[-1]
+
+        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
+                                             gt_labels, gt_bboxes_ignore)
+
+        sampling_result = self.sampler.sample(assign_result, bbox_pred,
+                                              gt_bboxes)
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label targets
+        labels = gt_bboxes.new_full((num_bboxes,),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c]
+        bbox_weights = torch.zeros_like(bbox_pred)
+        bbox_weights[pos_inds] = 1.0
+
+        # DETR
+        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
+        return (labels, label_weights, bbox_targets, bbox_weights,
+                pos_inds, neg_inds)
+
+    def get_targets(self,
+                    cls_scores_list,
+                    bbox_preds_list,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_bboxes_ignore_list=None):
+        """"Compute regression and classification targets for a batch image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_scores_list (list[Tensor]): Box score logits from a single
+                decoder layer for each image with shape [num_query,
+                cls_out_channels].
+            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+                decoder layer for each image, with normalized coordinate
+                (cx, cy, w, h) and shape [num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            tuple: a tuple containing the following targets.
+                - labels_list (list[Tensor]): Labels for all images.
+                - label_weights_list (list[Tensor]): Label weights for all \
+                    images.
+                - bbox_targets_list (list[Tensor]): BBox targets for all \
+                    images.
+                - bbox_weights_list (list[Tensor]): BBox weights for all \
+                    images.
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+        """
+        assert gt_bboxes_ignore_list is None, \
+            'Only supports for gt_bboxes_ignore setting to None.'
+        num_imgs = len(cls_scores_list)
+        gt_bboxes_ignore_list = [
+            gt_bboxes_ignore_list for _ in range(num_imgs)
+        ]
+
+        (labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
+            self._get_target_single, cls_scores_list, bbox_preds_list,
+            gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list)
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, num_total_pos, num_total_neg)
+
+    def loss_single(self,
+                    cls_scores,
+                    bbox_preds,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_bboxes_ignore_list=None):
+        """"Loss function for outputs from a single decoder layer of a single
+        feature level.
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images. Shape [bs, num_query, cls_out_channels].
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape [bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components for outputs from
+                a single decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+                                           gt_bboxes_list, gt_labels_list,
+                                           gt_bboxes_ignore_list)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+
+        cls_avg_factor = max(cls_avg_factor, 1)
+        loss_cls = self.loss_cls(
+            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes accross all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # regression L1 loss
+        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range)
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+
+        loss_bbox = self.loss_bbox(
+            bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan,
+                                                               :10], bbox_weights[isnotnan, :10],
+            avg_factor=num_total_pos)
+        if digit_version(TORCH_VERSION) >= digit_version('1.8'):
+            loss_cls = torch.nan_to_num(loss_cls)
+            loss_bbox = torch.nan_to_num(loss_bbox)
+        return loss_cls, loss_bbox
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def loss(self,
+             gt_bboxes_list,
+             gt_labels_list,
+             preds_dicts,
+             gt_bboxes_ignore=None,
+             img_metas=None):
+        """"Loss function.
+        Args:
+
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            preds_dicts:
+                all_cls_scores (Tensor): Classification score of all
+                    decoder layers, has shape
+                    [nb_dec, bs, num_query, cls_out_channels].
+                all_bbox_preds (Tensor): Sigmoid regression
+                    outputs of all decode layers. Each is a 4D-tensor with
+                    normalized coordinate format (cx, cy, w, h) and shape
+                    [nb_dec, bs, num_query, 4].
+                enc_cls_scores (Tensor): Classification scores of
+                    points on encode feature map , has shape
+                    (N, h*w, num_classes). Only be passed when as_two_stage is
+                    True, otherwise is None.
+                enc_bbox_preds (Tensor): Regression results of each points
+                    on the encode feature map, has shape (N, h*w, 4). Only be
+                    passed when as_two_stage is True, otherwise is None.
+            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+                which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert gt_bboxes_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'for gt_bboxes_ignore setting to None.'
+
+        all_cls_scores = preds_dicts['all_cls_scores']
+        all_bbox_preds = preds_dicts['all_bbox_preds']
+        enc_cls_scores = preds_dicts['enc_cls_scores']
+        enc_bbox_preds = preds_dicts['enc_bbox_preds']
+
+        num_dec_layers = len(all_cls_scores)
+        device = gt_labels_list[0].device
+
+        gt_bboxes_list = [torch.cat(
+            (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+            dim=1).to(device) for gt_bboxes in gt_bboxes_list]
+
+        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+        all_gt_bboxes_ignore_list = [
+            gt_bboxes_ignore for _ in range(num_dec_layers)
+        ]
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single, all_cls_scores, all_bbox_preds,
+            all_gt_bboxes_list, all_gt_labels_list,
+            all_gt_bboxes_ignore_list)
+
+        loss_dict = dict()
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            binary_labels_list = [
+                torch.zeros_like(gt_labels_list[i])
+                for i in range(len(all_gt_labels_list))
+            ]
+            enc_loss_cls, enc_losses_bbox = \
+                self.loss_single(enc_cls_scores, enc_bbox_preds,
+                                 gt_bboxes_list, binary_labels_list, gt_bboxes_ignore)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1],
+                                           losses_bbox[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            num_dec_layer += 1
+        return loss_dict
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
+        """Generate bboxes from bbox head predictions.
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results.
+            img_metas (list[dict]): Point cloud and image's meta info.
+        Returns:
+            list[dict]: Decoded bbox, scores and labels after nms.
+        """
+
+        preds_dicts = self.bbox_coder.decode(preds_dicts)
+
+        num_samples = len(preds_dicts)
+        ret_list = []
+        for i in range(num_samples):
+            preds = preds_dicts[i]
+            bboxes = preds['bboxes']
+
+            bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+
+            code_size = bboxes.shape[-1]
+            bboxes = img_metas[i]['box_type_3d'](bboxes, code_size)
+            scores = preds['scores']
+            labels = preds['labels']
+
+            ret_list.append([bboxes, scores, labels])
+
+        return ret_list
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..78146f0642d52d604a433ae2eab6866fee23ab3e
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py
@@ -0,0 +1,2 @@
+from .bevformer import BEVFormer
+from .bevformer_fp16 import BEVFormer_fp16
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea1e0743dc994529dbf0ef0523e28d616cba6236
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py
@@ -0,0 +1,289 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+from tkinter.messagebox import NO
+import torch
+from mmcv.runner import force_fp32, auto_fp16
+from mmdet.models import DETECTORS
+from mmdet3d.core import bbox3d2result
+from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask
+import time
+import copy
+import numpy as np
+import mmdet3d
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+
+
+@DETECTORS.register_module()
+class BEVFormer(MVXTwoStageDetector):
+    """BEVFormer.
+    Args:
+        video_test_mode (bool): Decide whether to use temporal information during inference.
+    """
+
+    def __init__(self,
+                 use_grid_mask=False,
+                 pts_voxel_layer=None,
+                 pts_voxel_encoder=None,
+                 pts_middle_encoder=None,
+                 pts_fusion_layer=None,
+                 img_backbone=None,
+                 pts_backbone=None,
+                 img_neck=None,
+                 pts_neck=None,
+                 pts_bbox_head=None,
+                 img_roi_head=None,
+                 img_rpn_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 video_test_mode=False
+                 ):
+
+        super(BEVFormer,
+              self).__init__(pts_voxel_layer, pts_voxel_encoder,
+                             pts_middle_encoder, pts_fusion_layer,
+                             img_backbone, pts_backbone, img_neck, pts_neck,
+                             pts_bbox_head, img_roi_head, img_rpn_head,
+                             train_cfg, test_cfg, pretrained)
+        self.grid_mask = GridMask(
+            True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
+        self.use_grid_mask = use_grid_mask
+        self.fp16_enabled = False
+
+        # temporal
+        self.video_test_mode = video_test_mode
+        self.prev_frame_info = {
+            'prev_bev': None,
+            'scene_token': None,
+            'prev_pos': 0,
+            'prev_angle': 0,
+        }
+
+
+    def extract_img_feat(self, img, img_metas, len_queue=None):
+        """Extract features of images."""
+        B = img.size(0)
+        if img is not None:
+            
+            # input_shape = img.shape[-2:]
+            # # update real input shape of each single img
+            # for img_meta in img_metas:
+            #     img_meta.update(input_shape=input_shape)
+
+            if img.dim() == 5 and img.size(0) == 1:
+                img.squeeze_()
+            elif img.dim() == 5 and img.size(0) > 1:
+                B, N, C, H, W = img.size()
+                img = img.reshape(B * N, C, H, W)
+            if self.use_grid_mask:
+                img = self.grid_mask(img)
+
+            img_feats = self.img_backbone(img)
+            if isinstance(img_feats, dict):
+                img_feats = list(img_feats.values())
+        else:
+            return None
+        if self.with_img_neck:
+            img_feats = self.img_neck(img_feats)
+
+        img_feats_reshaped = []
+        for img_feat in img_feats:
+            BN, C, H, W = img_feat.size()
+            if len_queue is not None:
+                img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W))
+            else:
+                img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
+        return img_feats_reshaped
+
+    @auto_fp16(apply_to=('img'))
+    def extract_feat(self, img, img_metas=None, len_queue=None):
+        """Extract features from images and points."""
+
+        img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue)
+        
+        return img_feats
+
+
+    def forward_pts_train(self,
+                          pts_feats,
+                          gt_bboxes_3d,
+                          gt_labels_3d,
+                          img_metas,
+                          gt_bboxes_ignore=None,
+                          prev_bev=None):
+        """Forward function'
+        Args:
+            pts_feats (list[torch.Tensor]): Features of point cloud branch
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
+                boxes for each sample.
+            gt_labels_3d (list[torch.Tensor]): Ground truth labels for
+                boxes of each sampole
+            img_metas (list[dict]): Meta information of samples.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                boxes to be ignored. Defaults to None.
+            prev_bev (torch.Tensor, optional): BEV features of previous frame.
+        Returns:
+            dict: Losses of each branch.
+        """
+
+        outs = self.pts_bbox_head(
+            pts_feats, img_metas, prev_bev)
+        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
+        losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas)
+        return losses
+
+    def forward_dummy(self, img):
+        dummy_metas = None
+        return self.forward_test(img=img, img_metas=[[dummy_metas]])
+
+    def forward(self, return_loss=True, **kwargs):
+        """Calls either forward_train or forward_test depending on whether
+        return_loss=True.
+        Note this setting will change the expected inputs. When
+        `return_loss=True`, img and img_metas are single-nested (i.e.
+        torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
+        img_metas should be double nested (i.e.  list[torch.Tensor],
+        list[list[dict]]), with the outer list indicating test time
+        augmentations.
+        """
+        if return_loss:
+            return self.forward_train(**kwargs)
+        else:
+            return self.forward_test(**kwargs)
+    
+    def obtain_history_bev(self, imgs_queue, img_metas_list):
+        """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated.
+        """
+        self.eval()
+
+        with torch.no_grad():
+            prev_bev = None
+            bs, len_queue, num_cams, C, H, W = imgs_queue.shape
+            imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W)
+            img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue)
+            for i in range(len_queue):
+                img_metas = [each[i] for each in img_metas_list]
+                # img_feats = self.extract_feat(img=img, img_metas=img_metas)
+                img_feats = [each_scale[:, i] for each_scale in img_feats_list]
+                prev_bev = self.pts_bbox_head(
+                    img_feats, img_metas, prev_bev, only_bev=True)
+            self.train()
+            return prev_bev
+
+    @auto_fp16(apply_to=('img', 'points'))
+    def forward_train(self,
+                      points=None,
+                      img_metas=None,
+                      gt_bboxes_3d=None,
+                      gt_labels_3d=None,
+                      gt_labels=None,
+                      gt_bboxes=None,
+                      img=None,
+                      proposals=None,
+                      gt_bboxes_ignore=None,
+                      img_depth=None,
+                      img_mask=None,
+                      ):
+        """Forward training function.
+        Args:
+            points (list[torch.Tensor], optional): Points of each sample.
+                Defaults to None.
+            img_metas (list[dict], optional): Meta information of each sample.
+                Defaults to None.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
+                Ground truth 3D boxes. Defaults to None.
+            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
+                of 3D boxes. Defaults to None.
+            gt_labels (list[torch.Tensor], optional): Ground truth labels
+                of 2D boxes in images. Defaults to None.
+            gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
+                images. Defaults to None.
+            img (torch.Tensor optional): Images of each sample with shape
+                (N, C, H, W). Defaults to None.
+            proposals ([list[torch.Tensor], optional): Predicted proposals
+                used for training Fast RCNN. Defaults to None.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                2D boxes in images to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of different branches.
+        """
+        
+        len_queue = img.size(1)
+        prev_img = img[:, :-1, ...]
+        img = img[:, -1, ...]
+
+        prev_img_metas = copy.deepcopy(img_metas)
+        prev_bev = self.obtain_history_bev(prev_img, prev_img_metas)
+
+        img_metas = [each[len_queue-1] for each in img_metas]
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+        losses = dict()
+        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
+                                            gt_labels_3d, img_metas,
+                                            gt_bboxes_ignore, prev_bev)
+
+        losses.update(losses_pts)
+        return losses
+
+    def forward_test(self, img_metas, img=None, **kwargs):
+        for var, name in [(img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError('{} must be a list, but got {}'.format(
+                    name, type(var)))
+        img = [img] if img is None else img
+
+        if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']:
+            # the first sample of each scene is truncated
+            self.prev_frame_info['prev_bev'] = None
+        # update idx
+        self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token']
+
+        # do not use temporal information
+        if not self.video_test_mode:
+            self.prev_frame_info['prev_bev'] = None
+
+        # Get the delta of ego position and angle between two timestamps.
+        tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3])
+        tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1])
+        if self.prev_frame_info['prev_bev'] is not None:
+            img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos']
+            img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle']
+        else:
+            img_metas[0][0]['can_bus'][-1] = 0
+            img_metas[0][0]['can_bus'][:3] = 0
+
+        new_prev_bev, bbox_results = self.simple_test(
+            img_metas[0], img[0], prev_bev=self.prev_frame_info['prev_bev'], **kwargs)
+        # During inference, we save the BEV features and ego motion of each timestamp.
+        self.prev_frame_info['prev_pos'] = tmp_pos
+        self.prev_frame_info['prev_angle'] = tmp_angle
+        self.prev_frame_info['prev_bev'] = new_prev_bev
+        return bbox_results
+
+    def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False):
+        """Test function"""
+        outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev)
+
+        bbox_list = self.pts_bbox_head.get_bboxes(
+            outs, img_metas, rescale=rescale)
+        bbox_results = [
+            bbox3d2result(bboxes, scores, labels)
+            for bboxes, scores, labels in bbox_list
+        ]
+        return outs['bev_embed'], bbox_results
+
+    def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False):
+        """Test function without augmentaiton."""
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+
+        bbox_list = [dict() for i in range(len(img_metas))]
+        new_prev_bev, bbox_pts = self.simple_test_pts(
+            img_feats, img_metas, prev_bev, rescale=rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+        return new_prev_bev, bbox_list
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f0628168b4c5f0143d159d0f606d87cc5ee0cf5
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py
@@ -0,0 +1,89 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+from tkinter.messagebox import NO
+import torch
+from mmcv.runner import force_fp32, auto_fp16
+from mmdet.models import DETECTORS
+from mmdet3d.core import bbox3d2result
+from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask
+from projects.mmdet3d_plugin.bevformer.detectors.bevformer import BEVFormer
+import time
+import copy
+import numpy as np
+import mmdet3d
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+
+
+@DETECTORS.register_module()
+class BEVFormer_fp16(BEVFormer):
+    """
+    The default version BEVFormer currently can not support FP16. 
+    We provide this version to resolve this issue.
+    """
+    
+    @auto_fp16(apply_to=('img', 'prev_bev', 'points'))
+    def forward_train(self,
+                      points=None,
+                      img_metas=None,
+                      gt_bboxes_3d=None,
+                      gt_labels_3d=None,
+                      gt_labels=None,
+                      gt_bboxes=None,
+                      img=None,
+                      proposals=None,
+                      gt_bboxes_ignore=None,
+                      img_depth=None,
+                      img_mask=None,
+                      prev_bev=None,
+                      ):
+        """Forward training function.
+        Args:
+            points (list[torch.Tensor], optional): Points of each sample.
+                Defaults to None.
+            img_metas (list[dict], optional): Meta information of each sample.
+                Defaults to None.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
+                Ground truth 3D boxes. Defaults to None.
+            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
+                of 3D boxes. Defaults to None.
+            gt_labels (list[torch.Tensor], optional): Ground truth labels
+                of 2D boxes in images. Defaults to None.
+            gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
+                images. Defaults to None.
+            img (torch.Tensor optional): Images of each sample with shape
+                (N, C, H, W). Defaults to None.
+            proposals ([list[torch.Tensor], optional): Predicted proposals
+                used for training Fast RCNN. Defaults to None.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                2D boxes in images to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of different branches.
+        """
+        
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+
+        losses = dict()
+        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
+                                            gt_labels_3d, img_metas,
+                                            gt_bboxes_ignore, prev_bev=prev_bev)
+        losses.update(losses_pts)
+        return losses
+
+
+    def val_step(self, data, optimizer):
+        """
+        In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`.
+        This is not the standard function of `val_step`.
+        """
+
+        img = data['img']
+        img_metas = data['img_metas']
+        img_feats = self.extract_feat(img=img,  img_metas=img_metas)
+        prev_bev = data.get('prev_bev', None)
+        prev_bev = self.pts_bbox_head(img_feats, img_metas, prev_bev=prev_bev, only_bev=True)
+        return prev_bev
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa04ec16df5b0bb9f21cadf22f9172c3cc9a58c1
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py
@@ -0,0 +1 @@
+from .custom_hooks import TransferWeight
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..a98ad1cb8d57c41c7145f93037d30e9fb9444265
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
@@ -0,0 +1,14 @@
+from mmcv.runner.hooks.hook import HOOKS, Hook
+from projects.mmdet3d_plugin.models.utils import run_time
+
+
+@HOOKS.register_module()
+class TransferWeight(Hook):
+    
+    def __init__(self, every_n_inters=1):
+        self.every_n_inters=every_n_inters
+
+    def after_train_iter(self, runner):
+        if self.every_n_inner_iters(runner, self.every_n_inters):
+            runner.eval_model.load_state_dict(runner.model.state_dict())
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..09d1e8f30d095ef910cde4a3dff15f1bb8b45252
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py
@@ -0,0 +1,7 @@
+from .transformer import PerceptionTransformer
+from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D, MSIPM3D
+from .temporal_self_attention import TemporalSelfAttention
+from .encoder import BEVFormerEncoder, BEVFormerLayer
+from .decoder import DetectionTransformerDecoder
+
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f6bdeec795e073933fd8834f4bac984764a841d
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py
@@ -0,0 +1,487 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+import copy
+import warnings
+
+import torch
+import torch.nn as nn
+
+from mmcv import ConfigDict, deprecated_api_warning
+from mmcv.cnn import Linear, build_activation_layer, build_norm_layer
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+
+from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
+                                      TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)
+
+# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
+try:
+    from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention  # noqa F401
+    warnings.warn(
+        ImportWarning(
+            '``MultiScaleDeformableAttention`` has been moved to '
+            '``mmcv.ops.multi_scale_deform_attn``, please change original path '  # noqa E501
+            '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` '  # noqa E501
+            'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` '  # noqa E501
+        ))
+except ImportError:
+    warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
+                  '``mmcv.ops.multi_scale_deform_attn``, '
+                  'You should install ``mmcv-full`` if you need this module. ')
+from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention
+
+
+@TRANSFORMER_LAYER.register_module()
+class MyCustomBaseTransformerLayer(BaseModule):
+    """Base `TransformerLayer` for vision transformer.
+    It can be built from `mmcv.ConfigDict` and support more flexible
+    customization, for example, using any number of `FFN or LN ` and
+    use different kinds of `attention` by specifying a list of `ConfigDict`
+    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
+    when you specifying `norm` as the first element of `operation_order`.
+    More details about the `prenorm`: `On Layer Normalization in the
+    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+            Configs for `self_attention` or `cross_attention` modules,
+            The order of the configs in the list should be consistent with
+            corresponding attentions in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config. Default: None.
+        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+            Configs for FFN, The order of the configs in the list should be
+            consistent with corresponding ffn in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config.
+        operation_order (tuple[str]): The execution order of operation
+            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Support `prenorm` when you specifying first element as `norm`.
+            Default：None.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        batch_first (bool): Key, Query and Value are shape
+            of (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+    """
+
+    def __init__(self,
+                 attn_cfgs=None,
+                 ffn_cfgs=dict(
+                     type='FFN',
+                     embed_dims=256,
+                     feedforward_channels=1024,
+                     num_fcs=2,
+                     ffn_drop=0.,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                 ),
+                 operation_order=None,
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None,
+                 batch_first=True,
+                 **kwargs):
+
+        deprecated_args = dict(
+            feedforward_channels='feedforward_channels',
+            ffn_dropout='ffn_drop',
+            ffn_num_fcs='num_fcs')
+        for ori_name, new_name in deprecated_args.items():
+            if ori_name in kwargs:
+                warnings.warn(
+                    f'The arguments `{ori_name}` in BaseTransformerLayer '
+                    f'has been deprecated, now you should set `{new_name}` '
+                    f'and other FFN related arguments '
+                    f'to a dict named `ffn_cfgs`. ')
+                ffn_cfgs[new_name] = kwargs[ori_name]
+
+        super(MyCustomBaseTransformerLayer, self).__init__(init_cfg)
+
+        self.batch_first = batch_first
+
+        assert set(operation_order) & set(
+            ['self_attn', 'norm', 'ffn', 'cross_attn']) == \
+            set(operation_order), f'The operation_order of' \
+            f' {self.__class__.__name__} should ' \
+            f'contains all four operation type ' \
+            f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"
+
+        num_attn = operation_order.count('self_attn') + operation_order.count(
+            'cross_attn')
+        if isinstance(attn_cfgs, dict):
+            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
+        else:
+            assert num_attn == len(attn_cfgs), f'The length ' \
+                f'of attn_cfg {num_attn} is ' \
+                f'not consistent with the number of attention' \
+                f'in operation_order {operation_order}.'
+
+        self.num_attn = num_attn
+        self.operation_order = operation_order
+        self.norm_cfg = norm_cfg
+        self.pre_norm = operation_order[0] == 'norm'
+        self.attentions = ModuleList()
+
+        index = 0
+        for operation_name in operation_order:
+            if operation_name in ['self_attn', 'cross_attn']:
+                if 'batch_first' in attn_cfgs[index]:
+                    assert self.batch_first == attn_cfgs[index]['batch_first']
+                else:
+                    attn_cfgs[index]['batch_first'] = self.batch_first
+                attention = build_attention(attn_cfgs[index])
+                # Some custom attentions used as `self_attn`
+                # or `cross_attn` can have different behavior.
+                attention.operation_name = operation_name
+                self.attentions.append(attention)
+                index += 1
+
+        self.embed_dims = self.attentions[0].embed_dims
+
+        self.ffns = ModuleList()
+        num_ffns = operation_order.count('ffn')
+        if isinstance(ffn_cfgs, dict):
+            ffn_cfgs = ConfigDict(ffn_cfgs)
+        if isinstance(ffn_cfgs, dict):
+            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
+        assert len(ffn_cfgs) == num_ffns
+        for ffn_index in range(num_ffns):
+            if 'embed_dims' not in ffn_cfgs[ffn_index]:
+                ffn_cfgs['embed_dims'] = self.embed_dims
+            else:
+                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
+
+            self.ffns.append(
+                build_feedforward_network(ffn_cfgs[ffn_index]))
+
+        self.norms = ModuleList()
+        num_norms = operation_order.count('norm')
+        for _ in range(num_norms):
+            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                query_pos=None,
+                key_pos=None,
+                attn_masks=None,
+                query_key_padding_mask=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `TransformerDecoderLayer`.
+        **kwargs contains some specific arguments of attentions.
+        Args:
+            query (Tensor): The input query with shape
+                [num_queries, bs, embed_dims] if
+                self.batch_first is False, else
+                [bs, num_queries embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims] .
+            value (Tensor): The value tensor with same shape as `key`.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`.
+                Default: None.
+            attn_masks (List[Tensor] | None): 2D Tensor used in
+                calculation of corresponding attention. The length of
+                it should equal to the number of `attention` in
+                `operation_order`. Default: None.
+            query_key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_queries]. Only used in `self_attn` layer.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_keys]. Default: None.
+        Returns:
+            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
+        """
+
+        norm_index = 0
+        attn_index = 0
+        ffn_index = 0
+        identity = query
+        if attn_masks is None:
+            attn_masks = [None for _ in range(self.num_attn)]
+        elif isinstance(attn_masks, torch.Tensor):
+            attn_masks = [
+                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
+            ]
+            warnings.warn(f'Use same attn_mask in all attentions in '
+                          f'{self.__class__.__name__} ')
+        else:
+            assert len(attn_masks) == self.num_attn, f'The length of ' \
+                f'attn_masks {len(attn_masks)} must be equal ' \
+                f'to the number of attention in ' \
+                f'operation_order {self.num_attn}'
+
+        for layer in self.operation_order:
+            if layer == 'self_attn':
+                temp_key = temp_value = query
+                query = self.attentions[attn_index](
+                    query,
+                    temp_key,
+                    temp_value,
+                    identity if self.pre_norm else None,
+                    query_pos=query_pos,
+                    key_pos=query_pos,
+                    attn_mask=attn_masks[attn_index],
+                    key_padding_mask=query_key_padding_mask,
+                    **kwargs)
+                attn_index += 1
+                identity = query
+
+            elif layer == 'norm':
+                query = self.norms[norm_index](query)
+                norm_index += 1
+
+            elif layer == 'cross_attn':
+                query = self.attentions[attn_index](
+                    query,
+                    key,
+                    value,
+                    identity if self.pre_norm else None,
+                    query_pos=query_pos,
+                    key_pos=key_pos,
+                    attn_mask=attn_masks[attn_index],
+                    key_padding_mask=key_padding_mask,
+                    **kwargs)
+                attn_index += 1
+                identity = query
+
+            elif layer == 'ffn':
+                query = self.ffns[ffn_index](
+                    query, identity if self.pre_norm else None)
+                ffn_index += 1
+
+        return query
+
+
+
+@TRANSFORMER_LAYER.register_module()
+class MyCustomBaseTransformerLayerWithoutSelfAttn(BaseModule):
+    """Base `TransformerLayer` for vision transformer.
+    It can be built from `mmcv.ConfigDict` and support more flexible
+    customization, for example, using any number of `FFN or LN ` and
+    use different kinds of `attention` by specifying a list of `ConfigDict`
+    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
+    when you specifying `norm` as the first element of `operation_order`.
+    More details about the `prenorm`: `On Layer Normalization in the
+    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+            Configs for `self_attention` or `cross_attention` modules,
+            The order of the configs in the list should be consistent with
+            corresponding attentions in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config. Default: None.
+        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+            Configs for FFN, The order of the configs in the list should be
+            consistent with corresponding ffn in operation_order.
+            If it is a dict, all of the attention modules in operation_order
+            will be built with this config.
+        operation_order (tuple[str]): The execution order of operation
+            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Support `prenorm` when you specifying first element as `norm`.
+            Default：None.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        batch_first (bool): Key, Query and Value are shape
+            of (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+    """
+
+    def __init__(self,
+                 attn_cfgs=None,
+                 ffn_cfgs=dict(
+                     type='FFN',
+                     embed_dims=256,
+                     feedforward_channels=1024,
+                     num_fcs=2,
+                     ffn_drop=0.,
+                     act_cfg=dict(type='ReLU', inplace=True),
+                 ),
+                 operation_order=None,
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None,
+                 batch_first=True,
+                 **kwargs):
+
+        deprecated_args = dict(
+            feedforward_channels='feedforward_channels',
+            ffn_dropout='ffn_drop',
+            ffn_num_fcs='num_fcs')
+        for ori_name, new_name in deprecated_args.items():
+            if ori_name in kwargs:
+                warnings.warn(
+                    f'The arguments `{ori_name}` in BaseTransformerLayer '
+                    f'has been deprecated, now you should set `{new_name}` '
+                    f'and other FFN related arguments '
+                    f'to a dict named `ffn_cfgs`. ')
+                ffn_cfgs[new_name] = kwargs[ori_name]
+
+        super(MyCustomBaseTransformerLayerWithoutSelfAttn, self).__init__(init_cfg)
+
+        self.batch_first = batch_first
+
+        assert set(operation_order) & set(
+            ['norm', 'ffn', 'cross_attn']) == \
+            set(operation_order), f'The operation_order of' \
+            f' {self.__class__.__name__} should ' \
+            f'contains all three operation type ' \
+            f"{['norm', 'ffn', 'cross_attn']}"
+
+        num_attn = operation_order.count(
+            'cross_attn')
+        if isinstance(attn_cfgs, dict):
+            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
+        else:
+            assert num_attn == len(attn_cfgs), f'The length ' \
+                f'of attn_cfg {num_attn} is ' \
+                f'not consistent with the number of attention' \
+                f'in operation_order {operation_order}.'
+
+        self.num_attn = num_attn
+        self.operation_order = operation_order
+        self.norm_cfg = norm_cfg
+        self.pre_norm = operation_order[0] == 'norm'
+        self.attentions = ModuleList()
+
+        index = 0
+        for operation_name in operation_order:
+            if operation_name in ['self_attn', 'cross_attn']:
+                if 'batch_first' in attn_cfgs[index]:
+                    assert self.batch_first == attn_cfgs[index]['batch_first']
+                else:
+                    attn_cfgs[index]['batch_first'] = self.batch_first
+                attention = build_attention(attn_cfgs[index])
+                # Some custom attentions used as `self_attn`
+                # or `cross_attn` can have different behavior.
+                attention.operation_name = operation_name
+                self.attentions.append(attention)
+                index += 1
+
+        self.embed_dims = self.attentions[0].embed_dims
+
+        self.ffns = ModuleList()
+        num_ffns = operation_order.count('ffn')
+        if isinstance(ffn_cfgs, dict):
+            ffn_cfgs = ConfigDict(ffn_cfgs)
+        if isinstance(ffn_cfgs, dict):
+            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
+        assert len(ffn_cfgs) == num_ffns
+        for ffn_index in range(num_ffns):
+            if 'embed_dims' not in ffn_cfgs[ffn_index]:
+                ffn_cfgs['embed_dims'] = self.embed_dims
+            else:
+                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
+
+            self.ffns.append(
+                build_feedforward_network(ffn_cfgs[ffn_index]))
+
+        self.norms = ModuleList()
+        num_norms = operation_order.count('norm')
+        for _ in range(num_norms):
+            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                query_pos=None,
+                key_pos=None,
+                attn_masks=None,
+                query_key_padding_mask=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `TransformerDecoderLayer`.
+        **kwargs contains some specific arguments of attentions.
+        Args:
+            query (Tensor): The input query with shape
+                [num_queries, bs, embed_dims] if
+                self.batch_first is False, else
+                [bs, num_queries embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims] .
+            value (Tensor): The value tensor with same shape as `key`.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`.
+                Default: None.
+            attn_masks (List[Tensor] | None): 2D Tensor used in
+                calculation of corresponding attention. The length of
+                it should equal to the number of `attention` in
+                `operation_order`. Default: None.
+            query_key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_queries]. Only used in `self_attn` layer.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_keys]. Default: None.
+        Returns:
+            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
+        """
+
+        norm_index = 0
+        attn_index = 0
+        ffn_index = 0
+        identity = query
+        if attn_masks is None:
+            attn_masks = [None for _ in range(self.num_attn)]
+        elif isinstance(attn_masks, torch.Tensor):
+            attn_masks = [
+                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
+            ]
+            warnings.warn(f'Use same attn_mask in all attentions in '
+                          f'{self.__class__.__name__} ')
+        else:
+            assert len(attn_masks) == self.num_attn, f'The length of ' \
+                f'attn_masks {len(attn_masks)} must be equal ' \
+                f'to the number of attention in ' \
+                f'operation_order {self.num_attn}'
+
+        for layer in self.operation_order:
+            if layer == 'self_attn':
+                temp_key = temp_value = query
+                query = self.attentions[attn_index](
+                    query,
+                    temp_key,
+                    temp_value,
+                    identity if self.pre_norm else None,
+                    query_pos=query_pos,
+                    key_pos=query_pos,
+                    attn_mask=attn_masks[attn_index],
+                    key_padding_mask=query_key_padding_mask,
+                    **kwargs)
+                attn_index += 1
+                identity = query
+
+            elif layer == 'norm':
+                query = self.norms[norm_index](query)
+                norm_index += 1
+
+            elif layer == 'cross_attn':
+                query = self.attentions[attn_index](
+                    query,
+                    key,
+                    value,
+                    identity if self.pre_norm else None,
+                    query_pos=query_pos,
+                    key_pos=key_pos,
+                    attn_mask=attn_masks[attn_index],
+                    key_padding_mask=key_padding_mask,
+                    **kwargs)
+                attn_index += 1
+                identity = query
+
+            elif layer == 'ffn':
+                query = self.ffns[ffn_index](
+                    query, identity if self.pre_norm else None)
+                ffn_index += 1
+
+        return query
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..236d33c92cfd39f2a5c00e88b2dc90e17425b6e4
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py
@@ -0,0 +1,341 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
+import mmcv
+import cv2 as cv
+import copy
+import warnings
+from matplotlib import pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import xavier_init, constant_init
+from mmcv.cnn.bricks.registry import (ATTENTION,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+from mmcv.cnn.bricks.transformer import TransformerLayerSequence
+import math
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
+                        to_2tuple)
+
+from mmcv.utils import ext_loader
+# from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \
+#     MultiScaleDeformableAttnFunction_fp16
+    
+from mx_driving.fused import npu_multi_scale_deformable_attn_function #3
+
+ext_module = ext_loader.load_ext(
+    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    """Inverse function of sigmoid.
+    Args:
+        x (Tensor): The tensor to do the
+            inverse.
+        eps (float): EPS avoid numerical
+            overflow. Defaults 1e-5.
+    Returns:
+        Tensor: The x has passed the inverse
+            function of sigmoid, has same
+            shape with input.
+    """
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DetectionTransformerDecoder(TransformerLayerSequence):
+    """Implements the decoder in DETR3D transformer.
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        coder_norm_cfg (dict): Config of last normalization layer. Default：
+            `LN`.
+    """
+
+    def __init__(self, *args, return_intermediate=False, **kwargs):
+        super(DetectionTransformerDecoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+        self.fp16_enabled = False
+
+    def forward(self,
+                query,
+                *args,
+                reference_points=None,
+                reg_branches=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `Detr3DTransformerDecoder`.
+        Args:
+            query (Tensor): Input query with shape
+                `(num_query, bs, embed_dims)`.
+            reference_points (Tensor): The reference
+                points of offset. has shape
+                (bs, num_query, 4) when as_two_stage,
+                otherwise has shape ((bs, num_query, 2).
+            reg_branch: (obj:`nn.ModuleList`): Used for
+                refining the regression results. Only would
+                be passed when with_box_refine is True,
+                otherwise would be passed a `None`.
+        Returns:
+            Tensor: Results with shape [1, num_query, bs, embed_dims] when
+                return_intermediate is `False`, otherwise it has shape
+                [num_layers, num_query, bs, embed_dims].
+        """
+        output = query
+        intermediate = []
+        intermediate_reference_points = []
+        for lid, layer in enumerate(self.layers):
+
+            reference_points_input = reference_points[..., :2].unsqueeze(
+                2)  # BS NUM_QUERY NUM_LEVEL 2
+            output = layer(
+                output,
+                *args,
+                reference_points=reference_points_input,
+                key_padding_mask=key_padding_mask,
+                **kwargs)
+            output = output.permute(1, 0, 2)
+
+            if reg_branches is not None:
+                tmp = reg_branches[lid](output)
+
+                assert reference_points.shape[-1] == 3
+
+                new_reference_points = torch.zeros_like(reference_points)
+                new_reference_points[..., :2] = tmp[
+                    ..., :2] + inverse_sigmoid(reference_points[..., :2])
+                new_reference_points[..., 2:3] = tmp[
+                    ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3])
+
+                new_reference_points = new_reference_points.sigmoid()
+
+                reference_points = new_reference_points.detach()
+
+            output = output.permute(1, 0, 2)
+            if self.return_intermediate:
+                intermediate.append(output)
+                intermediate_reference_points.append(reference_points)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(
+                intermediate_reference_points)
+
+        return output, reference_points
+
+
+@ATTENTION.register_module()
+class CustomMSDeformableAttention(BaseModule):
+    """An attention module used in Deformable-Detr.
+
+    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
+    <https://arxiv.org/pdf/2010.04159.pdf>`_.
+
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 64.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): A Dropout layer on `inp_identity`.
+            Default: 0.1.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=4,
+                 im2col_step=64,
+                 dropout=0.1,
+                 batch_first=False,
+                 norm_cfg=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.dropout = nn.Dropout(dropout)
+        self.batch_first = batch_first
+        self.fp16_enabled = False
+
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        self.num_levels = num_levels
+        self.num_heads = num_heads
+        self.num_points = num_points
+        self.sampling_offsets = nn.Linear(
+            embed_dims, num_heads * num_levels * num_points * 2)
+        self.attention_weights = nn.Linear(embed_dims,
+                                           num_heads * num_levels * num_points)
+        self.value_proj = nn.Linear(embed_dims, embed_dims)
+        self.output_proj = nn.Linear(embed_dims, embed_dims)
+        self.init_weights()
+
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        constant_init(self.sampling_offsets, 0.)
+        thetas = torch.arange(
+            self.num_heads,
+            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init /
+                     grid_init.abs().max(-1, keepdim=True)[0]).view(
+            self.num_heads, 1, 1,
+            2).repeat(1, self.num_levels, self.num_points, 1)
+        for i in range(self.num_points):
+            grid_init[:, :, i, :] *= i + 1
+
+        self.sampling_offsets.bias.data = grid_init.view(-1)
+        constant_init(self.attention_weights, val=0., bias=0.)
+        xavier_init(self.value_proj, distribution='uniform', bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+        self._is_init = True
+
+    @deprecated_api_warning({'residual': 'identity'},
+                            cls_name='MultiScaleDeformableAttention')
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                flag='decoder',
+                **kwargs):
+        """Forward Function of MultiScaleDeformAttention.
+
+        Args:
+            query (Tensor): Query of Transformer with shape
+                (num_query, bs, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(num_key, bs, embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(num_key, bs, embed_dims)`.
+            identity (Tensor): The tensor used for addition, with the
+                same shape as `query`. Default None. If None,
+                `query` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`. Default
+                None.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_key].
+            spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if value is None:
+            value = query
+
+        if identity is None:
+            identity = query
+        if query_pos is not None:
+            query = query + query_pos
+        if not self.batch_first:
+            # change to (bs, num_query ,embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+
+        bs, num_query, _ = query.shape
+        bs, num_value, _ = value.shape
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+
+        value = self.value_proj(value)
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], 0.0)
+        value = value.view(bs, num_value, self.num_heads, -1)
+
+        sampling_offsets = self.sampling_offsets(query).view(
+            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
+        attention_weights = self.attention_weights(query).view(
+            bs, num_query, self.num_heads, self.num_levels * self.num_points)
+        attention_weights = attention_weights.softmax(-1)
+
+        attention_weights = attention_weights.view(bs, num_query,
+                                                   self.num_heads,
+                                                   self.num_levels,
+                                                   self.num_points)
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack(
+                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                + sampling_offsets \
+                / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = reference_points[:, :, None, :, None, :2] \
+                + sampling_offsets / self.num_points \
+                * reference_points[:, :, None, :, None, 2:] \
+                * 0.5
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
+        if torch.cuda.is_available() and value.is_cuda:
+
+            output = npu_multi_scale_deformable_attn_function(value, spatial_shapes, level_start_index, sampling_locations, attention_weights) #3
+            
+        else:
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights)
+
+        output = self.output_proj(output)
+
+        if not self.batch_first:
+            # (num_query, bs ,embed_dims)
+            output = output.permute(1, 0, 2)
+
+        return self.dropout(output) + identity
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..979e2206d2ba02c6495c74b49385bb93436a6131
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py
@@ -0,0 +1,404 @@
+
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from .custom_base_transformer_layer import MyCustomBaseTransformerLayer
+import copy
+import warnings
+from mmcv.cnn.bricks.registry import (ATTENTION,
+                                      TRANSFORMER_LAYER,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+from mmcv.cnn.bricks.transformer import TransformerLayerSequence
+from mmcv.runner import force_fp32, auto_fp16
+import numpy as np
+import torch
+import cv2 as cv
+import mmcv
+from mmcv.utils import TORCH_VERSION, digit_version
+from mmcv.utils import ext_loader
+ext_module = ext_loader.load_ext(
+    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class BEVFormerEncoder(TransformerLayerSequence):
+
+    """
+    Attention with both self and cross
+    Implements the decoder in DETR transformer.
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        coder_norm_cfg (dict): Config of last normalization layer. Default：
+            `LN`.
+    """
+
+    def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes',
+                 **kwargs):
+
+        super(BEVFormerEncoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+
+        self.num_points_in_pillar = num_points_in_pillar
+        self.pc_range = pc_range
+        self.fp16_enabled = False
+
+    @staticmethod
+    def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float):
+        """Get the reference points used in SCA and TSA.
+        Args:
+            H, W: spatial shape of bev.
+            Z: hight of pillar.
+            D: sample D points uniformly from each pillar.
+            device (obj:`device`): The device where
+                reference_points should be.
+        Returns:
+            Tensor: reference points used in decoder, has \
+                shape (bs, num_keys, num_levels, 2).
+        """
+
+        # reference points in 3D space, used in spatial cross-attention (SCA)
+        if dim == '3d':
+            zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype,
+                                device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z
+            xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype,
+                                device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W
+            ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype,
+                                device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H
+            ref_3d = torch.stack((xs, ys, zs), -1)
+            ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1)
+            ref_3d = ref_3d[None].repeat(bs, 1, 1, 1)
+            return ref_3d
+
+        # reference points on 2D bev plane, used in temporal self-attention (TSA).
+        elif dim == '2d':
+            ref_y, ref_x = torch.meshgrid(
+                torch.linspace(
+                    0.5, H - 0.5, H, dtype=dtype, device=device),
+                torch.linspace(
+                    0.5, W - 0.5, W, dtype=dtype, device=device)
+            )
+            ref_y = ref_y.reshape(-1)[None] / H
+            ref_x = ref_x.reshape(-1)[None] / W
+            ref_2d = torch.stack((ref_x, ref_y), -1)
+            ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2)
+            return ref_2d
+
+    # This function must use fp32!!!
+    @force_fp32(apply_to=('reference_points', 'img_metas'))
+    def point_sampling(self, reference_points, pc_range,  img_metas):
+
+        lidar2img = []
+        for img_meta in img_metas:
+            lidar2img.append(img_meta['lidar2img'])
+        lidar2img = np.asarray(lidar2img)
+        lidar2img = reference_points.new_tensor(lidar2img)  # (B, N, 4, 4)
+        reference_points = reference_points.clone()
+
+        reference_points[..., 0:1] = reference_points[..., 0:1] * \
+            (pc_range[3] - pc_range[0]) + pc_range[0]
+        reference_points[..., 1:2] = reference_points[..., 1:2] * \
+            (pc_range[4] - pc_range[1]) + pc_range[1]
+        reference_points[..., 2:3] = reference_points[..., 2:3] * \
+            (pc_range[5] - pc_range[2]) + pc_range[2]
+
+        reference_points = torch.cat(
+            (reference_points, torch.ones_like(reference_points[..., :1])), -1)
+
+        reference_points = reference_points.permute(1, 0, 2, 3)
+        D, B, num_query = reference_points.size()[:3]
+        num_cam = lidar2img.size(1)
+
+        reference_points = reference_points.view(
+            D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1)
+
+        lidar2img = lidar2img.view(
+            1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1)
+
+        reference_points_cam = torch.matmul(lidar2img.to(torch.float32),
+                                            reference_points.to(torch.float32)).squeeze(-1)
+        eps = 1e-5
+
+        bev_mask = (reference_points_cam[..., 2:3] > eps)
+        reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum(
+            reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps)
+
+        reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1]
+        reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0]
+
+        bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0)
+                    & (reference_points_cam[..., 1:2] < 1.0)
+                    & (reference_points_cam[..., 0:1] < 1.0)
+                    & (reference_points_cam[..., 0:1] > 0.0))
+        if digit_version(TORCH_VERSION) >= digit_version('1.8'):
+            bev_mask = torch.nan_to_num(bev_mask)
+        else:
+            bev_mask = bev_mask.new_tensor(
+                np.nan_to_num(bev_mask.cpu().numpy()))
+
+        reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4)
+        bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1)
+
+        return reference_points_cam, bev_mask
+
+    @auto_fp16()
+    def forward(self,
+                bev_query,
+                key,
+                value,
+                *args,
+                bev_h=None,
+                bev_w=None,
+                bev_pos=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                valid_ratios=None,
+                prev_bev=None,
+                shift=0.,
+                **kwargs):
+        """Forward function for `TransformerDecoder`.
+        Args:
+            bev_query (Tensor): Input BEV query with shape
+                `(num_query, bs, embed_dims)`.
+            key & value (Tensor): Input multi-cameta features with shape
+                (num_cam, num_value, bs, embed_dims)
+            reference_points (Tensor): The reference
+                points of offset. has shape
+                (bs, num_query, 4) when as_two_stage,
+                otherwise has shape ((bs, num_query, 2).
+            valid_ratios (Tensor): The radios of valid
+                points on the feature map, has shape
+                (bs, num_levels, 2)
+        Returns:
+            Tensor: Results with shape [1, num_query, bs, embed_dims] when
+                return_intermediate is `False`, otherwise it has shape
+                [num_layers, num_query, bs, embed_dims].
+        """
+
+        output = bev_query
+        intermediate = []
+
+        ref_3d = self.get_reference_points(
+            bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1),  device=bev_query.device, dtype=bev_query.dtype)
+        ref_2d = self.get_reference_points(
+            bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype)
+
+        reference_points_cam, bev_mask = self.point_sampling(
+            ref_3d, self.pc_range, kwargs['img_metas'])
+
+        # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper.
+        # shift_ref_2d = ref_2d  # .clone()
+        shift_ref_2d = ref_2d.clone()
+        shift_ref_2d += shift[:, None, None, :]
+
+        # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims)
+        bev_query = bev_query.permute(1, 0, 2)
+        bev_pos = bev_pos.permute(1, 0, 2)
+        bs, len_bev, num_bev_level, _ = ref_2d.shape
+        if prev_bev is not None:
+            prev_bev = prev_bev.permute(1, 0, 2)
+            prev_bev = torch.stack(
+                [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1)
+            hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape(
+                bs*2, len_bev, num_bev_level, 2)
+        else:
+            hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape(
+                bs*2, len_bev, num_bev_level, 2)
+
+        for lid, layer in enumerate(self.layers):
+            output = layer(
+                bev_query,
+                key,
+                value,
+                *args,
+                bev_pos=bev_pos,
+                ref_2d=hybird_ref_2d,
+                ref_3d=ref_3d,
+                bev_h=bev_h,
+                bev_w=bev_w,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                reference_points_cam=reference_points_cam,
+                bev_mask=bev_mask,
+                prev_bev=prev_bev,
+                **kwargs)
+
+            bev_query = output
+            if self.return_intermediate:
+                intermediate.append(output)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate)
+
+        return output
+
+
+@TRANSFORMER_LAYER.register_module()
+class BEVFormerLayer(MyCustomBaseTransformerLayer):
+    """Implements decoder layer in DETR transformer.
+    Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )):
+            Configs for self_attention or cross_attention, the order
+            should be consistent with it in `operation_order`. If it is
+            a dict, it would be expand to the number of attention in
+            `operation_order`.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        ffn_dropout (float): Probability of an element to be zeroed
+            in ffn. Default 0.0.
+        operation_order (tuple[str]): The execution order of operation
+            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Default：None
+        act_cfg (dict): The activation config for FFNs. Default: `LN`
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: `LN`.
+        ffn_num_fcs (int): The number of fully-connected layers in FFNs.
+            Default：2.
+    """
+
+    def __init__(self,
+                 attn_cfgs,
+                 feedforward_channels,
+                 ffn_dropout=0.0,
+                 operation_order=None,
+                 act_cfg=dict(type='ReLU', inplace=True),
+                 norm_cfg=dict(type='LN'),
+                 ffn_num_fcs=2,
+                 **kwargs):
+        super(BEVFormerLayer, self).__init__(
+            attn_cfgs=attn_cfgs,
+            feedforward_channels=feedforward_channels,
+            ffn_dropout=ffn_dropout,
+            operation_order=operation_order,
+            act_cfg=act_cfg,
+            norm_cfg=norm_cfg,
+            ffn_num_fcs=ffn_num_fcs,
+            **kwargs)
+        self.fp16_enabled = False
+        assert len(operation_order) == 6
+        assert set(operation_order) == set(
+            ['self_attn', 'norm', 'cross_attn', 'ffn'])
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                bev_pos=None,
+                query_pos=None,
+                key_pos=None,
+                attn_masks=None,
+                query_key_padding_mask=None,
+                key_padding_mask=None,
+                ref_2d=None,
+                ref_3d=None,
+                bev_h=None,
+                bev_w=None,
+                reference_points_cam=None,
+                mask=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                prev_bev=None,
+                **kwargs):
+        """Forward function for `TransformerDecoderLayer`.
+
+        **kwargs contains some specific arguments of attentions.
+
+        Args:
+            query (Tensor): The input query with shape
+                [num_queries, bs, embed_dims] if
+                self.batch_first is False, else
+                [bs, num_queries embed_dims].
+            key (Tensor): The key tensor with shape [num_keys, bs,
+                embed_dims] if self.batch_first is False, else
+                [bs, num_keys, embed_dims] .
+            value (Tensor): The value tensor with same shape as `key`.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`.
+                Default: None.
+            attn_masks (List[Tensor] | None): 2D Tensor used in
+                calculation of corresponding attention. The length of
+                it should equal to the number of `attention` in
+                `operation_order`. Default: None.
+            query_key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_queries]. Only used in `self_attn` layer.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_keys]. Default: None.
+
+        Returns:
+            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
+        """
+
+        norm_index = 0
+        attn_index = 0
+        ffn_index = 0
+        identity = query
+        if attn_masks is None:
+            attn_masks = [None for _ in range(self.num_attn)]
+        elif isinstance(attn_masks, torch.Tensor):
+            attn_masks = [
+                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
+            ]
+            warnings.warn(f'Use same attn_mask in all attentions in '
+                          f'{self.__class__.__name__} ')
+        else:
+            assert len(attn_masks) == self.num_attn, f'The length of ' \
+                                                     f'attn_masks {len(attn_masks)} must be equal ' \
+                                                     f'to the number of attention in ' \
+                f'operation_order {self.num_attn}'
+
+        for layer in self.operation_order:
+            # temporal self attention
+            if layer == 'self_attn':
+
+                query = self.attentions[attn_index](
+                    query,
+                    prev_bev,
+                    prev_bev,
+                    identity if self.pre_norm else None,
+                    query_pos=bev_pos,
+                    key_pos=bev_pos,
+                    attn_mask=attn_masks[attn_index],
+                    key_padding_mask=query_key_padding_mask,
+                    reference_points=ref_2d,
+                    spatial_shapes=torch.tensor(
+                        [[bev_h, bev_w]], device=query.device),
+                    level_start_index=torch.tensor([0], device=query.device),
+                    **kwargs)
+                attn_index += 1
+                identity = query
+
+            elif layer == 'norm':
+                query = self.norms[norm_index](query)
+                norm_index += 1
+
+            # spaital cross attention
+            elif layer == 'cross_attn':
+                query = self.attentions[attn_index](
+                    query,
+                    key,
+                    value,
+                    identity if self.pre_norm else None,
+                    query_pos=query_pos,
+                    key_pos=key_pos,
+                    reference_points=ref_3d,
+                    reference_points_cam=reference_points_cam,
+                    mask=mask,
+                    attn_mask=attn_masks[attn_index],
+                    key_padding_mask=key_padding_mask,
+                    spatial_shapes=spatial_shapes,
+                    level_start_index=level_start_index,
+                    **kwargs)
+                attn_index += 1
+                identity = query
+
+            elif layer == 'ffn':
+                query = self.ffns[ffn_index](
+                    query, identity if self.pre_norm else None)
+                ffn_index += 1
+
+        return query
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f917343b880994670312d57466c576a8f05d97
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py
@@ -0,0 +1,163 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+import torch
+from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.autograd.function import Function, once_differentiable
+from mmcv.utils import ext_loader
+ext_module = ext_loader.load_ext(
+    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
+
+
+class MultiScaleDeformableAttnFunction_fp16(Function):
+
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
+                sampling_locations, attention_weights, im2col_step):
+        """GPU version of multi-scale deformable attention.
+
+        Args:
+            value (Tensor): The value has shape
+                (bs, num_keys, mum_heads, embed_dims//num_heads)
+            value_spatial_shapes (Tensor): Spatial shape of
+                each feature map, has shape (num_levels, 2),
+                last dimension 2 represent (h, w)
+            sampling_locations (Tensor): The location of sampling points,
+                has shape
+                (bs ,num_queries, num_heads, num_levels, num_points, 2),
+                the last dimension 2 represent (x, y).
+            attention_weights (Tensor): The weight of sampling points used
+                when calculate the attention, has shape
+                (bs ,num_queries, num_heads, num_levels, num_points),
+            im2col_step (Tensor): The step used in image to column.
+
+        Returns:
+            Tensor: has shape (bs, num_queries, embed_dims)
+        """
+        ctx.im2col_step = im2col_step
+        output = ext_module.ms_deform_attn_forward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            im2col_step=ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes,
+                              value_level_start_index, sampling_locations,
+                              attention_weights)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    @custom_bwd
+    def backward(ctx, grad_output):
+        """GPU version of backward function.
+
+        Args:
+            grad_output (Tensor): Gradient
+                of output tensor of forward.
+
+        Returns:
+             Tuple[Tensor]: Gradient
+                of input tensors in forward.
+        """
+        value, value_spatial_shapes, value_level_start_index, \
+            sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value = torch.zeros_like(value)
+        grad_sampling_loc = torch.zeros_like(sampling_locations)
+        grad_attn_weight = torch.zeros_like(attention_weights)
+
+        ext_module.ms_deform_attn_backward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            grad_output.contiguous(),
+            grad_value,
+            grad_sampling_loc,
+            grad_attn_weight,
+            im2col_step=ctx.im2col_step)
+
+        return grad_value, None, None, \
+            grad_sampling_loc, grad_attn_weight, None
+
+
+class MultiScaleDeformableAttnFunction_fp32(Function):
+
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float32)
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
+                sampling_locations, attention_weights, im2col_step):
+        """GPU version of multi-scale deformable attention.
+
+        Args:
+            value (Tensor): The value has shape
+                (bs, num_keys, mum_heads, embed_dims//num_heads)
+            value_spatial_shapes (Tensor): Spatial shape of
+                each feature map, has shape (num_levels, 2),
+                last dimension 2 represent (h, w)
+            sampling_locations (Tensor): The location of sampling points,
+                has shape
+                (bs ,num_queries, num_heads, num_levels, num_points, 2),
+                the last dimension 2 represent (x, y).
+            attention_weights (Tensor): The weight of sampling points used
+                when calculate the attention, has shape
+                (bs ,num_queries, num_heads, num_levels, num_points),
+            im2col_step (Tensor): The step used in image to column.
+
+        Returns:
+            Tensor: has shape (bs, num_queries, embed_dims)
+        """
+
+        ctx.im2col_step = im2col_step
+        output = ext_module.ms_deform_attn_forward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            im2col_step=ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes,
+                              value_level_start_index, sampling_locations,
+                              attention_weights)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    @custom_bwd
+    def backward(ctx, grad_output):
+        """GPU version of backward function.
+
+        Args:
+            grad_output (Tensor): Gradient
+                of output tensor of forward.
+
+        Returns:
+             Tuple[Tensor]: Gradient
+                of input tensors in forward.
+        """
+        value, value_spatial_shapes, value_level_start_index, \
+            sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value = torch.zeros_like(value)
+        grad_sampling_loc = torch.zeros_like(sampling_locations)
+        grad_attn_weight = torch.zeros_like(attention_weights)
+
+        ext_module.ms_deform_attn_backward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            grad_output.contiguous(),
+            grad_value,
+            grad_sampling_loc,
+            grad_attn_weight,
+            im2col_step=ctx.im2col_step)
+
+        return grad_value, None, None, \
+            grad_sampling_loc, grad_attn_weight, None
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2b45612f4f348b5e427c20fe1ffd4c68d3bb396
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py
@@ -0,0 +1,615 @@
+
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
+import warnings
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import xavier_init, constant_init
+from mmcv.cnn.bricks.registry import (ATTENTION,
+                                      TRANSFORMER_LAYER,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+from mmcv.cnn.bricks.transformer import build_attention
+import math
+from mmcv.runner import force_fp32, auto_fp16
+
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+
+from mmcv.utils import ext_loader
+# from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \
+#     MultiScaleDeformableAttnFunction_fp16
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+ext_module = ext_loader.load_ext(
+    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
+
+from mx_driving.fused import npu_multi_scale_deformable_attn_function #3
+
+@ATTENTION.register_module()
+class SpatialCrossAttention(BaseModule):
+    """An attention module used in BEVFormer.
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_cams (int): The number of cameras
+        dropout (float): A Dropout layer on `inp_residual`.
+            Default: 0..
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        deformable_attention: (dict): The config for the deformable attention used in SCA.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_cams=6,
+                 pc_range=None,
+                 dropout=0.1,
+                 init_cfg=None,
+                 batch_first=False,
+                 deformable_attention=dict(
+                     type='MSDeformableAttention3D',
+                     embed_dims=256,
+                     num_levels=4),
+                 **kwargs
+                 ):
+        super(SpatialCrossAttention, self).__init__(init_cfg)
+
+        self.init_cfg = init_cfg
+        self.dropout = nn.Dropout(dropout)
+        self.pc_range = pc_range
+        self.fp16_enabled = False
+        self.deformable_attention = build_attention(deformable_attention)
+        self.embed_dims = embed_dims
+        self.num_cams = num_cams
+        self.output_proj = nn.Linear(embed_dims, embed_dims)
+        self.batch_first = batch_first
+        self.init_weight()
+
+    def init_weight(self):
+        """Default initialization for Parameters of Module."""
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+    
+    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam'))
+    def forward(self,
+                query,
+                key,
+                value,
+                residual=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                reference_points_cam=None,
+                bev_mask=None,
+                level_start_index=None,
+                flag='encoder',
+                **kwargs):
+        """Forward Function of Detr3DCrossAtten.
+        Args:
+            query (Tensor): Query of Transformer with shape
+                (num_query, bs, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(num_key, bs, embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(num_key, bs, embed_dims)`. (B, N, C, H, W)
+            residual (Tensor): The tensor used for addition, with the
+                same shape as `x`. Default None. If None, `x` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for  `key`. Default
+                None.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, 4),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_key].
+            spatial_shapes (Tensor): Spatial shape of features in
+                different level. With shape  (num_levels, 2),
+                last dimension represent (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if key is None:
+            key = query
+        if value is None:
+            value = key
+
+        if residual is None:
+            inp_residual = query
+            slots = torch.zeros_like(query)
+        if query_pos is not None:
+            query = query + query_pos
+
+        bs, num_query, _ = query.size()
+
+        D = reference_points_cam.size(3)
+        indexes = []
+        for i, mask_per_img in enumerate(bev_mask):
+            index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1)
+            indexes.append(index_query_per_img)
+        max_len = max([len(each) for each in indexes])
+
+        # each camera only interacts with its corresponding BEV queries. This step can  greatly save GPU memory.
+        queries_rebatch = query.new_zeros(
+            [bs, self.num_cams, max_len, self.embed_dims])
+        reference_points_rebatch = reference_points_cam.new_zeros(
+            [bs, self.num_cams, max_len, D, 2])
+        
+        for j in range(bs):
+            for i, reference_points_per_img in enumerate(reference_points_cam):   
+                index_query_per_img = indexes[i]
+                queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img]
+                reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img]
+
+        num_cams, l, bs, embed_dims = key.shape
+
+        key = key.permute(2, 0, 1, 3).reshape(
+            bs * self.num_cams, l, self.embed_dims)
+        value = value.permute(2, 0, 1, 3).reshape(
+            bs * self.num_cams, l, self.embed_dims)
+
+        queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value,
+                                            reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes,
+                                            level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims)
+        for j in range(bs):
+            for i, index_query_per_img in enumerate(indexes):
+                slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)]
+
+        count = bev_mask.sum(-1) > 0
+        count = count.permute(1, 2, 0).sum(-1)
+        count = torch.clamp(count, min=1.0)
+        slots = slots / count[..., None]
+        slots = self.output_proj(slots)
+
+        return self.dropout(slots) + inp_residual
+
+
+@ATTENTION.register_module()
+class MSDeformableAttention3D(BaseModule):
+    """An attention module used in BEVFormer based on Deformable-Detr.
+    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
+    <https://arxiv.org/pdf/2010.04159.pdf>`_.
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 64.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): A Dropout layer on `inp_identity`.
+            Default: 0.1.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=8,
+                 im2col_step=64,
+                 dropout=0.1,
+                 batch_first=True,
+                 norm_cfg=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.batch_first = batch_first
+        self.output_proj = None
+        self.fp16_enabled = False
+
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        self.num_levels = num_levels
+        self.num_heads = num_heads
+        self.num_points = num_points
+        self.sampling_offsets = nn.Linear(
+            embed_dims, num_heads * num_levels * num_points * 2)
+        self.attention_weights = nn.Linear(embed_dims,
+                                           num_heads * num_levels * num_points)
+        self.value_proj = nn.Linear(embed_dims, embed_dims)
+
+        self.init_weights()
+
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        constant_init(self.sampling_offsets, 0.)
+        thetas = torch.arange(
+            self.num_heads,
+            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init /
+                     grid_init.abs().max(-1, keepdim=True)[0]).view(
+            self.num_heads, 1, 1,
+            2).repeat(1, self.num_levels, self.num_points, 1)
+        for i in range(self.num_points):
+            grid_init[:, :, i, :] *= i + 1
+
+        self.sampling_offsets.bias.data = grid_init.view(-1)
+        constant_init(self.attention_weights, val=0., bias=0.)
+        xavier_init(self.value_proj, distribution='uniform', bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+        self._is_init = True
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                **kwargs):
+        """Forward Function of MultiScaleDeformAttention.
+        Args:
+            query (Tensor): Query of Transformer with shape
+                ( bs, num_query, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            identity (Tensor): The tensor used for addition, with the
+                same shape as `query`. Default None. If None,
+                `query` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`. Default
+                None.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_key].
+            spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if value is None:
+            value = query
+        if identity is None:
+            identity = query
+        if query_pos is not None:
+            query = query + query_pos
+
+        if not self.batch_first:
+            # change to (bs, num_query ,embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+
+        bs, num_query, _ = query.shape
+        bs, num_value, _ = value.shape
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+
+        value = self.value_proj(value)
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], 0.0)
+        value = value.view(bs, num_value, self.num_heads, -1)
+        sampling_offsets = self.sampling_offsets(query).view(
+            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
+        attention_weights = self.attention_weights(query).view(
+            bs, num_query, self.num_heads, self.num_levels * self.num_points)
+
+        attention_weights = attention_weights.softmax(-1)
+
+        attention_weights = attention_weights.view(bs, num_query,
+                                                   self.num_heads,
+                                                   self.num_levels,
+                                                   self.num_points)
+
+        if reference_points.shape[-1] == 2:
+            """
+            For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights.
+            After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image.
+            For each referent point, we sample `num_points` sampling points.
+            For `num_Z_anchors` reference points,  it has overall `num_points * num_Z_anchors` sampling points.
+            """
+            offset_normalizer = torch.stack(
+                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+
+            bs, num_query, num_Z_anchors, xy = reference_points.shape
+            reference_points = reference_points[:, :, None, None, None, :, :]
+            sampling_offsets = sampling_offsets / \
+                offset_normalizer[None, None, None, :, None, :]
+            bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape
+            sampling_offsets = sampling_offsets.view(
+                bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy)
+            sampling_locations = reference_points + sampling_offsets
+            bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape
+            assert num_all_points == num_points * num_Z_anchors
+
+            sampling_locations = sampling_locations.view(
+                bs, num_query, num_heads, num_levels, num_all_points, xy)
+
+        elif reference_points.shape[-1] == 4:
+            assert False
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
+
+        #  sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2
+        #  attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points
+        #
+
+        if torch.cuda.is_available() and value.is_cuda:
+            output = npu_multi_scale_deformable_attn_function(value, spatial_shapes, level_start_index, sampling_locations, attention_weights) #3
+        else:
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights)
+        if not self.batch_first:
+            output = output.permute(1, 0, 2)
+
+        return output
+
+
+
+@ATTENTION.register_module()
+class MSIPM3D(BaseModule):
+    """An attention module used in BEVFormer based on Deformable-Detr.
+    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
+    <https://arxiv.org/pdf/2010.04159.pdf>`_.
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 64.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): A Dropout layer on `inp_identity`.
+            Default: 0.1.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=8,
+                 im2col_step=64,
+                 dropout=0.1,
+                 batch_first=True,
+                 norm_cfg=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.batch_first = batch_first
+        self.output_proj = None
+        self.fp16_enabled = False
+
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        self.num_levels = num_levels
+        self.num_heads = num_heads
+        self.num_points = num_points
+        # self.sampling_offsets = nn.Linear(
+        #     embed_dims, num_heads * num_levels * num_points * 2)
+        # self.attention_weights = nn.Linear(embed_dims,
+        #                                    num_heads * num_levels * num_points)
+        self.value_proj = nn.Linear(embed_dims, embed_dims)
+
+        self.init_weights()
+
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        # constant_init(self.sampling_offsets, 0.)
+        thetas = torch.arange(
+            self.num_heads,
+            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init /
+                     grid_init.abs().max(-1, keepdim=True)[0]).view(
+            self.num_heads, 1, 1,
+            2).repeat(1, self.num_levels, self.num_points, 1)
+        for i in range(self.num_points):
+            grid_init[:, :, i, :] *= i + 1
+
+        # self.sampling_offsets.bias.data = grid_init.view(-1)
+        self.fixed_sampling_offsets = nn.Parameter(grid_init.view(-1), requires_grad=False)
+        # constant_init(self.attention_weights, val=0., bias=0.)
+        xavier_init(self.value_proj, distribution='uniform', bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+        self._is_init = True
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                **kwargs):
+        """Forward Function of MultiScaleDeformAttention.
+        Args:
+            query (Tensor): Query of Transformer with shape
+                ( bs, num_query, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            identity (Tensor): The tensor used for addition, with the
+                same shape as `query`. Default None. If None,
+                `query` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`. Default
+                None.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_key].
+            spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if value is None:
+            value = query
+        if identity is None:
+            identity = query
+        if query_pos is not None:
+            query = query + query_pos
+
+        if not self.batch_first:
+            # change to (bs, num_query ,embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+
+        bs, num_query, _ = query.shape
+        bs, num_value, _ = value.shape
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+
+        value = self.value_proj(value)
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], 0.0)
+        value = value.view(bs, num_value, self.num_heads, -1)
+        sampling_offsets = self.fixed_sampling_offsets.view(
+            1, 1, self.num_heads, self.num_levels, self.num_points, 2).repeat(
+            bs, num_query, 1, 1, 1,1)
+        # attention_weights = self.attention_weights(query).view(
+        #     bs, num_query, self.num_heads, self.num_levels * self.num_points)
+        attention_weights = query.new_ones((bs, num_query, self.num_heads, self.num_levels * self.num_points))
+        attention_weights = attention_weights.softmax(-1)
+        # import pdb;pdb.set_trace()
+        attention_weights = attention_weights.view(bs, num_query,
+                                                   self.num_heads,
+                                                   self.num_levels,
+                                                   self.num_points)
+
+        if reference_points.shape[-1] == 2:
+            """
+            For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights.
+            After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image.
+            For each referent point, we sample `num_points` sampling points.
+            For `num_Z_anchors` reference points,  it has overall `num_points * num_Z_anchors` sampling points.
+            """
+            offset_normalizer = torch.stack(
+                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+
+            bs, num_query, num_Z_anchors, xy = reference_points.shape
+            reference_points = reference_points[:, :, None, None, None, :, :]
+            sampling_offsets = sampling_offsets / \
+                offset_normalizer[None, None, None, :, None, :]
+            bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape
+            sampling_offsets = sampling_offsets.view(
+                bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy)
+            sampling_locations = reference_points + sampling_offsets
+            bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape
+            assert num_all_points == num_points * num_Z_anchors
+
+            sampling_locations = sampling_locations.view(
+                bs, num_query, num_heads, num_levels, num_all_points, xy)
+
+        elif reference_points.shape[-1] == 4:
+            assert False
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
+
+        #  sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2
+        #  attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points
+        #
+
+        if torch.cuda.is_available() and value.is_cuda:
+            output = npu_multi_scale_deformable_attn_function(value, spatial_shapes, level_start_index, sampling_locations, attention_weights) #3
+        else:
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights)
+        if not self.batch_first:
+            output = output.permute(1, 0, 2)
+
+        return output
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..094f46a5c1f34db77d453e30af29a09d84fe9867
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py
@@ -0,0 +1,267 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+# from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32
+from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
+import warnings
+import torch
+import torch.nn as nn
+from mmcv.cnn import xavier_init, constant_init
+from mmcv.cnn.bricks.registry import ATTENTION
+import math
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
+                        to_2tuple)
+
+from mmcv.utils import ext_loader
+ext_module = ext_loader.load_ext(
+    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
+
+from mx_driving.fused import npu_multi_scale_deformable_attn_function #3
+
+@ATTENTION.register_module()
+class TemporalSelfAttention(BaseModule):
+    """An attention module used in BEVFormer based on Deformable-Detr.
+
+    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
+    <https://arxiv.org/pdf/2010.04159.pdf>`_.
+
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 64.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): A Dropout layer on `inp_identity`.
+            Default: 0.1.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to True.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV.
+         the length of BEV queue is 2.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=4,
+                 num_bev_queue=2,
+                 im2col_step=64,
+                 dropout=0.1,
+                 batch_first=True,
+                 norm_cfg=None,
+                 init_cfg=None):
+
+        super().__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.dropout = nn.Dropout(dropout)
+        self.batch_first = batch_first
+        self.fp16_enabled = False
+
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        self.num_levels = num_levels
+        self.num_heads = num_heads
+        self.num_points = num_points
+        self.num_bev_queue = num_bev_queue
+        self.sampling_offsets = nn.Linear(
+            embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2)
+        self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue,
+                                           num_bev_queue*num_heads * num_levels * num_points)
+        self.value_proj = nn.Linear(embed_dims, embed_dims)
+        self.output_proj = nn.Linear(embed_dims, embed_dims)
+        self.init_weights()
+
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        constant_init(self.sampling_offsets, 0.)
+        thetas = torch.arange(
+            self.num_heads,
+            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init /
+                     grid_init.abs().max(-1, keepdim=True)[0]).view(
+            self.num_heads, 1, 1,
+            2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1)
+
+        for i in range(self.num_points):
+            grid_init[:, :, i, :] *= i + 1
+
+        self.sampling_offsets.bias.data = grid_init.view(-1)
+        constant_init(self.attention_weights, val=0., bias=0.)
+        xavier_init(self.value_proj, distribution='uniform', bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+        self._is_init = True
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                flag='decoder',
+
+                **kwargs):
+        """Forward Function of MultiScaleDeformAttention.
+
+        Args:
+            query (Tensor): Query of Transformer with shape
+                (num_query, bs, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(num_key, bs, embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(num_key, bs, embed_dims)`.
+            identity (Tensor): The tensor used for addition, with the
+                same shape as `query`. Default None. If None,
+                `query` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`. Default
+                None.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_key].
+            spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if value is None:
+            assert self.batch_first
+            bs, len_bev, c = query.shape
+            value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c)
+
+            # value = torch.cat([query, query], 0)
+
+        if identity is None:
+            identity = query
+        if query_pos is not None:
+            query = query + query_pos
+        if not self.batch_first:
+            # change to (bs, num_query ,embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+        bs,  num_query, embed_dims = query.shape
+        _, num_value, _ = value.shape
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+        assert self.num_bev_queue == 2
+
+        query = torch.cat([value[:bs], query], -1)
+        value = self.value_proj(value)
+
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], 0.0)
+
+        value = value.reshape(bs*self.num_bev_queue,
+                              num_value, self.num_heads, -1)
+
+        sampling_offsets = self.sampling_offsets(query)
+        sampling_offsets = sampling_offsets.view(
+            bs, num_query, self.num_heads,  self.num_bev_queue, self.num_levels, self.num_points, 2)
+        attention_weights = self.attention_weights(query).view(
+            bs, num_query,  self.num_heads, self.num_bev_queue, self.num_levels * self.num_points)
+        attention_weights = attention_weights.softmax(-1)
+
+        attention_weights = attention_weights.view(bs, num_query,
+                                                   self.num_heads,
+                                                   self.num_bev_queue,
+                                                   self.num_levels,
+                                                   self.num_points)
+
+        attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\
+            .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous()
+        sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\
+            .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2)
+
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack(
+                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                + sampling_offsets \
+                / offset_normalizer[None, None, None, :, None, :]
+
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = reference_points[:, :, None, :, None, :2] \
+                + sampling_offsets / self.num_points \
+                * reference_points[:, :, None, :, None, 2:] \
+                * 0.5
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
+        if torch.cuda.is_available() and value.is_cuda:
+
+            output = npu_multi_scale_deformable_attn_function(value, spatial_shapes, level_start_index, sampling_locations, attention_weights) #3
+            
+        else:
+
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights)
+
+        # output shape (bs*num_bev_queue, num_query, embed_dims)
+        # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue)
+        output = output.permute(1, 2, 0)
+
+        # fuse history value and current value
+        # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue)
+        output = output.view(num_query, embed_dims, bs, self.num_bev_queue)
+        output = output.mean(-1)
+
+        # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims)
+        output = output.permute(2, 0, 1)
+
+        output = self.output_proj(output)
+
+        if not self.batch_first:
+            output = output.permute(1, 0, 2)
+
+        return self.dropout(output) + identity
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/transformer.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f86127627ef764316348b09688a3e0ea3b38ae4
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/modules/transformer.py
@@ -0,0 +1,289 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import xavier_init
+from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence
+from mmcv.runner.base_module import BaseModule
+
+from mmdet.models.utils.builder import TRANSFORMER
+from torch.nn.init import normal_
+from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from mmcv.runner.base_module import BaseModule
+from torchvision.transforms.functional import rotate
+from .temporal_self_attention import TemporalSelfAttention
+from .spatial_cross_attention import MSDeformableAttention3D
+from .decoder import CustomMSDeformableAttention
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+from mmcv.runner import force_fp32, auto_fp16
+
+
+@TRANSFORMER.register_module()
+class PerceptionTransformer(BaseModule):
+    """Implements the Detr3D transformer.
+    Args:
+        as_two_stage (bool): Generate query from encoder features.
+            Default: False.
+        num_feature_levels (int): Number of feature maps from FPN:
+            Default: 4.
+        two_stage_num_proposals (int): Number of proposals when set
+            `as_two_stage` as True. Default: 300.
+    """
+
+    def __init__(self,
+                 num_feature_levels=4,
+                 num_cams=6,
+                 two_stage_num_proposals=300,
+                 encoder=None,
+                 decoder=None,
+                 embed_dims=256,
+                 rotate_prev_bev=True,
+                 use_shift=True,
+                 use_can_bus=True,
+                 can_bus_norm=True,
+                 use_cams_embeds=True,
+                 rotate_center=[100, 100],
+                 **kwargs):
+        super(PerceptionTransformer, self).__init__(**kwargs)
+        self.encoder = build_transformer_layer_sequence(encoder)
+        self.decoder = build_transformer_layer_sequence(decoder)
+        self.embed_dims = embed_dims
+        self.num_feature_levels = num_feature_levels
+        self.num_cams = num_cams
+        self.fp16_enabled = False
+
+        self.rotate_prev_bev = rotate_prev_bev
+        self.use_shift = use_shift
+        self.use_can_bus = use_can_bus
+        self.can_bus_norm = can_bus_norm
+        self.use_cams_embeds = use_cams_embeds
+
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.init_layers()
+        self.rotate_center = rotate_center
+
+    def init_layers(self):
+        """Initialize layers of the Detr3DTransformer."""
+        self.level_embeds = nn.Parameter(torch.Tensor(
+            self.num_feature_levels, self.embed_dims))
+        self.cams_embeds = nn.Parameter(
+            torch.Tensor(self.num_cams, self.embed_dims))
+        self.reference_points = nn.Linear(self.embed_dims, 3)
+        self.can_bus_mlp = nn.Sequential(
+            nn.Linear(18, self.embed_dims // 2),
+            nn.ReLU(inplace=True),
+            nn.Linear(self.embed_dims // 2, self.embed_dims),
+            nn.ReLU(inplace=True),
+        )
+        if self.can_bus_norm:
+            self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims))
+
+    def init_weights(self):
+        """Initialize the transformer weights."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \
+                    or isinstance(m, CustomMSDeformableAttention):
+                try:
+                    m.init_weight()
+                except AttributeError:
+                    m.init_weights()
+        normal_(self.level_embeds)
+        normal_(self.cams_embeds)
+        xavier_init(self.reference_points, distribution='uniform', bias=0.)
+        xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.)
+
+    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos'))
+    def get_bev_features(
+            self,
+            mlvl_feats,
+            bev_queries,
+            bev_h,
+            bev_w,
+            grid_length=[0.512, 0.512],
+            bev_pos=None,
+            prev_bev=None,
+            **kwargs):
+        """
+        obtain bev features.
+        """
+
+        bs = mlvl_feats[0].size(0)
+        bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1)
+        bev_pos = bev_pos.flatten(2).permute(2, 0, 1)
+
+        # obtain rotation angle and shift with ego motion
+        delta_x = np.array([each['can_bus'][0]
+                           for each in kwargs['img_metas']])
+        delta_y = np.array([each['can_bus'][1]
+                           for each in kwargs['img_metas']])
+        ego_angle = np.array(
+            [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']])
+        grid_length_y = grid_length[0]
+        grid_length_x = grid_length[1]
+        translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2)
+        translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180
+        bev_angle = ego_angle - translation_angle
+        shift_y = translation_length * \
+            np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h
+        shift_x = translation_length * \
+            np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w
+        shift_y = shift_y * self.use_shift
+        shift_x = shift_x * self.use_shift
+        shift = bev_queries.new_tensor(
+            [shift_x, shift_y]).permute(1, 0)  # xy, bs -> bs, xy
+
+        if prev_bev is not None:
+            if prev_bev.shape[1] == bev_h * bev_w:
+                prev_bev = prev_bev.permute(1, 0, 2)
+            if self.rotate_prev_bev:
+                for i in range(bs):
+                    # num_prev_bev = prev_bev.size(1)
+                    rotation_angle = kwargs['img_metas'][i]['can_bus'][-1]
+                    tmp_prev_bev = prev_bev[:, i].reshape(
+                        bev_h, bev_w, -1).permute(2, 0, 1)
+                    tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle,
+                                          center=self.rotate_center)
+                    tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape(
+                        bev_h * bev_w, 1, -1)
+                    prev_bev[:, i] = tmp_prev_bev[:, 0]
+
+        # add can bus signals
+        can_bus = bev_queries.new_tensor(
+            [each['can_bus'] for each in kwargs['img_metas']])  # [:, :]
+        can_bus = self.can_bus_mlp(can_bus)[None, :, :]
+        bev_queries = bev_queries + can_bus * self.use_can_bus
+
+        feat_flatten = []
+        spatial_shapes = []
+        for lvl, feat in enumerate(mlvl_feats):
+            bs, num_cam, c, h, w = feat.shape
+            spatial_shape = (h, w)
+            feat = feat.flatten(3).permute(1, 0, 3, 2)
+            if self.use_cams_embeds:
+                feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype)
+            feat = feat + self.level_embeds[None,
+                                            None, lvl:lvl + 1, :].to(feat.dtype)
+            spatial_shapes.append(spatial_shape)
+            feat_flatten.append(feat)
+
+        feat_flatten = torch.cat(feat_flatten, 2)
+        spatial_shapes = torch.as_tensor(
+            spatial_shapes, dtype=torch.long, device=bev_pos.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros(
+            (1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+
+        feat_flatten = feat_flatten.permute(
+            0, 2, 1, 3)  # (num_cam, H*W, bs, embed_dims)
+
+        bev_embed = self.encoder(
+            bev_queries,
+            feat_flatten,
+            feat_flatten,
+            bev_h=bev_h,
+            bev_w=bev_w,
+            bev_pos=bev_pos,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            prev_bev=prev_bev,
+            shift=shift,
+            **kwargs
+        )
+
+        return bev_embed
+
+    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos'))
+    def forward(self,
+                mlvl_feats,
+                bev_queries,
+                object_query_embed,
+                bev_h,
+                bev_w,
+                grid_length=[0.512, 0.512],
+                bev_pos=None,
+                reg_branches=None,
+                cls_branches=None,
+                prev_bev=None,
+                **kwargs):
+        """Forward function for `Detr3DTransformer`.
+        Args:
+            mlvl_feats (list(Tensor)): Input queries from
+                different level. Each element has shape
+                [bs, num_cams, embed_dims, h, w].
+            bev_queries (Tensor): (bev_h*bev_w, c)
+            bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w)
+            object_query_embed (Tensor): The query embedding for decoder,
+                with shape [num_query, c].
+            reg_branches (obj:`nn.ModuleList`): Regression heads for
+                feature maps from each decoder layer. Only would
+                be passed when `with_box_refine` is True. Default to None.
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+                - bev_embed: BEV features
+                - inter_states: Outputs from decoder. If
+                    return_intermediate_dec is True output has shape \
+                      (num_dec_layers, bs, num_query, embed_dims), else has \
+                      shape (1, bs, num_query, embed_dims).
+                - init_reference_out: The initial value of reference \
+                    points, has shape (bs, num_queries, 4).
+                - inter_references_out: The internal value of reference \
+                    points in decoder, has shape \
+                    (num_dec_layers, bs,num_query, embed_dims)
+                - enc_outputs_class: The classification score of \
+                    proposals generated from \
+                    encoder's feature maps, has shape \
+                    (batch, h*w, num_classes). \
+                    Only would be returned when `as_two_stage` is True, \
+                    otherwise None.
+                - enc_outputs_coord_unact: The regression results \
+                    generated from encoder's feature maps., has shape \
+                    (batch, h*w, 4). Only would \
+                    be returned when `as_two_stage` is True, \
+                    otherwise None.
+        """
+
+        bev_embed = self.get_bev_features(
+            mlvl_feats,
+            bev_queries,
+            bev_h,
+            bev_w,
+            grid_length=grid_length,
+            bev_pos=bev_pos,
+            prev_bev=prev_bev,
+            **kwargs)  # bev_embed shape: bs, bev_h*bev_w, embed_dims
+
+        bs = mlvl_feats[0].size(0)
+        query_pos, query = torch.split(
+            object_query_embed, self.embed_dims, dim=1)
+        query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
+        query = query.unsqueeze(0).expand(bs, -1, -1)
+        reference_points = self.reference_points(query_pos)
+        reference_points = reference_points.sigmoid()
+        init_reference_out = reference_points
+
+        query = query.permute(1, 0, 2)
+        query_pos = query_pos.permute(1, 0, 2)
+        bev_embed = bev_embed.permute(1, 0, 2)
+
+        inter_states, inter_references = self.decoder(
+            query=query,
+            key=None,
+            value=bev_embed,
+            query_pos=query_pos,
+            reference_points=reference_points,
+            reg_branches=reg_branches,
+            cls_branches=cls_branches,
+            spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device),
+            level_start_index=torch.tensor([0], device=query.device),
+            **kwargs)
+
+        inter_references_out = inter_references
+
+        return bev_embed, inter_states, init_reference_out, inter_references_out
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/runner/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/runner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03f906ce601e2dfac207af680774086067808830
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/runner/__init__.py
@@ -0,0 +1 @@
+from .epoch_based_runner import EpochBasedRunner_video
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d167e2e1c63beed011e75bedb05f43817fe797b
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py
@@ -0,0 +1,96 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+import os.path as osp
+import torch
+import mmcv
+from mmcv.runner.base_runner import BaseRunner
+from mmcv.runner.epoch_based_runner import EpochBasedRunner
+from mmcv.runner.builder import RUNNERS
+from mmcv.runner.checkpoint import save_checkpoint
+from mmcv.runner.utils import get_host_info
+from pprint import pprint
+from mmcv.parallel.data_container import DataContainer
+
+
+@RUNNERS.register_module()
+class EpochBasedRunner_video(EpochBasedRunner):
+    
+    ''' 
+    # basic logic
+    
+    input_sequence = [a, b, c] # given a sequence of samples
+    
+    prev_bev = None
+    for each in input_sequcene[:-1]
+        prev_bev = eval_model(each, prev_bev)) # inference only.
+    
+    model(input_sequcene[-1], prev_bev) # train the last sample.
+    '''
+    
+    def __init__(self,
+                 model,
+                 eval_model=None,
+                 batch_processor=None,
+                 optimizer=None,
+                 work_dir=None,
+                 logger=None,
+                 meta=None,
+                 keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'],
+                 max_iters=None,
+                 max_epochs=None):
+        super().__init__(model,
+                 batch_processor,
+                 optimizer,
+                 work_dir,
+                 logger,
+                 meta,
+                 max_iters,
+                 max_epochs)
+        keys.append('img_metas')
+        self.keys = keys
+        self.eval_model = eval_model
+        self.eval_model.eval()
+    
+    def run_iter(self, data_batch, train_mode, **kwargs):
+        if self.batch_processor is not None:
+            assert False
+            # outputs = self.batch_processor(
+            #     self.model, data_batch, train_mode=train_mode, **kwargs)
+        elif train_mode:
+
+            num_samples = data_batch['img'].data[0].size(1)
+            data_list = []
+            prev_bev = None
+            for i in range(num_samples):
+                data = {}
+                for key in self.keys:
+                    if key not in ['img_metas', 'img', 'points']:
+                        data[key] = data_batch[key]
+                    else:
+                        if key == 'img':
+                            data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True)
+                        elif key == 'img_metas':
+                            data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only)
+                        else:
+                            assert False
+                data_list.append(data)
+            with torch.no_grad():
+                for i in range(num_samples-1):
+                    if i>0: data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False)
+                    prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs)
+            
+            data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False)
+            outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs)
+        else:
+            assert False
+            # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)
+
+        if not isinstance(outputs, dict):
+            raise TypeError('"batch_processor()" or "model.train_step()"'
+                            'and "model.val_step()" must return a dict')
+        if 'log_vars' in outputs:
+            self.log_buffer.update(outputs['log_vars'], outputs['num_samples'])
+        self.outputs = outputs
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..23156d8d589a4102824d650e76cd1d0ecba3ff49
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py
@@ -0,0 +1,3 @@
+from .hungarian_assigner_3d import HungarianAssigner3D
+
+__all__ = ['HungarianAssigner3D']
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..583fcab72f6b2bbf20bda90b8f877cc1f81072d9
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py
@@ -0,0 +1,136 @@
+import torch
+
+from mmdet.core.bbox.builder import BBOX_ASSIGNERS
+from mmdet.core.bbox.assigners import AssignResult
+from mmdet.core.bbox.assigners import BaseAssigner
+from mmdet.core.bbox.match_costs import build_match_cost
+from mmdet.models.utils.transformer import inverse_sigmoid
+from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+
+@BBOX_ASSIGNERS.register_module()
+class HungarianAssigner3D(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth.
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of three components:
+    classification cost, regression L1 cost and regression iou cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched
+    are treated as backgrounds. Thus each query prediction will be assigned
+    with `0` or a positive integer indicating the ground truth index:
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+    Args:
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+        bbox_weight (int | float, optional): The scale factor for regression
+            L1 cost. Default 1.0.
+        iou_weight (int | float, optional): The scale factor for regression
+            iou cost. Default 1.0.
+        iou_calculator (dict | optional): The config for the iou calculation.
+            Default type `BboxOverlaps2D`.
+        iou_mode (str | optional): "iou" (intersection over union), "iof"
+                (intersection over foreground), or "giou" (generalized
+                intersection over union). Default "giou".
+    """
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.),
+                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+                 iou_cost=dict(type='IoUCost', weight=0.0),
+                 pc_range=None):
+        self.cls_cost = build_match_cost(cls_cost)
+        self.reg_cost = build_match_cost(reg_cost)
+        self.iou_cost = build_match_cost(iou_cost)
+        self.pc_range = pc_range
+
+    def assign(self,
+               bbox_pred,
+               cls_pred,
+               gt_bboxes, 
+               gt_labels,
+               gt_bboxes_ignore=None,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`. Default None.
+            eps (int | float, optional): A value added to the denominator for
+                numerical stability. Default 1e-7.
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert gt_bboxes_ignore is None, \
+            'Only case when gt_bboxes_ignore is None is supported.'
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              -1,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels)
+
+        # 2. compute the weighted costs
+        # classification and bboxcost.
+        cls_cost = self.cls_cost(cls_pred, gt_labels)
+        # regression L1 cost
+       
+        normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
+    
+        reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])
+      
+        # weighted sum of above two costs
+        cost = cls_cost + reg_cost
+        
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(
+            bbox_pred.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(
+            bbox_pred.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels)
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/coders/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/coders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2be78ce5389f62d716cc5f1aded821641aa88dea
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/coders/__init__.py
@@ -0,0 +1,3 @@
+from .nms_free_coder import NMSFreeCoder, MapTRNMSFreeCoder
+
+__all__ = ['NMSFreeCoder', 'MapTRNMSFreeCoder']
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfcdae8cfa213b5d87f9dbbb8751b2cdf90e7c3a
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py
@@ -0,0 +1,283 @@
+import torch
+
+from mmdet.core.bbox import BaseBBoxCoder
+from mmdet.core.bbox.builder import BBOX_CODERS
+from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox
+import numpy as np
+from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh, bbox_cxcywh_to_xyxy
+
+
+def normalize_2d_bbox(bboxes, pc_range):
+
+    patch_h = pc_range[4]-pc_range[1]
+    patch_w = pc_range[3]-pc_range[0]
+    cxcywh_bboxes = bbox_xyxy_to_cxcywh(bboxes)
+    cxcywh_bboxes[...,0:1] = cxcywh_bboxes[..., 0:1] - pc_range[0]
+    cxcywh_bboxes[...,1:2] = cxcywh_bboxes[...,1:2] - pc_range[1]
+    factor = bboxes.new_tensor([patch_w, patch_h,patch_w,patch_h])
+
+    normalized_bboxes = cxcywh_bboxes / factor
+    return normalized_bboxes
+
+def normalize_2d_pts(pts, pc_range):
+    patch_h = pc_range[4]-pc_range[1]
+    patch_w = pc_range[3]-pc_range[0]
+    new_pts = pts.clone()
+    new_pts[...,0:1] = pts[..., 0:1] - pc_range[0]
+    new_pts[...,1:2] = pts[...,1:2] - pc_range[1]
+    factor = pts.new_tensor([patch_w, patch_h])
+    normalized_pts = new_pts / factor
+    return normalized_pts
+
+def denormalize_2d_bbox(bboxes, pc_range):
+
+    bboxes = bbox_cxcywh_to_xyxy(bboxes)
+    bboxes[..., 0::2] = (bboxes[..., 0::2]*(pc_range[3] -
+                            pc_range[0]) + pc_range[0])
+    bboxes[..., 1::2] = (bboxes[..., 1::2]*(pc_range[4] -
+                            pc_range[1]) + pc_range[1])
+
+    return bboxes
+def denormalize_2d_pts(pts, pc_range):
+    new_pts = pts.clone()
+    new_pts[...,0:1] = (pts[..., 0:1]*(pc_range[3] -
+                            pc_range[0]) + pc_range[0])
+    new_pts[...,1:2] = (pts[...,1:2]*(pc_range[4] -
+                            pc_range[1]) + pc_range[1])
+    return new_pts
+
+@BBOX_CODERS.register_module()
+class NMSFreeCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+        code_size (int): Code size of bboxes. Default: 9
+    """
+
+    def __init__(self,
+                 pc_range,
+                 voxel_size=None,
+                 post_center_range=None,
+                 max_num=100,
+                 score_threshold=None,
+                 num_classes=10):
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
+
+    def encode(self):
+
+        pass
+
+    def decode_single(self, cls_scores, bbox_preds):
+        """Decode bboxes.
+        Args:
+            cls_scores (Tensor): Outputs from the classification head, \
+                shape [num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            bbox_preds (Tensor): Outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        max_num = self.max_num
+
+        cls_scores = cls_scores.sigmoid()
+        scores, indexs = cls_scores.view(-1).topk(max_num)
+        labels = indexs % self.num_classes
+        bbox_index = indexs // self.num_classes
+        bbox_preds = bbox_preds[bbox_index]
+       
+        final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)   
+        final_scores = scores 
+        final_preds = labels 
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+            tmp_score = self.score_threshold
+            while thresh_mask.sum() == 0:
+                tmp_score *= 0.9
+                if tmp_score < 0.01:
+                    thresh_mask = final_scores > -1
+                    break
+                thresh_mask = final_scores >= tmp_score
+
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=scores.device)
+            mask = (final_box_preds[..., :3] >=
+                    self.post_center_range[:3]).all(1)
+            mask &= (final_box_preds[..., :3] <=
+                     self.post_center_range[3:]).all(1)
+
+            if self.score_threshold:
+                mask &= thresh_mask
+
+            boxes3d = final_box_preds[mask]
+            scores = final_scores[mask]
+
+            labels = final_preds[mask]
+            predictions_dict = {
+                'bboxes': boxes3d,
+                'scores': scores,
+                'labels': labels
+            }
+
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+        return predictions_dict
+
+    def decode(self, preds_dicts):
+        """Decode bboxes.
+        Args:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+        
+        batch_size = all_cls_scores.size()[0]
+        predictions_list = []
+        for i in range(batch_size):
+            predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
+        return predictions_list
+
+
+@BBOX_CODERS.register_module()
+class MapTRNMSFreeCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+        code_size (int): Code size of bboxes. Default: 9
+    """
+
+    def __init__(self,
+                 pc_range,
+                 voxel_size=None,
+                 post_center_range=None,
+                 max_num=100,
+                 score_threshold=None,
+                 num_classes=10):
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
+
+    def encode(self):
+
+        pass
+
+    def decode_single(self, cls_scores, bbox_preds, pts_preds):
+        """Decode bboxes.
+        Args:
+            cls_scores (Tensor): Outputs from the classification head, \
+                shape [num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            bbox_preds (Tensor): Outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [num_query, 9].
+            pts_preds (Tensor):
+                Shape [num_query, fixed_num_pts, 2]
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        max_num = self.max_num
+
+        cls_scores = cls_scores.sigmoid()
+        scores, indexs = cls_scores.view(-1).topk(max_num)
+        labels = indexs % self.num_classes
+        bbox_index = indexs // self.num_classes
+        bbox_preds = bbox_preds[bbox_index]
+        pts_preds = pts_preds[bbox_index]
+       
+        final_box_preds = denormalize_2d_bbox(bbox_preds, self.pc_range) 
+        final_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) #num_q,num_p,2
+        # final_box_preds = bbox_preds 
+        final_scores = scores 
+        final_preds = labels 
+
+        # use score threshold
+        if self.score_threshold is not None:
+            thresh_mask = final_scores > self.score_threshold
+            tmp_score = self.score_threshold
+            while thresh_mask.sum() == 0:
+                tmp_score *= 0.9
+                if tmp_score < 0.01:
+                    thresh_mask = final_scores > -1
+                    break
+                thresh_mask = final_scores >= tmp_score
+
+        if self.post_center_range is not None:
+            self.post_center_range = torch.tensor(
+                self.post_center_range, device=scores.device)
+            mask = (final_box_preds[..., :4] >=
+                    self.post_center_range[:4]).all(1)
+            mask &= (final_box_preds[..., :4] <=
+                     self.post_center_range[4:]).all(1)
+
+            if self.score_threshold:
+                mask &= thresh_mask
+
+            boxes3d = final_box_preds[mask]
+            scores = final_scores[mask]
+            pts = final_pts_preds[mask]
+            labels = final_preds[mask]
+            predictions_dict = {
+                'bboxes': boxes3d,
+                'scores': scores,
+                'labels': labels,
+                'pts': pts,
+            }
+
+        else:
+            raise NotImplementedError(
+                'Need to reorganize output as a batch, only '
+                'support post_center_range is not None for now!')
+        return predictions_dict
+
+    def decode(self, preds_dicts):
+        """Decode bboxes.
+        Args:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        Returns:
+            list[dict]: Decoded boxes.
+        """
+        all_cls_scores = preds_dicts['all_cls_scores'][-1]
+        all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
+        all_pts_preds = preds_dicts['all_pts_preds'][-1]
+        batch_size = all_cls_scores.size()[0]
+        predictions_list = []
+        for i in range(batch_size):
+            predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i],all_pts_preds[i]))
+        return predictions_list
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aac1a82a64f467a47e39d7e862357459e84abb84
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py
@@ -0,0 +1,4 @@
+from mmdet.core.bbox.match_costs import build_match_cost
+from .match_cost import BBox3DL1Cost
+
+__all__ = ['build_match_cost', 'BBox3DL1Cost']
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9678f3c7f666255540762d4064f0f7d82b920ed
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py
@@ -0,0 +1,27 @@
+import torch
+from mmdet.core.bbox.match_costs.builder import MATCH_COST
+
+
+@MATCH_COST.register_module()
+class BBox3DL1Cost(object):
+    """BBox3DL1Cost.
+     Args:
+         weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred, gt_bboxes):
+        """
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            gt_bboxes (Tensor): Ground truth boxes with normalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+        Returns:
+            torch.Tensor: bbox_cost value with weight
+        """
+        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+        return bbox_cost * self.weight
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/util.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c54bd750246f3d6e2249b7d39888fffa6227beda
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/bbox/util.py
@@ -0,0 +1,53 @@
+import torch 
+
+
+def normalize_bbox(bboxes, pc_range):
+
+    cx = bboxes[..., 0:1]
+    cy = bboxes[..., 1:2]
+    cz = bboxes[..., 2:3]
+    w = bboxes[..., 3:4].log()
+    l = bboxes[..., 4:5].log()
+    h = bboxes[..., 5:6].log()
+
+    rot = bboxes[..., 6:7]
+    if bboxes.size(-1) > 7:
+        vx = bboxes[..., 7:8] 
+        vy = bboxes[..., 8:9]
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1
+        )
+    else:
+        normalized_bboxes = torch.cat(
+            (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1
+        )
+    return normalized_bboxes
+
+def denormalize_bbox(normalized_bboxes, pc_range):
+    # rotation 
+    rot_sine = normalized_bboxes[..., 6:7]
+
+    rot_cosine = normalized_bboxes[..., 7:8]
+    rot = torch.atan2(rot_sine, rot_cosine)
+
+    # center in the bev
+    cx = normalized_bboxes[..., 0:1]
+    cy = normalized_bboxes[..., 1:2]
+    cz = normalized_bboxes[..., 4:5]
+   
+    # size
+    w = normalized_bboxes[..., 2:3]
+    l = normalized_bboxes[..., 3:4]
+    h = normalized_bboxes[..., 5:6]
+
+    w = w.exp() 
+    l = l.exp() 
+    h = h.exp() 
+    if normalized_bboxes.size(-1) > 8:
+         # velocity 
+        vx = normalized_bboxes[:, 8:9]
+        vy = normalized_bboxes[:, 9:10]
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1)
+    else:
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1)
+    return denormalized_bboxes
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d92421c7e84fdc7a33e94aa10fddfccb332d6399
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/__init__.py
@@ -0,0 +1 @@
+from .eval_hooks import CustomDistEvalHook
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..0477213bd3fe5874c7fe8c7c4fe2d861165e2d58
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py
@@ -0,0 +1,91 @@
+
+# Note: Considering that MMCV's EvalHook updated its interface in V1.3.16,
+# in order to avoid strong version dependency, we did not directly
+# inherit EvalHook but BaseDistEvalHook.
+
+import bisect
+import os.path as osp
+
+import mmcv
+import torch.distributed as dist
+from mmcv.runner import DistEvalHook as BaseDistEvalHook
+from mmcv.runner import EvalHook as BaseEvalHook
+from torch.nn.modules.batchnorm import _BatchNorm
+from mmdet.core.evaluation.eval_hooks import DistEvalHook
+
+
+def _calc_dynamic_intervals(start_interval, dynamic_interval_list):
+    assert mmcv.is_list_of(dynamic_interval_list, tuple)
+
+    dynamic_milestones = [0]
+    dynamic_milestones.extend(
+        [dynamic_interval[0] for dynamic_interval in dynamic_interval_list])
+    dynamic_intervals = [start_interval]
+    dynamic_intervals.extend(
+        [dynamic_interval[1] for dynamic_interval in dynamic_interval_list])
+    return dynamic_milestones, dynamic_intervals
+
+
+class CustomDistEvalHook(BaseDistEvalHook):
+
+    def __init__(self, *args, dynamic_intervals=None,  **kwargs):
+        super(CustomDistEvalHook, self).__init__(*args, **kwargs)
+        self.use_dynamic_intervals = dynamic_intervals is not None
+        if self.use_dynamic_intervals:
+            self.dynamic_milestones, self.dynamic_intervals = \
+                _calc_dynamic_intervals(self.interval, dynamic_intervals)
+
+    def _decide_interval(self, runner):
+        if self.use_dynamic_intervals:
+            progress = runner.epoch if self.by_epoch else runner.iter
+            step = bisect.bisect(self.dynamic_milestones, (progress + 1))
+            # Dynamically modify the evaluation interval
+            self.interval = self.dynamic_intervals[step - 1]
+
+    def before_train_epoch(self, runner):
+        """Evaluate the model only at the start of training by epoch."""
+        self._decide_interval(runner)
+        super().before_train_epoch(runner)
+
+    def before_train_iter(self, runner):
+        self._decide_interval(runner)
+        super().before_train_iter(runner)
+
+    def _do_evaluate(self, runner):
+        """perform evaluation and save ckpt."""
+        # Synchronization of BatchNorm's buffer (running_mean
+        # and running_var) is not supported in the DDP of pytorch,
+        # which may cause the inconsistent performance of models in
+        # different ranks, so we broadcast BatchNorm's buffers
+        # of rank 0 to other ranks to avoid this.
+        if self.broadcast_bn_buffer:
+            model = runner.model
+            for name, module in model.named_modules():
+                if isinstance(module,
+                              _BatchNorm) and module.track_running_stats:
+                    dist.broadcast(module.running_var, 0)
+                    dist.broadcast(module.running_mean, 0)
+
+        if not self._should_evaluate(runner):
+            return
+
+        tmpdir = self.tmpdir
+        if tmpdir is None:
+            tmpdir = osp.join(runner.work_dir, '.eval_hook')
+
+        from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test # to solve circlur  import
+
+        results = custom_multi_gpu_test(
+            runner.model,
+            self.dataloader,
+            tmpdir=tmpdir,
+            gpu_collect=self.gpu_collect)
+        if runner.rank == 0:
+            print('\n')
+            runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+
+            key_score = self.evaluate(runner, results)
+
+            if self.save_best:
+                self._save_ckpt(runner, key_score)
+  
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py b/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py
new file mode 100644
index 0000000000000000000000000000000000000000..f816974544b57c1561a1fc09b9cf9e48dde03e38
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py
@@ -0,0 +1,251 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Adapted from `Waymo to KITTI converter
+    <https://github.com/caizhongang/waymo_kitti_converter>`_.
+"""
+
+try:
+    from waymo_open_dataset import dataset_pb2 as open_dataset
+    import mmcv
+    import numpy as np
+    import tensorflow as tf
+    from glob import glob
+    from os.path import join
+    from waymo_open_dataset import label_pb2
+    from waymo_open_dataset.protos import metrics_pb2
+except ImportError:
+    #pass
+    raise ImportError(
+        'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" '
+        'to install the official devkit first.')
+
+
+
+
+class KITTI2Waymo(object):
+    """KITTI predictions to Waymo converter.
+    This class serves as the converter to change predictions from KITTI to
+    Waymo format.
+    Args:
+        kitti_result_files (list[dict]): Predictions in KITTI format.
+        waymo_tfrecords_dir (str): Directory to load waymo raw data.
+        waymo_results_save_dir (str): Directory to save converted predictions
+            in waymo format (.bin files).
+        waymo_results_final_path (str): Path to save combined
+            predictions in waymo format (.bin file), like 'a/b/c.bin'.
+        prefix (str): Prefix of filename. In general, 0 for training, 1 for
+            validation and 2 for testing.
+        workers (str): Number of parallel processes.
+    """
+
+    def __init__(self,
+                 kitti_result_files,
+                 waymo_tfrecords_dir,
+                 waymo_results_save_dir,
+                 waymo_results_final_path,
+                 prefix,
+                 workers=64):
+
+        self.kitti_result_files = kitti_result_files
+        self.waymo_tfrecords_dir = waymo_tfrecords_dir
+        self.waymo_results_save_dir = waymo_results_save_dir
+        self.waymo_results_final_path = waymo_results_final_path
+        self.prefix = prefix
+        self.workers = int(workers)
+        self.name2idx = {}
+        for idx, result in enumerate(kitti_result_files):
+            if len(result['sample_idx']) > 0:
+                self.name2idx[str(result['sample_idx'][0])] = idx
+
+        # turn on eager execution for older tensorflow versions
+        if int(tf.__version__.split('.')[0]) < 2:
+            tf.enable_eager_execution()
+
+        self.k2w_cls_map = {
+            'Car': label_pb2.Label.TYPE_VEHICLE,
+            'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN,
+            'Sign': label_pb2.Label.TYPE_SIGN,
+            'Cyclist': label_pb2.Label.TYPE_CYCLIST,
+        }
+
+        self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0],
+                                            [-1.0, 0.0, 0.0, 0.0],
+                                            [0.0, -1.0, 0.0, 0.0],
+                                            [0.0, 0.0, 0.0, 1.0]])
+
+        self.get_file_names()
+        self.create_folder()
+
+    def get_file_names(self):
+        """Get file names of waymo raw data."""
+        self.waymo_tfrecord_pathnames = sorted(
+            glob(join(self.waymo_tfrecords_dir, '*.tfrecord')))
+        print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.')
+
+    def create_folder(self):
+        """Create folder for data conversion."""
+        mmcv.mkdir_or_exist(self.waymo_results_save_dir)
+
+    def parse_objects(self, kitti_result, T_k2w, context_name,
+                      frame_timestamp_micros):
+        """Parse one prediction with several instances in kitti format and
+        convert them to `Object` proto.
+        Args:
+            kitti_result (dict): Predictions in kitti format.
+                - name (np.ndarray): Class labels of predictions.
+                - dimensions (np.ndarray): Height, width, length of boxes.
+                - location (np.ndarray): Bottom center of boxes (x, y, z).
+                - rotation_y (np.ndarray): Orientation of boxes.
+                - score (np.ndarray): Scores of predictions.
+            T_k2w (np.ndarray): Transformation matrix from kitti to waymo.
+            context_name (str): Context name of the frame.
+            frame_timestamp_micros (int): Frame timestamp.
+        Returns:
+            :obj:`Object`: Predictions in waymo dataset Object proto.
+        """
+
+        def parse_one_object(instance_idx):
+            """Parse one instance in kitti format and convert them to `Object`
+            proto.
+            Args:
+                instance_idx (int): Index of the instance to be converted.
+            Returns:
+                :obj:`Object`: Predicted instance in waymo dataset \
+                    Object proto.
+            """
+            cls = kitti_result['name'][instance_idx]
+            length = round(kitti_result['dimensions'][instance_idx, 0], 4)
+            height = round(kitti_result['dimensions'][instance_idx, 1], 4)
+            width = round(kitti_result['dimensions'][instance_idx, 2], 4)
+            x = round(kitti_result['location'][instance_idx, 0], 4)
+            y = round(kitti_result['location'][instance_idx, 1], 4)
+            z = round(kitti_result['location'][instance_idx, 2], 4)
+            rotation_y = round(kitti_result['rotation_y'][instance_idx], 4)
+            score = round(kitti_result['score'][instance_idx], 4)
+
+            # y: downwards; move box origin from bottom center (kitti) to
+            # true center (waymo)
+            y -= height / 2
+            # frame transformation: kitti -> waymo
+            x, y, z = self.transform(T_k2w, x, y, z)
+
+            # different conventions
+            heading = -(rotation_y + np.pi / 2)
+            while heading < -np.pi:
+                heading += 2 * np.pi
+            while heading > np.pi:
+                heading -= 2 * np.pi
+
+            box = label_pb2.Label.Box()
+            box.center_x = x
+            box.center_y = y
+            box.center_z = z
+            box.length = length
+            box.width = width
+            box.height = height
+            box.heading = heading
+
+            o = metrics_pb2.Object()
+            o.object.box.CopyFrom(box)
+            o.object.type = self.k2w_cls_map[cls]
+            o.score = score
+
+            o.context_name = context_name
+            o.frame_timestamp_micros = frame_timestamp_micros
+
+            return o
+
+        objects = metrics_pb2.Objects()
+
+        for instance_idx in range(len(kitti_result['name'])):
+            o = parse_one_object(instance_idx)
+            objects.objects.append(o)
+
+        return objects
+
+    def convert_one(self, file_idx):
+        """Convert action for single file.
+        Args:
+            file_idx (int): Index of the file to be converted.
+        """
+        file_pathname = self.waymo_tfrecord_pathnames[file_idx]
+        file_data = tf.data.TFRecordDataset(file_pathname, compression_type='')
+
+        for frame_num, frame_data in enumerate(file_data):
+            frame = open_dataset.Frame()
+            frame.ParseFromString(bytearray(frame_data.numpy()))
+            filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}'
+
+            for camera in frame.context.camera_calibrations:
+                # FRONT = 1, see dataset.proto for details
+                if camera.name == 1:
+                    T_front_cam_to_vehicle = np.array(
+                        camera.extrinsic.transform).reshape(4, 4)
+
+            T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam
+
+            context_name = frame.context.name
+            frame_timestamp_micros = frame.timestamp_micros
+
+            if filename in self.name2idx:
+                kitti_result = \
+                    self.kitti_result_files[self.name2idx[filename]]
+                objects = self.parse_objects(kitti_result, T_k2w, context_name,
+                                             frame_timestamp_micros)
+            else:
+                print(filename, 'not found.(bevformer)')
+                objects = metrics_pb2.Objects()
+
+            with open(
+                    join(self.waymo_results_save_dir, f'{filename}.bin'),
+                    'wb') as f:
+                f.write(objects.SerializeToString())
+
+    def convert(self):
+        """Convert action."""
+        print('Start converting ...')
+        mmcv.track_parallel_progress(self.convert_one, range(len(self)),
+                                     self.workers)
+        print('\nFinished ...')
+
+        # combine all files into one .bin
+        pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin')))
+        combined = self.combine(pathnames)
+
+        with open(self.waymo_results_final_path, 'wb') as f:
+            f.write(combined.SerializeToString())
+
+    def __len__(self):
+        """Length of the filename list."""
+        return len(self.waymo_tfrecord_pathnames)
+
+    def transform(self, T, x, y, z):
+        """Transform the coordinates with matrix T.
+        Args:
+            T (np.ndarray): Transformation matrix.
+            x(float): Coordinate in x axis.
+            y(float): Coordinate in y axis.
+            z(float): Coordinate in z axis.
+        Returns:
+            list: Coordinates after transformation.
+        """
+        pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1)
+        pt_aft = np.matmul(T, pt_bef)
+        return pt_aft[:3].flatten().tolist()
+
+    def combine(self, pathnames):
+        """Combine predictions in waymo format for each sample together.
+        Args:
+            pathnames (str): Paths to save predictions.
+        Returns:
+            :obj:`Objects`: Combined predictions in Objects proto.
+        """
+        combined = metrics_pb2.Objects()
+
+        for pathname in pathnames:
+            objects = metrics_pb2.Objects()
+            with open(pathname, 'rb') as f:
+                objects.ParseFromString(f.read())
+            for o in objects.objects:
+                combined.objects.append(o)
+
+        return combined
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..df87ab875149e973f14cd2c1fad764f23327d3be
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/__init__.py
@@ -0,0 +1,8 @@
+from .nuscenes_dataset import CustomNuScenesDataset
+from .builder import custom_build_dataset
+
+from .nuscenes_map_dataset import CustomNuScenesLocalMapDataset
+from .av2_map_dataset import CustomAV2LocalMapDataset
+__all__ = [
+    'CustomNuScenesDataset','CustomNuScenesLocalMapDataset'
+]
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/av2_map_dataset.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/av2_map_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd5339c602ad547bfed178ce48ed8e73dd55824b
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/av2_map_dataset.py
@@ -0,0 +1,1525 @@
+import copy
+
+import numpy as np
+from mmdet.datasets import DATASETS
+from mmdet3d.datasets import NuScenesDataset
+import mmcv
+import os
+from os import path as osp
+from mmdet.datasets import DATASETS
+import torch
+import numpy as np
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from .nuscnes_eval import NuScenesEval_custom
+from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from mmcv.parallel import DataContainer as DC
+import random
+
+from .nuscenes_dataset import CustomNuScenesDataset
+from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from shapely import affinity, ops
+from shapely.geometry import Polygon, LineString, box, MultiPolygon, MultiLineString
+from mmdet.datasets.pipelines import to_tensor
+import json
+
+from pathlib import Path
+from av2.datasets.sensor.av2_sensor_dataloader import AV2SensorDataLoader
+from av2.map.lane_segment import LaneMarkType, LaneSegment
+from av2.map.map_api import ArgoverseStaticMap
+from av2.geometry.se3 import SE3
+import av2.geometry.interpolate as interp_utils
+
+
+class LiDARInstanceLines(object):
+    """Line instance in LIDAR coordinates
+
+    """
+    def __init__(self, 
+                 instance_line_list, 
+                 sample_dist=1,
+                 num_samples=250,
+                 padding=False,
+                 fixed_num=-1,
+                 padding_value=-10000,
+                 patch_size=None):
+        assert isinstance(instance_line_list, list)
+        assert patch_size is not None
+        if len(instance_line_list) != 0:
+            assert isinstance(instance_line_list[0], LineString)
+        self.patch_size = patch_size
+        self.max_x = self.patch_size[1] / 2
+        self.max_y = self.patch_size[0] / 2
+        self.sample_dist = sample_dist
+        self.num_samples = num_samples
+        self.padding = padding
+        self.fixed_num = fixed_num
+        self.padding_value = padding_value
+
+        self.instance_list = instance_line_list
+
+    @property
+    def start_end_points(self):
+        """
+        return torch.Tensor([N,4]), in xstart, ystart, xend, yend form
+        """
+        assert len(self.instance_list) != 0
+        instance_se_points_list = []
+        for instance in self.instance_list:
+            se_points = []
+            se_points.extend(instance.coords[0])
+            se_points.extend(instance.coords[-1])
+            instance_se_points_list.append(se_points)
+        instance_se_points_array = np.array(instance_se_points_list)
+        instance_se_points_tensor = to_tensor(instance_se_points_array)
+        instance_se_points_tensor = instance_se_points_tensor.to(
+                                dtype=torch.float32)
+        instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x)
+        instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y)
+        instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x)
+        instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y)
+        return instance_se_points_tensor
+
+    @property
+    def bbox(self):
+        """
+        return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form
+        """
+        assert len(self.instance_list) != 0
+        instance_bbox_list = []
+        for instance in self.instance_list:
+            # bounds is bbox: [xmin, ymin, xmax, ymax]
+            instance_bbox_list.append(instance.bounds)
+        instance_bbox_array = np.array(instance_bbox_list)
+        instance_bbox_tensor = to_tensor(instance_bbox_array)
+        instance_bbox_tensor = instance_bbox_tensor.to(
+                            dtype=torch.float32)
+        instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x)
+        instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y)
+        instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x)
+        instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y)
+        return instance_bbox_tensor
+
+    @property
+    def fixed_num_sampled_points(self):
+        """
+        return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form
+            N means the num of instances
+        """
+        assert len(self.instance_list) != 0
+        instance_points_list = []
+        for instance in self.instance_list:
+            # instance_array = np.array(list(instance.coords))
+            # interpolated_instance = interp_utils.interp_arc(t=self.fixed_num, points=instance_array)
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances])
+            if instance.has_z:
+                sampled_points = sampled_points.reshape(-1,3)
+            else:
+                sampled_points = sampled_points.reshape(-1,2)
+            # import pdb;pdb.set_trace()
+            instance_points_list.append(sampled_points)
+        instance_points_array = np.array(instance_points_list)
+        instance_points_tensor = to_tensor(instance_points_array)
+        instance_points_tensor = instance_points_tensor.to(
+                            dtype=torch.float32)
+
+        instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+        instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+        return instance_points_tensor
+
+    @property
+    def fixed_num_sampled_points_ambiguity(self):
+        """
+        return torch.Tensor([N,fixed_num,3]), in xmin, ymin, xmax, ymax form
+            N means the num of instances
+        """
+        assert len(self.instance_list) != 0
+        instance_points_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            if instance.has_z:
+                sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 3)
+            else:
+                sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+            instance_points_list.append(sampled_points)
+        instance_points_array = np.array(instance_points_list)
+        instance_points_tensor = to_tensor(instance_points_array)
+        instance_points_tensor = instance_points_tensor.to(
+                            dtype=torch.float32)
+
+        instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+        instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+        instance_points_tensor = instance_points_tensor if is_3d else instance_points_tensor[:,:,:2]
+        instance_points_tensor = instance_points_tensor.unsqueeze(1)
+        return instance_points_tensor
+
+    @property
+    def fixed_num_sampled_points_torch(self):
+        """
+        return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form
+            N means the num of instances
+        """
+        assert len(self.instance_list) != 0
+        instance_points_list = []
+        for instance in self.instance_list:
+            # distances = np.linspace(0, instance.length, self.fixed_num)
+            # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+            poly_pts = to_tensor(np.array(list(instance.coords)))
+            poly_pts = poly_pts.unsqueeze(0).permute(0,2,1)
+            sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True)
+            sampled_pts = sampled_pts.permute(0,2,1).squeeze(0)
+            instance_points_list.append(sampled_pts)
+        # instance_points_array = np.array(instance_points_list)
+        # instance_points_tensor = to_tensor(instance_points_array)
+        instance_points_tensor = torch.stack(instance_points_list,dim=0)
+        instance_points_tensor = instance_points_tensor.to(
+                            dtype=torch.float32)
+        instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+        instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+        return instance_points_tensor
+
+    @property
+    def shift_fixed_num_sampled_points(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points
+        instances_list = []
+        is_poly = False
+        # is_line = False
+        # import pdb;pdb.set_trace()
+        for fixed_num_pts in fixed_num_sampled_points:
+            # [fixed_num, 2]
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            fixed_num = fixed_num_pts.shape[0]
+            shift_pts_list = []
+            if is_poly:
+                # import pdb;pdb.set_trace()
+                for shift_right_i in range(fixed_num):
+                    shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,shift_pts.shape[-1]], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+                # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v1(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points
+        instances_list = []
+        is_poly = False
+        # is_line = False
+        # import pdb;pdb.set_trace()
+        for fixed_num_pts in fixed_num_sampled_points:
+            # [fixed_num, 2]
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            pts_num = fixed_num_pts.shape[0]
+            shift_num = pts_num - 1
+            if is_poly:
+                pts_to_shift = fixed_num_pts[:-1,:]
+            shift_pts_list = []
+            if is_poly:
+                for shift_right_i in range(shift_num):
+                    shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            if is_poly:
+                _, _, num_coords = shift_pts.shape
+                tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords))
+                tmp_shift_pts[:,:-1,:] = shift_pts
+                tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+                shift_pts = tmp_shift_pts
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([shift_num-shift_pts.shape[0],pts_num,shift_pts.shape[-1]], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+                # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v2(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        assert len(self.instance_list) != 0
+        instances_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            poly_pts = np.array(list(instance.coords))
+            start_pts = poly_pts[0]
+            end_pts = poly_pts[-1]
+            is_poly = np.equal(start_pts, end_pts)
+            is_poly = is_poly.all()
+            shift_pts_list = []
+            pts_num, coords_num = poly_pts.shape
+            shift_num = pts_num - 1
+            final_shift_num = self.fixed_num - 1
+            if is_poly:
+                pts_to_shift = poly_pts[:-1,:]
+                for shift_right_i in range(shift_num):
+                    shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+                    pts_to_concat = shift_pts[0]
+                    pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+                    shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+                    shift_instance = LineString(shift_pts)
+                    shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, coords_num)
+                    shift_pts_list.append(shift_sampled_points)
+                # import pdb;pdb.set_trace()
+            else:
+                sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, coords_num)
+                flip_sampled_points = np.flip(sampled_points, axis=0)
+                shift_pts_list.append(sampled_points)
+                shift_pts_list.append(flip_sampled_points)
+            
+            multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+            shifts_num,_,_ = multi_shifts_pts.shape
+
+            if shifts_num > final_shift_num:
+                index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False)
+                multi_shifts_pts = multi_shifts_pts[index]
+            
+            multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+            multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+                            dtype=torch.float32)
+            
+            multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+            multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+            # if not is_poly:
+            if multi_shifts_pts_tensor.shape[0] < final_shift_num:
+                padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,multi_shifts_pts_tensor.shape[-1]], self.padding_value)
+                multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+            instances_list.append(multi_shifts_pts_tensor)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v3(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        assert len(self.instance_list) != 0
+        instances_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            poly_pts = np.array(list(instance.coords))
+            start_pts = poly_pts[0]
+            end_pts = poly_pts[-1]
+            is_poly = np.equal(start_pts, end_pts)
+            is_poly = is_poly.all()
+            shift_pts_list = []
+            pts_num, coords_num = poly_pts.shape
+            shift_num = pts_num - 1
+            final_shift_num = self.fixed_num - 1
+            if is_poly:
+                pts_to_shift = poly_pts[:-1,:]
+                for shift_right_i in range(shift_num):
+                    shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+                    pts_to_concat = shift_pts[0]
+                    pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+                    shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+                    shift_instance = LineString(shift_pts)
+                    shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, coords_num)
+                    shift_pts_list.append(shift_sampled_points)
+                flip_pts_to_shift = np.flip(pts_to_shift, axis=0)
+                for shift_right_i in range(shift_num):
+                    shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0)
+                    pts_to_concat = shift_pts[0]
+                    pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+                    shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+                    shift_instance = LineString(shift_pts)
+                    shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, coords_num)
+                    shift_pts_list.append(shift_sampled_points)
+                # import pdb;pdb.set_trace()
+            else:
+                sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, coords_num)
+                flip_sampled_points = np.flip(sampled_points, axis=0)
+                shift_pts_list.append(sampled_points)
+                shift_pts_list.append(flip_sampled_points)
+            
+            multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+            shifts_num,_,_ = multi_shifts_pts.shape
+            # import pdb;pdb.set_trace()
+            if shifts_num > 2*final_shift_num:
+                index = np.random.choice(shift_num, final_shift_num, replace=False)
+                flip0_shifts_pts = multi_shifts_pts[index]
+                flip1_shifts_pts = multi_shifts_pts[index+shift_num]
+                multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0)
+            
+            multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+            multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+                            dtype=torch.float32)
+            
+            multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+            multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+            # if not is_poly:
+            if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num:
+                padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,multi_shifts_pts_tensor.shape[-1]], self.padding_value)
+                multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+            instances_list.append(multi_shifts_pts_tensor)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v4(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points
+        instances_list = []
+        is_poly = False
+        # is_line = False
+        # import pdb;pdb.set_trace()
+        for fixed_num_pts in fixed_num_sampled_points:
+            # [fixed_num, 2]
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            pts_num = fixed_num_pts.shape[0]
+            shift_num = pts_num - 1
+            shift_pts_list = []
+            if is_poly:
+                pts_to_shift = fixed_num_pts[:-1,:]
+                for shift_right_i in range(shift_num):
+                    shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+                flip_pts_to_shift = pts_to_shift.flip(0)
+                for shift_right_i in range(shift_num):
+                    shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            if is_poly:
+                _, _, num_coords = shift_pts.shape
+                tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords))
+                tmp_shift_pts[:,:-1,:] = shift_pts
+                tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+                shift_pts = tmp_shift_pts
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,shift_pts.shape[-1]], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+                # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_torch(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points_torch
+        instances_list = []
+        is_poly = False
+        # is_line = False
+        # import pdb;pdb.set_trace()
+        for fixed_num_pts in fixed_num_sampled_points:
+            # [fixed_num, 2]
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            fixed_num = fixed_num_pts.shape[0]
+            shift_pts_list = []
+            if is_poly:
+                # import pdb;pdb.set_trace()
+                for shift_right_i in range(fixed_num):
+                    shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,shift_pts.shape[-1]], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+                # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    # @property
+    # def polyline_points(self):
+    #     """
+    #     return [[x0,y0],[x1,y1],...]
+    #     """
+    #     assert len(self.instance_list) != 0
+    #     for instance in self.instance_list:
+
+
+class VectorizedAV2LocalMap(object):
+    CLASS2LABEL = {
+        'road_divider': 0,
+        'lane_divider': 0,
+        'divider': 0,
+        'ped_crossing': 1,
+        'boundary': 2,
+        'others': -1
+    }
+    def __init__(self,
+                 dataroot,
+                 patch_size,
+                 test_mode=False,
+                 map_classes=('divider','ped_crossing','boundary'),
+                 line_classes=('road_divider', 'lane_divider'),
+                 ped_crossing_classes=('ped_crossing'),
+                 contour_classes=('road_segment', 'lane'),
+                 sample_dist=1,
+                 num_samples=250,
+                 padding=False,
+                 fixed_ptsnum_per_line=-1,
+                 padding_value=-10000,):
+        '''
+        Args:
+            fixed_ptsnum_per_line = -1 : no fixed num
+        '''
+        super().__init__()
+        # self.data_root = dataroot
+        self.test_mode = test_mode
+        if self.test_mode:
+            self.data_root = osp.join(dataroot, "val")
+        else:
+            self.data_root = osp.join(dataroot, "train")
+        
+        self.loader = AV2SensorDataLoader(data_dir=Path(dataroot), labels_dir=Path(dataroot))
+
+
+
+        self.vec_classes = map_classes
+        self.line_classes = line_classes
+        self.ped_crossing_classes = ped_crossing_classes
+        self.polygon_classes = contour_classes
+
+
+        self.patch_size = patch_size
+        self.sample_dist = sample_dist
+        self.num_samples = num_samples
+        self.padding = padding
+        self.fixed_num = fixed_ptsnum_per_line
+        self.padding_value = padding_value
+
+    def gen_vectorized_samples(self, location, map_elements, lidar2global_translation, lidar2global_rotation):
+        '''
+        use lidar2global to get gt map layers
+        av2 lidar2global the same as ego2global
+        location the same as log_id
+        '''
+        # avm = ArgoverseStaticMap.from_map_dir(log_map_dirpath, build_raster=False)
+
+        map_pose = lidar2global_translation[:2]
+        rotation = Quaternion._from_matrix(lidar2global_rotation)
+
+        patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1])
+        patch_angle = quaternion_yaw(rotation) / np.pi * 180
+        # import pdb;pdb.set_trace()
+        vectors = []
+        city_SE2_ego = SE3(lidar2global_rotation, lidar2global_translation)
+        ego_SE3_city = city_SE2_ego.inverse()
+        for vec_class in self.vec_classes:
+            if vec_class == 'divider':
+                line_geom = self.get_map_divider_geom(patch_box, patch_angle, map_elements[vec_class], ego_SE3_city)
+                line_instances_list = self.line_geoms_to_instances(line_geom)     
+                for divider in line_instances_list:
+                    vectors.append((divider, self.CLASS2LABEL.get('divider', -1)))
+            elif vec_class == 'ped_crossing':
+                ped_geom = self.get_map_ped_geom(patch_box, patch_angle, map_elements[vec_class], ego_SE3_city)
+                ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom)
+                for instance in ped_instance_list:
+                    vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1)))
+            elif vec_class == 'boundary':
+                polygon_geom = self.get_map_boundary_geom(patch_box, patch_angle, map_elements[vec_class], ego_SE3_city)
+                poly_bound_list = self.bound_poly_geoms_to_instances(polygon_geom)
+                for bound in poly_bound_list:
+                    vectors.append((bound, self.CLASS2LABEL.get('boundary', -1)))
+            else:
+                raise ValueError(f'WRONG vec_class: {vec_class}')
+
+        # filter out -1
+        filtered_vectors = []
+        gt_pts_loc_3d = []
+        gt_pts_num_3d = []
+        gt_labels = []
+        gt_instance = []
+        for instance, type in vectors:
+            if type != -1:
+                gt_instance.append(instance)
+                gt_labels.append(type)
+        # import pdb;pdb.set_trace()
+        gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist,
+                        self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size)
+
+        anns_results = dict(
+            gt_vecs_pts_loc=gt_instance,
+            gt_vecs_label=gt_labels,
+
+        )
+        # import pdb;pdb.set_trace()
+        return anns_results
+    def proc_polygon(self, polygon, ego_SE3_city):
+        # import pdb;pdb.set_trace()
+        interiors = []
+        exterior_cityframe = np.array(list(polygon.exterior.coords))
+        exterior_egoframe = ego_SE3_city.transform_point_cloud(exterior_cityframe)
+        for inter in polygon.interiors:
+            inter_cityframe = np.array(list(inter.coords))
+            inter_egoframe = ego_SE3_city.transform_point_cloud(inter_cityframe)
+            interiors.append(inter_egoframe[:,:2])
+
+        new_polygon = Polygon(exterior_egoframe[:,:2], interiors)
+        return new_polygon
+
+    def get_map_boundary_geom(self, patch_box, patch_angle, avm, ego_SE3_city):
+        map_boundary_geom = []
+        patch_x = patch_box[0]
+        patch_y = patch_box[1]
+        patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle)
+        # import pdb;pdb.set_trace()
+        polygon_list = []
+        for da in avm:
+            exterior_coords = da
+            # import pdb;pdb.set_trace()
+            interiors = []
+            # import pdb;pdb.set_trace()
+            is_polygon =  np.array_equal(exterior_coords[0],exterior_coords[-1])
+            if is_polygon:
+                polygon = Polygon(exterior_coords, interiors)
+            else:
+                polygon = LineString(exterior_coords)
+                raise ValueError(f'WRONG type: line in boundary')
+            if is_polygon:
+                if polygon.is_valid:
+                    new_polygon = polygon.intersection(patch)
+                    if not new_polygon.is_empty:
+                        # import pdb;pdb.set_trace()
+                        if new_polygon.geom_type == 'Polygon':
+                            if not new_polygon.is_valid:
+                                continue
+                            new_polygon = self.proc_polygon(new_polygon,ego_SE3_city)
+                            if not new_polygon.is_valid:
+                                continue
+                        elif new_polygon.geom_type == 'MultiPolygon':
+                            polygons = []
+                            for single_polygon in new_polygon.geoms:
+                                if not single_polygon.is_valid or single_polygon.is_empty:
+                                    continue
+                                new_single_polygon = self.proc_polygon(single_polygon,ego_SE3_city)
+                                if not new_single_polygon.is_valid:
+                                    continue
+                                polygons.append(new_single_polygon)
+                            if len(polygons) == 0:
+                                continue
+                            new_polygon = MultiPolygon(polygons)
+                            if not new_polygon.is_valid:
+                                continue
+                        else:
+                            raise ValueError('{} is not valid'.format(new_polygon.geom_type))
+                        if new_polygon.geom_type == 'Polygon':
+                            new_polygon = MultiPolygon([new_polygon])
+                        polygon_list.append(new_polygon)
+            else:
+                raise ValueError(f'WRONG type: line in boundary')
+        map_boundary_geom.append(('boundary',polygon_list))
+        return map_boundary_geom
+
+    def get_map_ped_geom(self, patch_box, patch_angle, avm, ego_SE3_city):
+        map_ped_geom = []
+        patch_x = patch_box[0]
+        patch_y = patch_box[1]
+        patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle)
+        # import pdb;pdb.set_trace()
+        polygon_list = []
+        for pc in avm:
+            exterior_coords = pc
+            interiors = []
+            polygon = Polygon(exterior_coords, interiors)
+            if polygon.is_valid:
+                new_polygon = polygon.intersection(patch)
+                if not new_polygon.is_empty:
+                    if new_polygon.geom_type == 'Polygon':
+                        if not new_polygon.is_valid:
+                            continue
+                        new_polygon = self.proc_polygon(new_polygon,ego_SE3_city)
+                        if not new_polygon.is_valid:
+                            continue
+                    elif new_polygon.geom_type == 'MultiPolygon':
+                        polygons = []
+                        for single_polygon in new_polygon.geoms:
+                            if not single_polygon.is_valid or single_polygon.is_empty:
+                                continue
+                            new_single_polygon = self.proc_polygon(single_polygon,ego_SE3_city)
+                            if not new_single_polygon.is_valid:
+                                continue
+                            polygons.append(new_single_polygon)
+                        if len(polygons) == 0:
+                            continue
+                        new_polygon = MultiPolygon(polygons)
+                        if not new_polygon.is_valid:
+                            continue
+                    else:
+                        raise ValueError('{} is not valid'.format(new_polygon.geom_type))
+
+                    if new_polygon.geom_type == 'Polygon':
+                        new_polygon = MultiPolygon([new_polygon])
+                    polygon_list.append(new_polygon)
+        map_ped_geom.append(('ped_crossing',polygon_list))
+        return map_ped_geom
+
+    def proc_line(self, line,ego_SE3_city):
+        # import pdb;pdb.set_trace()
+        new_line_pts_cityframe = np.array(list(line.coords))
+        new_line_pts_egoframe = ego_SE3_city.transform_point_cloud(new_line_pts_cityframe)
+        line = LineString(new_line_pts_egoframe[:,:2]) #TODO
+        return line
+
+    def get_map_divider_geom(self, patch_box, patch_angle, avm, ego_SE3_city):
+        map_divider_geom = []
+        patch_x = patch_box[0]
+        patch_y = patch_box[1]
+        patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle)
+        line_list = []
+        for ls in avm:
+            line = LineString(ls)
+            if line.is_empty:  # Skip lines without nodes.
+                continue
+            new_line = line.intersection(patch)
+            if not new_line.is_empty:
+                # import pdb;pdb.set_trace()
+                if new_line.geom_type == 'MultiLineString':
+                    for single_line in new_line.geoms:
+                        if single_line.is_empty:
+                            continue
+
+                        single_line = self.proc_line(single_line,ego_SE3_city)
+                        line_list.append(single_line)
+                else:
+                    new_line = self.proc_line(new_line, ego_SE3_city)
+                    line_list.append(new_line)
+        map_divider_geom.append(('divider',line_list))
+        return map_divider_geom
+
+
+    def _one_type_line_geom_to_instances(self, line_geom):
+        line_instances = []
+        
+        for line in line_geom:
+            if not line.is_empty:
+                if line.geom_type == 'MultiLineString':
+                    for single_line in line.geoms:
+                        line_instances.append(single_line)
+                elif line.geom_type == 'LineString':
+                    line_instances.append(line)
+                else:
+                    raise NotImplementedError
+        return line_instances
+
+
+    def ped_poly_geoms_to_instances(self, ped_geom):
+        ped = ped_geom[0][1]
+        # union_segments = ops.unary_union(ped)
+        # union_segments = MultiPolygon(ped)
+        max_x = self.patch_size[1] / 2
+        max_y = self.patch_size[0] / 2
+        # local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+        local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2)
+        exteriors = []
+        interiors = []
+        # if union_segments.geom_type != 'MultiPolygon':
+        #     union_segments = MultiPolygon([union_segments])
+        for segments in ped:
+            if segments.geom_type != 'MultiPolygon':
+                segments = MultiPolygon([segments])
+            for poly in segments.geoms:
+                exteriors.append(poly.exterior)
+                for inter in poly.interiors:
+                    interiors.append(inter)
+
+        results = []
+        for ext in exteriors:
+            if ext.is_ccw:
+                ext.coords = list(ext.coords)[::-1]
+            lines = ext.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        for inter in interiors:
+            if not inter.is_ccw:
+                inter.coords = list(inter.coords)[::-1]
+            lines = inter.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        return self._one_type_line_geom_to_instances(results)
+
+    def bound_poly_geoms_to_instances(self, polygon_geom):
+        # roads = polygon_geom[0][1]
+        # lanes = polygon_geom[1][1]
+        # union_roads = ops.unary_union(roads)
+        # union_lanes = ops.unary_union(lanes)
+        # union_segments = ops.unary_union([union_roads, union_lanes])
+        # import pdb;pdb.set_trace()
+        bounds = polygon_geom[0][1]
+
+        union_segments = ops.unary_union(bounds)
+        max_x = self.patch_size[1] / 2
+        max_y = self.patch_size[0] / 2
+        local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+        exteriors = []
+        interiors = []
+        if union_segments.geom_type != 'MultiPolygon':
+            union_segments = MultiPolygon([union_segments])
+        for poly in union_segments.geoms:
+            exteriors.append(poly.exterior)
+            for inter in poly.interiors:
+                interiors.append(inter)
+
+        results = []
+        for ext in exteriors:
+            if ext.is_ccw:
+                ext.coords = list(ext.coords)[::-1]
+            lines = ext.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        for inter in interiors:
+            if not inter.is_ccw:
+                inter.coords = list(inter.coords)[::-1]
+            lines = inter.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        return self._one_type_line_geom_to_instances(results)
+
+    def line_geoms_to_instances(self, line_geom):
+        lines = line_geom[0][1]
+        multiline = MultiLineString(lines)
+        union_lines = ops.unary_union(multiline)
+        if union_lines.geom_type == 'LineString':
+            return self._one_type_line_geom_to_instances([union_lines])
+        before_num = len(union_lines.geoms)
+        # import pdb;pdb.set_trace()
+        merged_lines = ops.linemerge(union_lines)
+        if merged_lines.geom_type == 'LineString':
+            return self._one_type_line_geom_to_instances([merged_lines])
+        after_num = len(merged_lines.geoms)
+        # import pdb;pdb.set_trace()
+        while after_num != before_num:
+            before_num = len(merged_lines.geoms)
+            merged_lines = ops.unary_union(merged_lines)
+            if merged_lines.geom_type == 'LineString':
+                break
+            merged_lines = ops.linemerge(merged_lines)
+            if merged_lines.geom_type == 'LineString':
+                break
+            after_num = len(merged_lines.geoms)
+        
+        return self._one_type_line_geom_to_instances([merged_lines])
+
+
+
+    def sample_pts_from_line(self, line):
+        if self.fixed_num < 0:
+            distances = np.arange(0, line.length, self.sample_dist)
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+        else:
+            # fixed number of points, so distance is line.length / self.fixed_num
+            distances = np.linspace(0, line.length, self.fixed_num)
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+
+
+        num_valid = len(sampled_points)
+
+        if not self.padding or self.fixed_num > 0:
+            # fixed num sample can return now!
+            return sampled_points, num_valid
+
+        # fixed distance sampling need padding!
+        num_valid = len(sampled_points)
+
+        if self.fixed_num < 0:
+            if num_valid < self.num_samples:
+                padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            else:
+                sampled_points = sampled_points[:self.num_samples, :]
+                num_valid = self.num_samples
+
+
+        return sampled_points, num_valid
+
+
+@DATASETS.register_module()
+class CustomAV2LocalMapDataset(CustomNuScenesDataset):
+    r"""NuScenes Dataset.
+
+    This datset add static map elements
+    """
+    MAPCLASSES = ('divider',)
+    def __init__(self,
+                 map_ann_file=None, 
+                 queue_length=4, 
+                 code_size=2,
+                 bev_size=(200, 200), 
+                 pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
+                 overlap_test=False, 
+                 fixed_ptsnum_per_line=-1,
+                 eval_use_same_gt_sample_num_flag=False,
+                 padding_value=-10000,
+                 map_classes=None,
+                 *args, 
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.map_ann_file = map_ann_file
+
+        self.code_size = code_size
+        self.queue_length = queue_length
+        self.overlap_test = overlap_test
+        self.bev_size = bev_size
+
+        self.MAPCLASSES = self.get_map_classes(map_classes)
+        self.NUM_MAPCLASSES = len(self.MAPCLASSES)
+        self.pc_range = pc_range
+        patch_h = pc_range[4]-pc_range[1]
+        patch_w = pc_range[3]-pc_range[0]
+        self.patch_size = (patch_h, patch_w)
+        self.padding_value = padding_value
+        self.fixed_num = fixed_ptsnum_per_line
+        self.eval_use_same_gt_sample_num_flag = eval_use_same_gt_sample_num_flag
+        self.vector_map = VectorizedAV2LocalMap(kwargs['data_root'], 
+                            patch_size=self.patch_size, test_mode=self.test_mode, 
+                            map_classes=self.MAPCLASSES, 
+                            fixed_ptsnum_per_line=fixed_ptsnum_per_line,
+                            padding_value=self.padding_value)
+        self.is_vis_on_test = False
+
+    def load_annotations(self, ann_file):
+        """Load annotations from ann_file.
+
+        Args:
+            ann_file (str): Path of the annotation file.
+
+        Returns:
+            list[dict]: List of annotations sorted by timestamps.
+        """
+        # import pdb;pdb.set_trace()
+        data = mmcv.load(ann_file)
+        # import pdb;pdb.set_trace()
+        data_infos = list(sorted(data['samples'], key=lambda e: e['timestamp']))
+        data_infos = data_infos[::self.load_interval]
+        # data_infos = [ data_info.update(dict(token= str(data_info['timestamp']+data_info['log_id'])))  for data_info in data_infos]
+        self.id2map = data['id2map']
+        self.metadata = None
+        self.version = None
+        return data_infos
+
+    @classmethod
+    def get_map_classes(cls, map_classes=None):
+        """Get class names of current dataset.
+
+        Args:
+            classes (Sequence[str] | str | None): If classes is None, use
+                default CLASSES defined by builtin dataset. If classes is a
+                string, take it as a file name. The file contains the name of
+                classes where each line contains one class name. If classes is
+                a tuple or list, override the CLASSES defined by the dataset.
+
+        Return:
+            list[str]: A list of class names.
+        """
+        if map_classes is None:
+            return cls.MAPCLASSES
+
+        if isinstance(map_classes, str):
+            # take it as a file path
+            class_names = mmcv.list_from_file(map_classes)
+        elif isinstance(map_classes, (tuple, list)):
+            class_names = map_classes
+        else:
+            raise ValueError(f'Unsupported type {type(map_classes)} of map classes.')
+
+        return class_names
+    def vectormap_pipeline(self, example, input_dict):
+        '''
+        `example` type: <class 'dict'>
+            keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img';
+                  all keys type is 'DataContainer';
+                  'img_metas' cpu_only=True, type is dict, others are false;
+                  'gt_labels_3d' shape torch.size([num_samples]), stack=False,
+                                padding_value=0, cpu_only=False
+                  'gt_bboxes_3d': stack=False, cpu_only=True
+        '''
+        # import pdb;pdb.set_trace()
+        location = input_dict['log_id']
+        e2g_translation = input_dict['e2g_translation']
+        e2g_rotation = input_dict['e2g_rotation']
+        map_elements = self.id2map[location]
+        anns_results = self.vector_map.gen_vectorized_samples(location, map_elements, e2g_translation, e2g_rotation)
+        
+        '''
+        anns_results, type: dict
+            'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates
+            'gt_vecs_pts_num': list[num_vecs], vec with num_points
+            'gt_vecs_label': list[num_vecs], vec with cls index
+        '''
+        gt_vecs_label = to_tensor(anns_results['gt_vecs_label'])
+        if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines):
+            gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc']
+        else:
+            gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc'])
+            try:
+                gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32)
+            except Exception as e:
+                # empty tensor, will be passed in train, 
+                # but we preserve it for test
+                
+                # import pdb;pdb.set_trace()
+                gt_vecs_pts_loc = gt_vecs_pts_loc
+        # import ipdb;ipdb.set_trace()
+        example['gt_labels_3d'] = DC(gt_vecs_label, cpu_only=False)
+        example['gt_bboxes_3d'] = DC(gt_vecs_pts_loc, cpu_only=True)
+        # import pdb;pdb.set_trace()
+        # if self.is_vis_on_test:
+        #     lidar2global_translation = to_tensor(lidar2global_translation)
+        #     example['lidar2global_translation'] = DC(lidar2global_translation, cpu_only=True)
+        # else:
+        # example['img_metas'].data['lidar2global_translation'] = lidar2global_translation
+        return example
+
+    def prepare_train_data(self, index):
+        """
+        Training data preparation.
+        Args:
+            index (int): Index for accessing the target data.
+        Returns:
+            dict: Training data dict of the corresponding index.
+        """
+        data_queue = []
+
+        # temporal aug
+        prev_indexs_list = list(range(index-self.queue_length, index))
+        random.shuffle(prev_indexs_list)
+        prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True)
+        ##
+
+        input_dict = self.get_data_info(index)
+        if input_dict is None:
+            return None
+        frame_idx = input_dict['timestamp']
+        scene_token = input_dict['log_id']
+        self.pre_pipeline(input_dict)
+        example = self.pipeline(input_dict)
+        # import pdb;pdb.set_trace()
+        example = self.vectormap_pipeline(example,input_dict)
+        if self.filter_empty_gt and \
+                (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+            return None
+        data_queue.insert(0, example)
+        return self.union2one(data_queue)
+
+    def union2one(self, queue):
+        """
+        convert sample queue into one single sample.
+        """
+        imgs_list = [each['img'].data for each in queue]
+        metas_map = {}
+        prev_pos = None
+        prev_angle = None
+        for i, each in enumerate(queue):
+            metas_map[i] = each['img_metas'].data
+            if i == 0:
+                metas_map[i]['prev_bev'] = False
+                prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+                prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+                metas_map[i]['can_bus'][:3] = 0
+                metas_map[i]['can_bus'][-1] = 0
+            else:
+                metas_map[i]['prev_bev'] = True
+                tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+                tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+                metas_map[i]['can_bus'][:3] -= prev_pos
+                metas_map[i]['can_bus'][-1] -= prev_angle
+                prev_pos = copy.deepcopy(tmp_pos)
+                prev_angle = copy.deepcopy(tmp_angle)
+
+        queue[-1]['img'] = DC(torch.stack(imgs_list),
+                              cpu_only=False, stack=True)
+        queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+        queue = queue[-1]
+        return queue
+
+    def get_data_info(self, index):
+        """Get data info according to the given index.
+
+        Args:
+            index (int): Index of the sample data to get.
+
+        Returns:
+            dict: Data information that will be passed to the data \
+                preprocessing pipelines. It includes the following keys:
+
+                - sample_idx (str): Sample index.
+                - pts_filename (str): Filename of point clouds.
+                - sweeps (list[dict]): Infos of sweeps.
+                - timestamp (float): Sample timestamp.
+                - img_filename (str, optional): Image filename.
+                - lidar2img (list[np.ndarray], optional): Transformations \
+                    from lidar to different cameras.
+                - ann_info (dict): Annotation info.
+        """
+        info = self.data_infos[index]
+        # standard protocal modified from SECOND.Pytorch
+        input_dict = dict(
+            timestamp=info['timestamp'],
+            e2g_translation=info['e2g_translation'],
+            e2g_rotation=info['e2g_rotation'],
+            log_id=info['log_id'],
+            scene_token=info['log_id'],
+        )
+        if self.modality['use_camera']:
+            image_paths = []
+            cam_intrinsics = []
+            lidar2img_rts = []
+            lidar2cam_rts = []
+            cam_types = []
+            for cam_type, cam_info in info['cams'].items():
+                image_paths.append(cam_info['img_fpath'])
+                # camera intrinsics
+                camera_intrinsics = np.eye(4).astype(np.float32)
+                camera_intrinsics[:3, :3] = cam_info["intrinsics"]
+                # input_dict["camera_intrinsics"].append(camera_intrinsics)
+
+                # ego2img, ego = lidar
+                lidar2cam_rt = cam_info['extrinsics']
+                intrinsic = cam_info['intrinsics']
+                viewpad = np.eye(4)
+                viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+                lidar2img_rt = (viewpad @ lidar2cam_rt)
+                lidar2img_rts.append(lidar2img_rt)
+                lidar2cam_rts.append(lidar2cam_rt)
+                cam_intrinsics.append(viewpad)
+                cam_types.append(cam_type)
+
+
+
+            input_dict.update(
+                dict(
+                    img_filename=image_paths,
+                    lidar2img=lidar2img_rts,
+                    cam_intrinsic=cam_intrinsics,
+                    lidar2cam=lidar2cam_rts,
+                    cam_type=cam_types,
+                ))
+        
+        if not self.test_mode:
+            # annos = self.get_ann_info(index)
+            input_dict['ann_info'] = dict()
+        
+        translation = input_dict['e2g_translation']
+        can_bus = np.ones(18)
+        # can_bus.extend(translation.tolist())
+        can_bus[:3] = translation
+        rotation = Quaternion._from_matrix(input_dict['e2g_rotation'])
+        can_bus[3:7] = rotation
+        patch_angle = quaternion_yaw(rotation) / np.pi * 180
+        if patch_angle < 0:
+            patch_angle += 360
+        can_bus[-2] = patch_angle / 180 * np.pi
+        can_bus[-1] = patch_angle
+        input_dict['can_bus'] = can_bus
+        # import pdb;pdb.set_trace()
+        return input_dict
+
+    def prepare_test_data(self, index):
+        """Prepare data for testing.
+
+        Args:
+            index (int): Index for accessing the target data.
+
+        Returns:
+            dict: Testing data dict of the corresponding index.
+        """
+        input_dict = self.get_data_info(index)
+        self.pre_pipeline(input_dict)
+        example = self.pipeline(input_dict)
+        if self.is_vis_on_test:
+            example = self.vectormap_pipeline(example, input_dict)
+        return example
+
+    def __getitem__(self, idx):
+        """Get item from infos according to the given index.
+        Returns:
+            dict: Data dictionary of the corresponding index.
+        """
+        if self.test_mode:
+            return self.prepare_test_data(idx)
+        while True:
+
+            data = self.prepare_train_data(idx)
+            if data is None:
+                idx = self._rand_another(idx)
+                continue
+            return data
+    def _format_gt(self):
+        gt_annos = []
+        # import pdb;pdb.set_trace()
+        print('Start to convert gt map format...')
+        assert self.map_ann_file is not None
+        if (not os.path.exists(self.map_ann_file)) :
+            dataset_length = len(self)
+            prog_bar = mmcv.ProgressBar(dataset_length)
+            mapped_class_names = self.MAPCLASSES
+            for sample_id in range(dataset_length):
+                sample_token = self.data_infos[sample_id]['token']
+                gt_anno = {}
+                gt_anno['sample_token'] = sample_token
+                # gt_sample_annos = []
+                gt_sample_dict = {}
+                gt_sample_dict = self.vectormap_pipeline(gt_sample_dict, self.data_infos[sample_id])
+                gt_labels = gt_sample_dict['gt_labels_3d'].data.numpy()
+                gt_vecs = gt_sample_dict['gt_bboxes_3d'].data.instance_list
+                # import pdb;pdb.set_trace()
+                gt_vec_list = []
+                for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)):
+                    name = mapped_class_names[gt_label]
+                    anno = dict(
+                        pts=np.array(list(gt_vec.coords))[:,:self.code_size],
+                        pts_num=len(list(gt_vec.coords)),
+                        cls_name=name,
+                        type=gt_label,
+                    )
+                    gt_vec_list.append(anno)
+                gt_anno['vectors']=gt_vec_list
+                gt_annos.append(gt_anno)
+
+                prog_bar.update()
+            nusc_submissions = {
+                'GTs': gt_annos
+            }
+            print('\n GT anns writes to', self.map_ann_file)
+            mmcv.dump(nusc_submissions, self.map_ann_file)
+        else:
+            print(f'{self.map_ann_file} exist, not update')
+
+    def _format_bbox(self, results, jsonfile_prefix=None):
+        """Convert the results to the standard format.
+
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            jsonfile_prefix (str): The prefix of the output jsonfile.
+                You can specify the output directory/filename by
+                modifying the jsonfile_prefix. Default: None.
+
+        Returns:
+            str: Path of the output json file.
+        """
+        assert self.map_ann_file is not None
+        pred_annos = []
+        mapped_class_names = self.MAPCLASSES
+        # import pdb;pdb.set_trace()
+        print('Start to convert map detection format...')
+        for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
+            pred_anno = {}
+            vecs = output_to_vecs(det)
+            sample_token = self.data_infos[sample_id]['token']
+            pred_anno['sample_token'] = sample_token
+            pred_vec_list=[]
+            for i, vec in enumerate(vecs):
+                name = mapped_class_names[vec['label']]
+                anno = dict(
+                    # sample_token=sample_token,
+                    pts=vec['pts'],
+                    pts_num=len(vec['pts']),
+                    cls_name=name,
+                    type=vec['label'],
+                    confidence_level=vec['score'])
+                pred_vec_list.append(anno)
+                # annos.append(nusc_anno)
+            # nusc_annos[sample_token] = annos
+            pred_anno['vectors'] = pred_vec_list
+            pred_annos.append(pred_anno)
+
+
+        if not os.path.exists(self.map_ann_file):
+            self._format_gt()
+        else:
+            print(f'{self.map_ann_file} exist, not update')
+        # with open(self.map_ann_file,'r') as f:
+        #     GT_anns = json.load(f)
+        # gt_annos = GT_anns['GTs']
+        nusc_submissions = {
+            'meta': self.modality,
+            'results': pred_annos,
+            # 'GTs': gt_annos
+        }
+
+        mmcv.mkdir_or_exist(jsonfile_prefix)
+        res_path = osp.join(jsonfile_prefix, 'nuscmap_results.json')
+        print('Results writes to', res_path)
+        mmcv.dump(nusc_submissions, res_path)
+        return res_path
+
+    def to_gt_vectors(self,
+                      gt_dict):
+        # import pdb;pdb.set_trace()
+        gt_labels = gt_dict['gt_labels_3d'].data
+        gt_instances = gt_dict['gt_bboxes_3d'].data.instance_list
+
+        gt_vectors = []
+
+        for gt_instance, gt_label in zip(gt_instances, gt_labels):
+            pts, pts_num = sample_pts_from_line(gt_instance, patch_size=self.patch_size)
+            gt_vectors.append({
+                'pts': pts,
+                'pts_num': pts_num,
+                'type': int(gt_label)
+            })
+        vector_num_list = {}
+        for i in range(self.NUM_MAPCLASSES):
+            vector_num_list[i] = []
+        for vec in gt_vectors:
+            if vector['pts_num'] >= 2:
+                vector_num_list[vector['type']].append((LineString(vector['pts'][:vector['pts_num']]), vector.get('confidence_level', 1)))
+        return gt_vectors
+
+    def _evaluate_single(self,
+                         result_path,
+                         logger=None,
+                         metric='chamfer',
+                         result_name='pts_bbox'):
+        """Evaluation for a single model in nuScenes protocol.
+
+        Args:
+            result_path (str): Path of the result file.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            metric (str): Metric name used for evaluation. Default: 'bbox'.
+            result_name (str): Result name in the metric prefix.
+                Default: 'pts_bbox'.
+
+        Returns:
+            dict: Dictionary of evaluation details.
+        """
+        from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import eval_map
+        from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import format_res_gt_by_classes
+        result_path = osp.abspath(result_path)
+        # import pdb;pdb.set_trace()
+        detail = dict()
+        
+        print('Formating results & gts by classes')
+        with open(result_path,'r') as f:
+            pred_results = json.load(f)
+        gen_results = pred_results['results']
+        with open(self.map_ann_file,'r') as ann_f:
+            gt_anns = json.load(ann_f)
+        annotations = gt_anns['GTs']
+        cls_gens, cls_gts = format_res_gt_by_classes(result_path,
+                                                     gen_results,
+                                                     annotations,
+                                                     cls_names=self.MAPCLASSES,
+                                                     num_pred_pts_per_instance=self.fixed_num,
+                                                     eval_use_same_gt_sample_num_flag=self.eval_use_same_gt_sample_num_flag,
+                                                     pc_range=self.pc_range)
+
+        metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['chamfer', 'iou']
+        for metric in metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported')
+
+        for metric in metrics:
+            print('-*'*10+f'use metric:{metric}'+'-*'*10)
+
+            if metric == 'chamfer':
+                thresholds = [0.5,1.0,1.5]
+            elif metric == 'iou':
+                thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+            cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES))
+
+            for i, thr in enumerate(thresholds):
+                print('-*'*10+f'threshhold:{thr}'+'-*'*10)
+                mAP, cls_ap = eval_map(
+                                gen_results,
+                                annotations,
+                                cls_gens,
+                                cls_gts,
+                                threshold=thr,
+                                cls_names=self.MAPCLASSES,
+                                logger=logger,
+                                num_pred_pts_per_instance=self.fixed_num,
+                                pc_range=self.pc_range,
+                                metric=metric)
+                for j in range(self.NUM_MAPCLASSES):
+                    cls_aps[i, j] = cls_ap[j]['ap']
+
+            for i, name in enumerate(self.MAPCLASSES):
+                print('{}: {}'.format(name, cls_aps.mean(0)[i]))
+                detail['AV2Map_{}/{}_AP'.format(metric,name)] =  cls_aps.mean(0)[i]
+            print('map: {}'.format(cls_aps.mean(0).mean()))
+            detail['AV2Map_{}/mAP'.format(metric)] = cls_aps.mean(0).mean()
+
+            for i, name in enumerate(self.MAPCLASSES):
+                for j, thr in enumerate(thresholds):
+                    if metric == 'chamfer':
+                        detail['AV2Map_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+                    elif metric == 'iou':
+                        if thr == 0.5 or thr == 0.75:
+                            detail['AV2Map_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+
+        return detail
+
+
+    def evaluate(self,
+                 results,
+                 metric='bbox',
+                 logger=None,
+                 jsonfile_prefix=None,
+                 result_names=('pts_bbox'),
+                 show=False,
+                 out_dir=None,
+                 pipeline=None):
+        """Evaluation in nuScenes protocol.
+
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+            show (bool): Whether to visualize.
+                Default: False.
+            out_dir (str): Path to save the visualization results.
+                Default: None.
+            pipeline (list[dict], optional): raw data loading for showing.
+                Default: None.
+
+        Returns:
+            dict[str, float]: Results of each evaluation metric.
+        """
+        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+        if isinstance(result_files, dict):
+            results_dict = dict()
+            for name in result_names:
+                print('Evaluating bboxes of {}'.format(name))
+                ret_dict = self._evaluate_single(result_files[name], metric=metric)
+            results_dict.update(ret_dict)
+        elif isinstance(result_files, str):
+            results_dict = self._evaluate_single(result_files, metric=metric)
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+
+        if show:
+            self.show(results, out_dir, pipeline=pipeline)
+        return results_dict
+
+
+def output_to_vecs(detection):
+    box3d = detection['boxes_3d'].numpy()
+    scores = detection['scores_3d'].numpy()
+    labels = detection['labels_3d'].numpy()
+    pts = detection['pts_3d'].numpy()
+
+    vec_list = []
+    # import pdb;pdb.set_trace()
+    for i in range(box3d.shape[0]):
+        vec = dict(
+            bbox = box3d[i], # xyxy
+            label=labels[i],
+            score=scores[i],
+            pts=pts[i],
+        )
+        vec_list.append(vec)
+    return vec_list
+
+def sample_pts_from_line(line, 
+                         fixed_num=-1,
+                         sample_dist=1,
+                         normalize=False,
+                         patch_size=None,
+                         padding=False,
+                         num_samples=250,):
+    if fixed_num < 0:
+        distances = np.arange(0, line.length, sample_dist)
+        if line.has_z:
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 3)
+        else:
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+    else:
+        # fixed number of points, so distance is line.length / fixed_num
+        distances = np.linspace(0, line.length, fixed_num)
+        if line.has_z:
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 3)
+        else:
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+
+    if normalize:
+        sampled_points[:,:2] = sampled_points[:,:2] / np.array([patch_size[1], patch_size[0]])
+
+    num_valid = len(sampled_points)
+
+    if not padding or fixed_num > 0:
+        # fixed num sample can return now!
+        return sampled_points, num_valid
+
+    # fixed distance sampling need padding!
+    num_valid = len(sampled_points)
+
+    if fixed_num < 0:
+        if num_valid < num_samples:
+            padding = np.zeros((num_samples - len(sampled_points), sampled_points.shape[-1]))
+            sampled_points = np.concatenate([sampled_points, padding], axis=0)
+        else:
+            sampled_points = sampled_points[:num_samples, :]
+            num_valid = num_samples
+
+        if normalize:
+            sampled_points[:,:2] = sampled_points[:,:2] / np.array([patch_size[1], patch_size[0]])
+            num_valid = len(sampled_points)
+
+    return sampled_points[:,:2], num_valid
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/builder.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba09708be83bca72fcbffa6ad057f7d1bbc59f3b
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/builder.py
@@ -0,0 +1,146 @@
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import platform
+import random
+from functools import partial
+
+import numpy as np
+from mmcv.parallel import collate
+from mmcv.runner import get_dist_info
+from mmcv.utils import Registry, build_from_cfg
+from torch.utils.data import DataLoader
+
+from mmdet.datasets.samplers import GroupSampler
+from projects.mmdet3d_plugin.datasets.samplers.group_sampler import DistributedGroupSampler
+from projects.mmdet3d_plugin.datasets.samplers.distributed_sampler import DistributedSampler
+from projects.mmdet3d_plugin.datasets.samplers.sampler import build_sampler
+
+def build_dataloader(dataset,
+                     samples_per_gpu,
+                     workers_per_gpu,
+                     num_gpus=1,
+                     dist=True,
+                     shuffle=True,
+                     seed=None,
+                     shuffler_sampler=None,
+                     nonshuffler_sampler=None,
+                     **kwargs):
+    """Build PyTorch DataLoader.
+    In distributed training, each GPU/process has a dataloader.
+    In non-distributed training, there is only one dataloader for all GPUs.
+    Args:
+        dataset (Dataset): A PyTorch dataset.
+        samples_per_gpu (int): Number of training samples on each GPU, i.e.,
+            batch size of each GPU.
+        workers_per_gpu (int): How many subprocesses to use for data loading
+            for each GPU.
+        num_gpus (int): Number of GPUs. Only used in non-distributed training.
+        dist (bool): Distributed training/test or not. Default: True.
+        shuffle (bool): Whether to shuffle the data at every epoch.
+            Default: True.
+        kwargs: any keyword argument to be used to initialize DataLoader
+    Returns:
+        DataLoader: A PyTorch dataloader.
+    """
+    rank, world_size = get_dist_info()
+    if dist:
+        # DistributedGroupSampler will definitely shuffle the data to satisfy
+        # that images on each GPU are in the same group
+        if shuffle:
+            sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'),
+                                     dict(
+                                         dataset=dataset,
+                                         samples_per_gpu=samples_per_gpu,
+                                         num_replicas=world_size,
+                                         rank=rank,
+                                         seed=seed)
+                                     )
+
+        else:
+            sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'),
+                                     dict(
+                                         dataset=dataset,
+                                         num_replicas=world_size,
+                                         rank=rank,
+                                         shuffle=shuffle,
+                                         seed=seed)
+                                     )
+
+        batch_size = samples_per_gpu
+        num_workers = workers_per_gpu
+    else:
+        # assert False, 'not support in bevformer'
+        print('WARNING!!!!, Only can be used for obtain inference speed!!!!')
+        sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
+        batch_size = num_gpus * samples_per_gpu
+        num_workers = num_gpus * workers_per_gpu
+
+    init_fn = partial(
+        worker_init_fn, num_workers=num_workers, rank=rank,
+        seed=seed) if seed is not None else None
+
+    data_loader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
+        pin_memory=True, #3
+        worker_init_fn=init_fn,
+        **kwargs)
+
+    return data_loader
+
+
+def worker_init_fn(worker_id, num_workers, rank, seed):
+    # The seed of each worker equals to
+    # num_worker * rank + worker_id + user_seed
+    worker_seed = num_workers * rank + worker_id + seed
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import platform
+from mmcv.utils import Registry, build_from_cfg
+
+from mmdet.datasets import DATASETS
+from mmdet.datasets.builder import _concat_dataset
+
+if platform.system() != 'Windows':
+    # https://github.com/pytorch/pytorch/issues/973
+    import resource
+    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+    base_soft_limit = rlimit[0]
+    hard_limit = rlimit[1]
+    soft_limit = min(max(4096, base_soft_limit), hard_limit)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
+
+OBJECTSAMPLERS = Registry('Object sampler')
+
+
+def custom_build_dataset(cfg, default_args=None):
+    from mmdet3d.datasets.dataset_wrappers import CBGSDataset
+    from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
+                                                 ConcatDataset, RepeatDataset)
+    if isinstance(cfg, (list, tuple)):
+        dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg])
+    elif cfg['type'] == 'ConcatDataset':
+        dataset = ConcatDataset(
+            [custom_build_dataset(c, default_args) for c in cfg['datasets']],
+            cfg.get('separate_eval', True))
+    elif cfg['type'] == 'RepeatDataset':
+        dataset = RepeatDataset(
+            custom_build_dataset(cfg['dataset'], default_args), cfg['times'])
+    elif cfg['type'] == 'ClassBalancedDataset':
+        dataset = ClassBalancedDataset(
+            custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
+    elif cfg['type'] == 'CBGSDataset':
+        dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args))
+    elif isinstance(cfg.get('ann_file'), (list, tuple)):
+        dataset = _concat_dataset(cfg, default_args)
+    else:
+        dataset = build_from_cfg(cfg, DATASETS, default_args)
+
+    return dataset
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..82db209d7a4017ee5cc9740180931416a8724f85
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/__init__.py
@@ -0,0 +1 @@
+# from .CD_loss import MyChamferDistance
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py
new file mode 100644
index 0000000000000000000000000000000000000000..c868be978539aa4b0c676b9bc3fddf98766b6083
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py
@@ -0,0 +1,350 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from multiprocessing import Pool
+from shapely.geometry import LineString, Polygon
+import mmcv
+import numpy as np
+from mmcv.utils import print_log
+from terminaltables import AsciiTable
+import json
+from os import path as osp
+import os
+from functools import partial
+from .tpfp import custom_tpfp_gen
+
+def average_precision(recalls, precisions, mode='area'):
+    """Calculate average precision (for single or multiple scales).
+
+    Args:
+        recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        mode (str): 'area' or '11points', 'area' means calculating the area
+            under precision-recall curve, '11points' means calculating
+            the average precision of recalls at [0, 0.1, ..., 1]
+
+    Returns:
+        float or ndarray: calculated average precision
+    """
+    no_scale = False
+    if recalls.ndim == 1:
+        no_scale = True
+        recalls = recalls[np.newaxis, :]
+        precisions = precisions[np.newaxis, :]
+    assert recalls.shape == precisions.shape and recalls.ndim == 2
+    num_scales = recalls.shape[0]
+    ap = np.zeros(num_scales, dtype=np.float32)
+    if mode == 'area':
+        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
+        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
+        mrec = np.hstack((zeros, recalls, ones))
+        mpre = np.hstack((zeros, precisions, zeros))
+        for i in range(mpre.shape[1] - 1, 0, -1):
+            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
+        for i in range(num_scales):
+            ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
+            ap[i] = np.sum(
+                (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
+    elif mode == '11points':
+        for i in range(num_scales):
+            for thr in np.arange(0, 1 + 1e-3, 0.1):
+                precs = precisions[i, recalls[i, :] >= thr]
+                prec = precs.max() if precs.size > 0 else 0
+                ap[i] += prec
+        ap /= 11
+    else:
+        raise ValueError(
+            'Unrecognized mode, only "area" and "11points" are supported')
+    if no_scale:
+        ap = ap[0]
+    return ap
+
+def get_cls_results(gen_results, 
+                    annotations, 
+                    num_sample=100, 
+                    num_pred_pts_per_instance=30,
+                    eval_use_same_gt_sample_num_flag=False,
+                    class_id=0, 
+                    fix_interval=False):
+    """Get det results and gt information of a certain class.
+
+    Args:
+        gen_results (list[list]): Same as `eval_map()`.
+        annotations (list[dict]): Same as `eval_map()`.
+        class_id (int): ID of a specific class.
+
+    Returns:
+        tuple[list[np.ndarray]]: detected bboxes, gt bboxes
+    """
+    # if len(gen_results) == 0 or 
+
+    cls_gens, cls_scores = [], []
+    for res in gen_results['vectors']:
+        if res['type'] == class_id:
+            if len(res['pts']) < 2:
+                continue
+            if not eval_use_same_gt_sample_num_flag:
+                sampled_points = np.array(res['pts'])
+            else:
+                line = res['pts']
+                line = LineString(line)
+
+                if fix_interval:
+                    distances = list(np.arange(1., line.length, 1.))
+                    distances = [0,] + distances + [line.length,]
+                    sampled_points = np.array([list(line.interpolate(distance).coords)
+                                            for distance in distances]).reshape(-1, 2)
+                else:
+                    distances = np.linspace(0, line.length, num_sample)
+                    sampled_points = np.array([list(line.interpolate(distance).coords)
+                                                for distance in distances]).reshape(-1, 2)
+                
+            cls_gens.append(sampled_points)
+            cls_scores.append(res['confidence_level'])
+    num_res = len(cls_gens)
+    if num_res > 0:
+        cls_gens = np.stack(cls_gens).reshape(num_res,-1)
+        cls_scores = np.array(cls_scores)[:,np.newaxis]
+        cls_gens = np.concatenate([cls_gens,cls_scores],axis=-1)
+        # print(f'for class {i}, cls_gens has shape {cls_gens.shape}')
+    else:
+        if not eval_use_same_gt_sample_num_flag:
+            cls_gens = np.zeros((0,num_pred_pts_per_instance*2+1))
+        else:
+            cls_gens = np.zeros((0,num_sample*2+1))
+        # print(f'for class {i}, cls_gens has shape {cls_gens.shape}')
+
+    cls_gts = []
+    for ann in annotations['vectors']:
+        if ann['type'] == class_id:
+            # line = ann['pts'] +  np.array((1,1)) # for hdmapnet
+            line = ann['pts']
+            # line = ann['pts'].cumsum(0)
+            line = LineString(line)
+            distances = np.linspace(0, line.length, num_sample)
+            sampled_points = np.array([list(line.interpolate(distance).coords)
+                                        for distance in distances]).reshape(-1, 2)
+            
+            cls_gts.append(sampled_points)
+    num_gts = len(cls_gts)
+    if num_gts > 0:
+        cls_gts = np.stack(cls_gts).reshape(num_gts,-1)
+    else:
+        cls_gts = np.zeros((0,num_sample*2))
+    return cls_gens, cls_gts
+    # ones = np.ones((num_gts,1))
+    # tmp_cls_gens = np.concatenate([cls_gts,ones],axis=-1)
+    # return tmp_cls_gens, cls_gts
+
+def format_res_gt_by_classes(result_path,
+                             gen_results,
+                             annotations,
+                             cls_names=None,
+                             num_pred_pts_per_instance=30,
+                             eval_use_same_gt_sample_num_flag=False,
+                             pc_range=(-15.0, -30.0, -5.0, 15.0, 30.0, 3.0),
+                             nproc=24):
+    assert cls_names is not None
+    timer = mmcv.Timer()
+    num_fixed_sample_pts = 100
+    fix_interval = False
+    print('results path: {}'.format(result_path))
+
+    output_dir = osp.join(*osp.split(result_path)[:-1])
+    assert len(gen_results) == len(annotations)
+
+    pool = Pool(nproc)
+    cls_gens, cls_gts = {}, {}
+    print('Formatting ...')
+    formatting_file = 'cls_formatted.pkl'
+    formatting_file = osp.join(output_dir,formatting_file)
+
+    for i, clsname in enumerate(cls_names):
+
+        gengts = pool.starmap(
+                    partial(get_cls_results, num_sample=num_fixed_sample_pts,
+                        num_pred_pts_per_instance=num_pred_pts_per_instance,
+                        eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,class_id=i,fix_interval=fix_interval),
+                    zip(gen_results, annotations))   
+        # gengts = map(partial(get_cls_results, num_sample=num_fixed_sample_pts, class_id=i,fix_interval=fix_interval),
+        #             zip(gen_results, annotations))
+        # import pdb;pdb.set_trace()
+        gens, gts = tuple(zip(*gengts))
+        cls_gens[clsname] = gens
+        cls_gts[clsname] = gts
+    
+    mmcv.dump([cls_gens, cls_gts],formatting_file)
+    print('Cls data formatting done in {:2f}s!! with {}'.format(float(timer.since_start()),formatting_file))
+    pool.close()
+    return cls_gens, cls_gts
+
+def eval_map(gen_results,
+             annotations,
+             cls_gens,
+             cls_gts,
+             threshold=0.5,
+             cls_names=None,
+             logger=None,
+             tpfp_fn=None,
+             pc_range=(-15.0, -30.0, -5.0, 15.0, 30.0, 3.0),
+             metric=None,
+             num_pred_pts_per_instance=30,
+             nproc=24):
+    timer = mmcv.Timer()
+    pool = Pool(nproc)
+
+    eval_results = []
+    
+    for i, clsname in enumerate(cls_names):
+        
+        # get gt and det bboxes of this class
+        cls_gen = cls_gens[clsname]
+        cls_gt = cls_gts[clsname]
+        # choose proper function according to datasets to compute tp and fp
+        # XXX
+        # func_name = cls2func[clsname]
+        # tpfp_fn = tpfp_fn_dict[tpfp_fn_name]
+        tpfp_fn = custom_tpfp_gen
+        # Trick for serialized
+        # only top-level function can be serized
+        # somehow use partitial the return function is defined
+        # at the top level.
+
+        # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold, metric=metric)
+        # import pdb; pdb.set_trace()
+        # TODO this is a hack
+        tpfp_fn = partial(tpfp_fn, threshold=threshold, metric=metric)
+        args = []
+        # compute tp and fp for each image with multiple processes
+        tpfp = pool.starmap(
+            tpfp_fn,
+            zip(cls_gen, cls_gt, *args))
+        # import pdb;pdb.set_trace()
+        tp, fp = tuple(zip(*tpfp))
+
+
+
+        # map_results = map(
+        #     tpfp_fn,
+        #     cls_gen, cls_gt)
+        # tp, fp = tuple(map(list, zip(*map_results)))
+
+
+        # debug and testing
+        # for i in range(len(cls_gen)):
+        #     # print(i)
+        #     tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold)
+        #     print(i)
+        #     tpfp = (tpfp,)
+        #     print(tpfp)
+        # i = 0 
+        # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold)
+        # import pdb; pdb.set_trace()
+
+        # XXX
+        
+        num_gts = 0
+        for j, bbox in enumerate(cls_gt):
+            num_gts += bbox.shape[0]
+
+        # sort all det bboxes by score, also sort tp and fp
+        # import pdb;pdb.set_trace()
+        cls_gen = np.vstack(cls_gen)
+        num_dets = cls_gen.shape[0]
+        sort_inds = np.argsort(-cls_gen[:, -1]) #descending, high score front
+        tp = np.hstack(tp)[sort_inds]
+        fp = np.hstack(fp)[sort_inds]
+        
+        # calculate recall and precision with tp and fp
+        # num_det*num_res
+        tp = np.cumsum(tp, axis=0)
+        fp = np.cumsum(fp, axis=0)
+        eps = np.finfo(np.float32).eps
+        recalls = tp / np.maximum(num_gts, eps)
+        precisions = tp / np.maximum((tp + fp), eps)
+
+        # calculate AP
+        # if dataset != 'voc07' else '11points'
+        mode = 'area'
+        ap = average_precision(recalls, precisions, mode)
+        eval_results.append({
+            'num_gts': num_gts,
+            'num_dets': num_dets,
+            'recall': recalls,
+            'precision': precisions,
+            'ap': ap
+        })
+        print('cls:{} done in {:2f}s!!'.format(clsname,float(timer.since_last_check())))
+    pool.close()
+    aps = []
+    for cls_result in eval_results:
+        if cls_result['num_gts'] > 0:
+            aps.append(cls_result['ap'])
+    mean_ap = np.array(aps).mean().item() if len(aps) else 0.0
+
+    print_map_summary(
+        mean_ap, eval_results, class_name=cls_names, logger=logger)
+
+    return mean_ap, eval_results
+
+
+
+def print_map_summary(mean_ap,
+                      results,
+                      class_name=None,
+                      scale_ranges=None,
+                      logger=None):
+    """Print mAP and results of each class.
+
+    A table will be printed to show the gts/dets/recall/AP of each class and
+    the mAP.
+
+    Args:
+        mean_ap (float): Calculated from `eval_map()`.
+        results (list[dict]): Calculated from `eval_map()`.
+        dataset (list[str] | str | None): Dataset name or dataset classes.
+        scale_ranges (list[tuple] | None): Range of scales to be evaluated.
+        logger (logging.Logger | str | None): The way to print the mAP
+            summary. See `mmcv.utils.print_log()` for details. Default: None.
+    """
+
+    if logger == 'silent':
+        return
+
+    if isinstance(results[0]['ap'], np.ndarray):
+        num_scales = len(results[0]['ap'])
+    else:
+        num_scales = 1
+
+    if scale_ranges is not None:
+        assert len(scale_ranges) == num_scales
+
+    num_classes = len(results)
+
+    recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
+    aps = np.zeros((num_scales, num_classes), dtype=np.float32)
+    num_gts = np.zeros((num_scales, num_classes), dtype=int)
+    for i, cls_result in enumerate(results):
+        if cls_result['recall'].size > 0:
+            recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
+        aps[:, i] = cls_result['ap']
+        num_gts[:, i] = cls_result['num_gts']
+
+    label_names = class_name
+
+    if not isinstance(mean_ap, list):
+        mean_ap = [mean_ap]
+
+    header = ['class', 'gts', 'dets', 'recall', 'ap']
+    for i in range(num_scales):
+        if scale_ranges is not None:
+            print_log(f'Scale range {scale_ranges[i]}', logger=logger)
+        table_data = [header]
+        for j in range(num_classes):
+            row_data = [
+                label_names[j], num_gts[i, j], results[j]['num_dets'],
+                f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}'
+            ]
+            table_data.append(row_data)
+        table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}'])
+        table = AsciiTable(table_data)
+        table.inner_footing_row_border = True
+        print_log('\n' + table.table, logger=logger)
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ed7ab05c7579a861eb9c8e45eb4a44db9ac71d
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py
@@ -0,0 +1,82 @@
+import mmcv
+import numpy as np
+
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+from .tpfp_chamfer import custom_polyline_score
+from shapely.geometry import LineString, Polygon
+
+
+def custom_tpfp_gen(gen_lines,
+             gt_lines,
+             threshold=0.5,
+             metric='chamfer'):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5).
+        gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4).
+        gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image,
+            of shape (k, 4). Default: None
+        iou_thr (float): IoU threshold to be considered as matched.
+            Default: 0.5.
+        use_legacy_coordinate (bool): Whether to use coordinate system in
+            mmdet v1.x. which means width, height should be
+            calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively.
+            Default: False.
+
+    Returns:
+        tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of
+        each array is (num_scales, m).
+    """
+    if metric == 'chamfer':
+        if threshold >0:
+            threshold= -threshold
+    # else:
+    #     raise NotImplementedError
+
+    # import pdb;pdb.set_trace()
+    num_gens = gen_lines.shape[0]
+    num_gts = gt_lines.shape[0]
+    
+    # tp and fp
+    tp = np.zeros((num_gens), dtype=np.float32)
+    fp = np.zeros((num_gens), dtype=np.float32)
+
+    # if there is no gt bboxes in this image, then all det bboxes
+    # within area range are false positives
+    if num_gts == 0:
+        fp[...] = 1
+        return tp, fp
+    
+    if num_gens == 0:
+        return tp, fp
+    
+    gen_scores = gen_lines[:,-1] # n
+    # distance matrix: n x m
+
+    matrix = custom_polyline_score(
+            gen_lines[:,:-1].reshape(num_gens,-1,2), 
+            gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric)
+    # for each det, the max iou with all gts
+    matrix_max = matrix.max(axis=1)
+    # for each det, which gt overlaps most with it
+    matrix_argmax = matrix.argmax(axis=1)
+    # sort all dets in descending order by scores
+    sort_inds = np.argsort(-gen_scores)
+
+    gt_covered = np.zeros(num_gts, dtype=bool)
+
+    # tp = 0 and fp = 0 means ignore this detected bbox,
+    for i in sort_inds:
+        if matrix_max[i] >= threshold:
+            matched_gt = matrix_argmax[i]
+            if not gt_covered[matched_gt]:
+                gt_covered[matched_gt] = True
+                tp[i] = 1
+            else:
+                fp[i] = 1
+        else:
+            fp[i] = 1
+
+    return tp, fp
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py
new file mode 100644
index 0000000000000000000000000000000000000000..16ebf40633a88a321978de4907e9305639a64cfd
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py
@@ -0,0 +1,127 @@
+# from ..chamfer_dist import ChamferDistance
+import numpy as np
+from shapely.geometry import LineString, Polygon
+from shapely.strtree import STRtree
+from shapely.geometry import CAP_STYLE, JOIN_STYLE
+from scipy.spatial import distance
+
+
+def custom_polyline_score(pred_lines, gt_lines, linewidth=1., metric='chamfer'):
+    '''
+        each line with 1 meter width
+        pred_lines: num_preds, List [npts, 2]
+        gt_lines: num_gts, npts, 2
+        gt_mask: num_gts, npts, 2
+    '''
+    if metric == 'iou':
+        linewidth = 1.0
+    positive_threshold = 1.
+    num_preds = len(pred_lines)
+    num_gts = len(gt_lines)
+    line_length = pred_lines.shape[1]
+
+    # gt_lines = gt_lines + np.array((1.,1.))
+
+    pred_lines_shapely = \
+        [LineString(i).buffer(linewidth,
+            cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+                          for i in pred_lines]
+    gt_lines_shapely =\
+        [LineString(i).buffer(linewidth,
+            cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+                        for i in gt_lines]
+
+    # construct tree
+    tree = STRtree(pred_lines_shapely)
+    index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely))
+
+
+    if metric=='chamfer':
+        chamfer_matrix = np.full((num_preds, num_gts), -100.)
+    elif metric=='iou':
+        chamfer_matrix = np.zeros((num_preds, num_gts),dtype=np.float64)
+    else:
+        raise NotImplementedError
+
+    for i, gt_line in enumerate(gt_lines_shapely):
+
+        for o in tree.query(gt_line):
+            if o.intersects(gt_line):
+                pred_id = index_by_id[id(o)]
+
+                if metric=='chamfer':
+                    dist_mat = distance.cdist(
+                        pred_lines[pred_id], gt_lines[i], 'euclidean')
+                    # import pdb;pdb.set_trace()
+                    valid_ab = dist_mat.min(-1).mean()
+                    valid_ba = dist_mat.min(-2).mean()
+
+                    chamfer_matrix[pred_id, i] = -(valid_ba+valid_ab)/2
+                elif metric=='iou':
+                    inter = o.intersection(gt_line).area
+                    union = o.union(gt_line).area
+                    chamfer_matrix[pred_id, i] = inter / union
+
+    return chamfer_matrix
+
+if __name__ == '__main__':
+    import torch
+
+    line1 = torch.tensor([
+        [1, 5], [3, 5], [5, 5]
+    ])
+
+    line0 = torch.tensor([
+        [3, 6], [4, 8], [5, 6]
+    ])
+
+    line2 = torch.tensor([
+        [1, 4], [3, 4], [5, 4]
+    ])
+
+    line3 = torch.tensor([
+        [4, 4], [3, 3], [5, 3]
+    ])
+
+    gt = torch.stack((line2, line3), dim=0).type(torch.float32)
+    pred = torch.stack((line0, line1), dim=0).type(torch.float32)
+
+    # import ipdb; ipdb.set_trace()
+    import mmcv
+    # with mmcv.Timer():
+    #     gt = upsampler(gt, pts=10)
+    #     pred = upsampler(pred, pts=10)
+
+    import matplotlib.pyplot as plt
+    from shapely.geometry import LineString
+    from descartes import PolygonPatch
+    
+    iou_matrix = vec_iou(pred,gt)
+    print(iou_matrix)
+    # import pdb;pdb.set_trace()
+    score_matrix = custom_polyline_score(pred, gt, linewidth=1., metric='chamfer')
+    print(score_matrix)
+    fig, ax = plt.subplots()
+    for i in gt:
+        i = i.numpy()
+        plt.plot(i[:, 0], i[:, 1], 'o', color='red')
+        plt.plot(i[:, 0], i[:, 1], '-', color='red')
+
+        dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round)
+        patch1 = PolygonPatch(dilated, fc='red', ec='red', alpha=0.5, zorder=-1)
+        ax.add_patch(patch1)
+
+    for i in pred:
+        i = i.numpy()
+        plt.plot(i[:, 0], i[:, 1], 'o', color='blue')
+        plt.plot(i[:, 0], i[:, 1], '-', color='blue')
+
+        dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre)
+        patch1 = PolygonPatch(dilated, fc='blue', ec='blue', alpha=0.5, zorder=-1)
+        ax.add_patch(patch1)
+
+
+    ax.axis('equal')
+
+
+    plt.savefig('test3.png')    
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..02913ab15d03e8762d6eb4eec3b72b2890dd7432
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_dataset.py
@@ -0,0 +1,258 @@
+import copy
+
+import numpy as np
+from mmdet.datasets import DATASETS
+from mmdet3d.datasets import NuScenesDataset
+import mmcv
+from os import path as osp
+from mmdet.datasets import DATASETS
+import torch
+import numpy as np
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from .nuscnes_eval import NuScenesEval_custom
+from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from mmcv.parallel import DataContainer as DC
+import random
+
+
+@DATASETS.register_module()
+class CustomNuScenesDataset(NuScenesDataset):
+    r"""NuScenes Dataset.
+
+    This datset only add camera intrinsics and extrinsics to the results.
+    """
+
+    def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.queue_length = queue_length
+        self.overlap_test = overlap_test
+        self.bev_size = bev_size
+        
+    def prepare_train_data(self, index):
+        """
+        Training data preparation.
+        Args:
+            index (int): Index for accessing the target data.
+        Returns:
+            dict: Training data dict of the corresponding index.
+        """
+        data_queue = []
+
+        # temporal aug
+        prev_indexs_list = list(range(index-self.queue_length, index))
+        random.shuffle(prev_indexs_list)
+        prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True)
+        ##
+
+        input_dict = self.get_data_info(index)
+        if input_dict is None:
+            return None
+        frame_idx = input_dict['frame_idx']
+        scene_token = input_dict['scene_token']
+        self.pre_pipeline(input_dict)
+        example = self.pipeline(input_dict)
+        if self.filter_empty_gt and \
+                (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+            return None
+        data_queue.insert(0, example)
+        for i in prev_indexs_list:
+            i = max(0, i)
+            input_dict = self.get_data_info(i)
+            if input_dict is None:
+                return None
+            if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token:
+                self.pre_pipeline(input_dict)
+                example = self.pipeline(input_dict)
+                if self.filter_empty_gt and \
+                        (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+                    return None
+                frame_idx = input_dict['frame_idx']
+            data_queue.insert(0, copy.deepcopy(example))
+        return self.union2one(data_queue)
+
+    def union2one(self, queue):
+        """
+        convert sample queue into one single sample.
+        """
+        imgs_list = [each['img'].data for each in queue]
+        metas_map = {}
+        prev_pos = None
+        prev_angle = None
+        for i, each in enumerate(queue):
+            metas_map[i] = each['img_metas'].data
+            if i == 0:
+                metas_map[i]['prev_bev'] = False
+                prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+                prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+                metas_map[i]['can_bus'][:3] = 0
+                metas_map[i]['can_bus'][-1] = 0
+            else:
+                metas_map[i]['prev_bev'] = True
+                tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+                tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+                metas_map[i]['can_bus'][:3] -= prev_pos
+                metas_map[i]['can_bus'][-1] -= prev_angle
+                prev_pos = copy.deepcopy(tmp_pos)
+                prev_angle = copy.deepcopy(tmp_angle)
+
+        queue[-1]['img'] = DC(torch.stack(imgs_list),
+                              cpu_only=False, stack=True)
+        queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+        queue = queue[-1]
+        return queue
+
+    def get_data_info(self, index):
+        """Get data info according to the given index.
+
+        Args:
+            index (int): Index of the sample data to get.
+
+        Returns:
+            dict: Data information that will be passed to the data \
+                preprocessing pipelines. It includes the following keys:
+
+                - sample_idx (str): Sample index.
+                - pts_filename (str): Filename of point clouds.
+                - sweeps (list[dict]): Infos of sweeps.
+                - timestamp (float): Sample timestamp.
+                - img_filename (str, optional): Image filename.
+                - lidar2img (list[np.ndarray], optional): Transformations \
+                    from lidar to different cameras.
+                - ann_info (dict): Annotation info.
+        """
+        info = self.data_infos[index]
+        # standard protocal modified from SECOND.Pytorch
+        input_dict = dict(
+            sample_idx=info['token'],
+            pts_filename=info['lidar_path'],
+            sweeps=info['sweeps'],
+            ego2global_translation=info['ego2global_translation'],
+            ego2global_rotation=info['ego2global_rotation'],
+            prev_idx=info['prev'],
+            next_idx=info['next'],
+            scene_token=info['scene_token'],
+            can_bus=info['can_bus'],
+            frame_idx=info['frame_idx'],
+            timestamp=info['timestamp'] / 1e6,
+        )
+
+        if self.modality['use_camera']:
+            image_paths = []
+            lidar2img_rts = []
+            lidar2cam_rts = []
+            cam_intrinsics = []
+            for cam_type, cam_info in info['cams'].items():
+                image_paths.append(cam_info['data_path'])
+                # obtain lidar to image transformation matrix
+                lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+                lidar2cam_t = cam_info[
+                    'sensor2lidar_translation'] @ lidar2cam_r.T
+                lidar2cam_rt = np.eye(4)
+                lidar2cam_rt[:3, :3] = lidar2cam_r.T
+                lidar2cam_rt[3, :3] = -lidar2cam_t
+                intrinsic = cam_info['cam_intrinsic']
+                viewpad = np.eye(4)
+                viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+                lidar2img_rt = (viewpad @ lidar2cam_rt.T)
+                lidar2img_rts.append(lidar2img_rt)
+
+                cam_intrinsics.append(viewpad)
+                lidar2cam_rts.append(lidar2cam_rt.T)
+
+            input_dict.update(
+                dict(
+                    img_filename=image_paths,
+                    lidar2img=lidar2img_rts,
+                    cam_intrinsic=cam_intrinsics,
+                    lidar2cam=lidar2cam_rts,
+                ))
+
+        if not self.test_mode:
+            annos = self.get_ann_info(index)
+            input_dict['ann_info'] = annos
+
+        rotation = Quaternion(input_dict['ego2global_rotation'])
+        translation = input_dict['ego2global_translation']
+        can_bus = input_dict['can_bus']
+        can_bus[:3] = translation
+        can_bus[3:7] = rotation
+        patch_angle = quaternion_yaw(rotation) / np.pi * 180
+        if patch_angle < 0:
+            patch_angle += 360
+        can_bus[-2] = patch_angle / 180 * np.pi
+        can_bus[-1] = patch_angle
+
+        return input_dict
+
+    def __getitem__(self, idx):
+        """Get item from infos according to the given index.
+        Returns:
+            dict: Data dictionary of the corresponding index.
+        """
+        if self.test_mode:
+            return self.prepare_test_data(idx)
+        while True:
+
+            data = self.prepare_train_data(idx)
+            if data is None:
+                idx = self._rand_another(idx)
+                continue
+            return data
+
+    def _evaluate_single(self,
+                         result_path,
+                         logger=None,
+                         metric='bbox',
+                         result_name='pts_bbox'):
+        """Evaluation for a single model in nuScenes protocol.
+
+        Args:
+            result_path (str): Path of the result file.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            metric (str): Metric name used for evaluation. Default: 'bbox'.
+            result_name (str): Result name in the metric prefix.
+                Default: 'pts_bbox'.
+
+        Returns:
+            dict: Dictionary of evaluation details.
+        """
+        from nuscenes import NuScenes
+        self.nusc = NuScenes(version=self.version, dataroot=self.data_root,
+                             verbose=True)
+
+        output_dir = osp.join(*osp.split(result_path)[:-1])
+
+        eval_set_map = {
+            'v1.0-mini': 'mini_val',
+            'v1.0-trainval': 'val',
+        }
+        self.nusc_eval = NuScenesEval_custom(
+            self.nusc,
+            config=self.eval_detection_configs,
+            result_path=result_path,
+            eval_set=eval_set_map[self.version],
+            output_dir=output_dir,
+            verbose=True,
+            overlap_test=self.overlap_test,
+            data_infos=self.data_infos
+        )
+        self.nusc_eval.main(plot_examples=0, render_curves=False)
+        # record metrics
+        metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
+        detail = dict()
+        metric_prefix = f'{result_name}_NuScenes'
+        for name in self.CLASSES:
+            for k, v in metrics['label_aps'][name].items():
+                val = float('{:.4f}'.format(v))
+                detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+            for k, v in metrics['label_tp_errors'][name].items():
+                val = float('{:.4f}'.format(v))
+                detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+            for k, v in metrics['tp_errors'].items():
+                val = float('{:.4f}'.format(v))
+                detail['{}/{}'.format(metric_prefix,
+                                      self.ErrNameMapping[k])] = val
+        detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+        detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+        return detail
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_map_dataset.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_map_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..5becf7c79f2001ca9cab224c26dc43ade2ff4cf2
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_map_dataset.py
@@ -0,0 +1,1580 @@
+import copy
+
+import numpy as np
+from mmdet.datasets import DATASETS
+from mmdet3d.datasets import NuScenesDataset
+import mmcv
+import os
+from os import path as osp
+from mmdet.datasets import DATASETS
+import torch
+import numpy as np
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from .nuscnes_eval import NuScenesEval_custom
+from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from mmcv.parallel import DataContainer as DC
+import random
+
+from .nuscenes_dataset import CustomNuScenesDataset
+from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from shapely import affinity, ops
+from shapely.geometry import LineString, box, MultiPolygon, MultiLineString
+from mmdet.datasets.pipelines import to_tensor
+import json
+
+
+def add_rotation_noise(extrinsics, std=0.01, mean=0.0):
+    #n = extrinsics.shape[0]
+    noise_angle = torch.normal(mean, std=std, size=(3,))
+    # extrinsics[:, 0:3, 0:3] *= (1 + noise)
+    sin_noise = torch.sin(noise_angle)
+    cos_noise = torch.cos(noise_angle)
+    rotation_matrix = torch.eye(4).view(4, 4)
+    #  rotation_matrix[]
+    rotation_matrix_x = rotation_matrix.clone()
+    rotation_matrix_x[1, 1] = cos_noise[0]
+    rotation_matrix_x[1, 2] = sin_noise[0]
+    rotation_matrix_x[2, 1] = -sin_noise[0]
+    rotation_matrix_x[2, 2] = cos_noise[0]
+
+    rotation_matrix_y = rotation_matrix.clone()
+    rotation_matrix_y[0, 0] = cos_noise[1]
+    rotation_matrix_y[0, 2] = -sin_noise[1]
+    rotation_matrix_y[2, 0] = sin_noise[1]
+    rotation_matrix_y[2, 2] = cos_noise[1]
+
+    rotation_matrix_z = rotation_matrix.clone()
+    rotation_matrix_z[0, 0] = cos_noise[2]
+    rotation_matrix_z[0, 1] = sin_noise[2]
+    rotation_matrix_z[1, 0] = -sin_noise[2]
+    rotation_matrix_z[1, 1] = cos_noise[2]
+
+    rotation_matrix = rotation_matrix_x @ rotation_matrix_y @ rotation_matrix_z
+
+    rotation = torch.from_numpy(extrinsics.astype(np.float32))
+    rotation[:3, -1] = 0.0
+    # import pdb;pdb.set_trace()
+    rotation = rotation_matrix @ rotation
+    extrinsics[:3, :3] = rotation[:3, :3].numpy()
+    return extrinsics
+
+
+def add_translation_noise(extrinsics, std=0.01, mean=0.0):
+    # n = extrinsics.shape[0]
+    noise = torch.normal(mean, std=std, size=(3,))
+    extrinsics[0:3, -1] += noise.numpy()
+    return extrinsics
+
+class LiDARInstanceLines(object):
+    """Line instance in LIDAR coordinates
+
+    """
+    def __init__(self, 
+                 instance_line_list, 
+                 sample_dist=1,
+                 num_samples=250,
+                 padding=False,
+                 fixed_num=-1,
+                 padding_value=-10000,
+                 patch_size=None):
+        assert isinstance(instance_line_list, list)
+        assert patch_size is not None
+        if len(instance_line_list) != 0:
+            assert isinstance(instance_line_list[0], LineString)
+        self.patch_size = patch_size
+        self.max_x = self.patch_size[1] / 2
+        self.max_y = self.patch_size[0] / 2
+        self.sample_dist = sample_dist
+        self.num_samples = num_samples
+        self.padding = padding
+        self.fixed_num = fixed_num
+        self.padding_value = padding_value
+
+        self.instance_list = instance_line_list
+
+    @property
+    def start_end_points(self):
+        """
+        return torch.Tensor([N,4]), in xstart, ystart, xend, yend form
+        """
+        assert len(self.instance_list) != 0
+        instance_se_points_list = []
+        for instance in self.instance_list:
+            se_points = []
+            se_points.extend(instance.coords[0])
+            se_points.extend(instance.coords[-1])
+            instance_se_points_list.append(se_points)
+        instance_se_points_array = np.array(instance_se_points_list)
+        instance_se_points_tensor = to_tensor(instance_se_points_array)
+        instance_se_points_tensor = instance_se_points_tensor.to(
+                                dtype=torch.float32)
+        instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x)
+        instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y)
+        instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x)
+        instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y)
+        return instance_se_points_tensor
+
+    @property
+    def bbox(self):
+        """
+        return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form
+        """
+        assert len(self.instance_list) != 0
+        instance_bbox_list = []
+        for instance in self.instance_list:
+            # bounds is bbox: [xmin, ymin, xmax, ymax]
+            instance_bbox_list.append(instance.bounds)
+        instance_bbox_array = np.array(instance_bbox_list)
+        instance_bbox_tensor = to_tensor(instance_bbox_array)
+        instance_bbox_tensor = instance_bbox_tensor.to(
+                            dtype=torch.float32)
+        instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x)
+        instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y)
+        instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x)
+        instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y)
+        return instance_bbox_tensor
+
+    @property
+    def fixed_num_sampled_points(self):
+        """
+        return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form
+            N means the num of instances
+        """
+        assert len(self.instance_list) != 0
+        instance_points_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+            instance_points_list.append(sampled_points)
+        instance_points_array = np.array(instance_points_list)
+        instance_points_tensor = to_tensor(instance_points_array)
+        instance_points_tensor = instance_points_tensor.to(
+                            dtype=torch.float32)
+        instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+        instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+        return instance_points_tensor
+
+    @property
+    def fixed_num_sampled_points_ambiguity(self):
+        """
+        return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form
+            N means the num of instances
+        """
+        assert len(self.instance_list) != 0
+        instance_points_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+            instance_points_list.append(sampled_points)
+        instance_points_array = np.array(instance_points_list)
+        instance_points_tensor = to_tensor(instance_points_array)
+        instance_points_tensor = instance_points_tensor.to(
+                            dtype=torch.float32)
+        instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+        instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+        instance_points_tensor = instance_points_tensor.unsqueeze(1)
+        return instance_points_tensor
+
+    @property
+    def fixed_num_sampled_points_torch(self):
+        """
+        return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form
+            N means the num of instances
+        """
+        assert len(self.instance_list) != 0
+        instance_points_list = []
+        for instance in self.instance_list:
+            # distances = np.linspace(0, instance.length, self.fixed_num)
+            # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+            poly_pts = to_tensor(np.array(list(instance.coords)))
+            poly_pts = poly_pts.unsqueeze(0).permute(0,2,1)
+            sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True)
+            sampled_pts = sampled_pts.permute(0,2,1).squeeze(0)
+            instance_points_list.append(sampled_pts)
+        # instance_points_array = np.array(instance_points_list)
+        # instance_points_tensor = to_tensor(instance_points_array)
+        instance_points_tensor = torch.stack(instance_points_list,dim=0)
+        instance_points_tensor = instance_points_tensor.to(
+                            dtype=torch.float32)
+        instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+        instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+        return instance_points_tensor
+
+    @property
+    def shift_fixed_num_sampled_points(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points
+        instances_list = []
+        is_poly = False
+        # is_line = False
+        # import pdb;pdb.set_trace()
+        for fixed_num_pts in fixed_num_sampled_points:
+            # [fixed_num, 2]
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            fixed_num = fixed_num_pts.shape[0]
+            shift_pts_list = []
+            if is_poly:
+                # import pdb;pdb.set_trace()
+                for shift_right_i in range(fixed_num):
+                    shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+                # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v1(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points
+        instances_list = []
+        is_poly = False
+        # is_line = False
+        # import pdb;pdb.set_trace()
+        for fixed_num_pts in fixed_num_sampled_points:
+            # [fixed_num, 2]
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            pts_num = fixed_num_pts.shape[0]
+            shift_num = pts_num - 1
+            if is_poly:
+                pts_to_shift = fixed_num_pts[:-1,:]
+            shift_pts_list = []
+            if is_poly:
+                for shift_right_i in range(shift_num):
+                    shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            if is_poly:
+                _, _, num_coords = shift_pts.shape
+                tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords))
+                tmp_shift_pts[:,:-1,:] = shift_pts
+                tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+                shift_pts = tmp_shift_pts
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+                # padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                # sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v2(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        assert len(self.instance_list) != 0
+        instances_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            poly_pts = np.array(list(instance.coords))
+            start_pts = poly_pts[0]
+            end_pts = poly_pts[-1]
+            is_poly = np.equal(start_pts, end_pts)
+            is_poly = is_poly.all()
+            shift_pts_list = []
+            pts_num, coords_num = poly_pts.shape
+            shift_num = pts_num - 1
+            final_shift_num = self.fixed_num - 1
+            if is_poly:
+                pts_to_shift = poly_pts[:-1,:]
+                for shift_right_i in range(shift_num):
+                    shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+                    pts_to_concat = shift_pts[0]
+                    pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+                    shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+                    shift_instance = LineString(shift_pts)
+                    shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+                    shift_pts_list.append(shift_sampled_points)
+                # import pdb;pdb.set_trace()
+            else:
+                sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+                flip_sampled_points = np.flip(sampled_points, axis=0)
+                shift_pts_list.append(sampled_points)
+                shift_pts_list.append(flip_sampled_points)
+            
+            multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+            shifts_num,_,_ = multi_shifts_pts.shape
+
+            if shifts_num > final_shift_num:
+                index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False)
+                multi_shifts_pts = multi_shifts_pts[index]
+            
+            multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+            multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+                            dtype=torch.float32)
+            
+            multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+            multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+            # if not is_poly:
+            if multi_shifts_pts_tensor.shape[0] < final_shift_num:
+                padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value)
+                multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+            instances_list.append(multi_shifts_pts_tensor)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v3(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        assert len(self.instance_list) != 0
+        instances_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            poly_pts = np.array(list(instance.coords))
+            start_pts = poly_pts[0]
+            end_pts = poly_pts[-1]
+            is_poly = np.equal(start_pts, end_pts)
+            is_poly = is_poly.all()
+            shift_pts_list = []
+            pts_num, coords_num = poly_pts.shape
+            shift_num = pts_num - 1
+            final_shift_num = self.fixed_num - 1
+            if is_poly:
+                pts_to_shift = poly_pts[:-1,:]
+                for shift_right_i in range(shift_num):
+                    shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+                    pts_to_concat = shift_pts[0]
+                    pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+                    shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+                    shift_instance = LineString(shift_pts)
+                    shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+                    shift_pts_list.append(shift_sampled_points)
+                flip_pts_to_shift = np.flip(pts_to_shift, axis=0)
+                for shift_right_i in range(shift_num):
+                    shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0)
+                    pts_to_concat = shift_pts[0]
+                    pts_to_concat = np.expand_dims(pts_to_concat,axis=0)
+                    shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0)
+                    shift_instance = LineString(shift_pts)
+                    shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+                    shift_pts_list.append(shift_sampled_points)
+            else:
+                sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+                flip_sampled_points = np.flip(sampled_points, axis=0)
+                shift_pts_list.append(sampled_points)
+                shift_pts_list.append(flip_sampled_points)
+            
+            multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+            shifts_num,_,_ = multi_shifts_pts.shape
+            if shifts_num > 2*final_shift_num:
+                index = np.random.choice(shift_num, final_shift_num, replace=False)
+                flip0_shifts_pts = multi_shifts_pts[index]
+                flip1_shifts_pts = multi_shifts_pts[index+shift_num]
+                multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0)
+            
+            multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+            multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+                            dtype=torch.float32)
+            
+            multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+            multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+            if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num:
+                padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value)
+                multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+            instances_list.append(multi_shifts_pts_tensor)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v4(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points
+        instances_list = []
+        is_poly = False
+        for fixed_num_pts in fixed_num_sampled_points:
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            pts_num = fixed_num_pts.shape[0]
+            shift_num = pts_num - 1
+            shift_pts_list = []
+            if is_poly:
+                pts_to_shift = fixed_num_pts[:-1,:]
+                for shift_right_i in range(shift_num):
+                    shift_pts_list.append(pts_to_shift.roll(shift_right_i,0))
+                flip_pts_to_shift = pts_to_shift.flip(0)
+                for shift_right_i in range(shift_num):
+                    shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            if is_poly:
+                _, _, num_coords = shift_pts.shape
+                tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords))
+                tmp_shift_pts[:,:-1,:] = shift_pts
+                tmp_shift_pts[:,-1,:] = shift_pts[:,0,:]
+                shift_pts = tmp_shift_pts
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_v5(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        assert len(self.instance_list) != 0
+        instances_list = []
+        for instance in self.instance_list:
+            distances = np.linspace(0, instance.length, self.fixed_num)
+            sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+            is_poly = np.equal(sampled_points[0], sampled_points[-1])
+            is_poly = is_poly.all()
+            shift_pts_list = []
+            if is_poly:
+                pts_to_shift = sampled_points[:-1,:]
+                for shift_right_i in range(0, self.fixed_num, 2):
+                    shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0)
+                    shift_pts = np.vstack((shift_pts, shift_pts[0]))
+                    shift_pts_list.append(shift_pts)
+                    shift_pts_list.append(np.flip(shift_pts, axis=0))
+            else:
+                shift_pts_list.append(sampled_points)
+                shift_pts_list.append(np.flip(sampled_points, axis=0))
+            
+            multi_shifts_pts = np.stack(shift_pts_list,axis=0)
+            multi_shifts_pts_tensor = to_tensor(multi_shifts_pts)
+            multi_shifts_pts_tensor = multi_shifts_pts_tensor.to(
+                            dtype=torch.float32)
+            
+            multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x)
+            multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y)
+            # if not is_poly:
+            if multi_shifts_pts_tensor.shape[0] < self.fixed_num:
+                padding = torch.full([self.fixed_num - multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value)
+                multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0)
+            instances_list.append(multi_shifts_pts_tensor)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+    @property
+    def shift_fixed_num_sampled_points_torch(self):
+        """
+        return  [instances_num, num_shifts, fixed_num, 2]
+        """
+        fixed_num_sampled_points = self.fixed_num_sampled_points_torch
+        instances_list = []
+        is_poly = False
+
+        for fixed_num_pts in fixed_num_sampled_points:
+            is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1])
+            fixed_num = fixed_num_pts.shape[0]
+            shift_pts_list = []
+            if is_poly:
+                for shift_right_i in range(fixed_num):
+                    shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0))
+            else:
+                shift_pts_list.append(fixed_num_pts)
+                shift_pts_list.append(fixed_num_pts.flip(0))
+            shift_pts = torch.stack(shift_pts_list,dim=0)
+
+            shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x)
+            shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y)
+
+            if not is_poly:
+                padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value)
+                shift_pts = torch.cat([shift_pts,padding],dim=0)
+            instances_list.append(shift_pts)
+        instances_tensor = torch.stack(instances_list, dim=0)
+        instances_tensor = instances_tensor.to(
+                            dtype=torch.float32)
+        return instances_tensor
+
+
+
+class VectorizedLocalMap(object):
+    CLASS2LABEL = {
+        'road_divider': 0,
+        'lane_divider': 0,
+        'ped_crossing': 1,
+        'contours': 2,
+        'others': -1
+    }
+    def __init__(self,
+                 dataroot,
+                 patch_size,
+                 map_classes=('divider','ped_crossing','boundary'),
+                 line_classes=('road_divider', 'lane_divider'),
+                 ped_crossing_classes=('ped_crossing'),
+                 contour_classes=('road_segment', 'lane'),
+                 sample_dist=1,
+                 num_samples=250,
+                 padding=False,
+                 fixed_ptsnum_per_line=-1,
+                 padding_value=-10000,):
+        '''
+        Args:
+            fixed_ptsnum_per_line = -1 : no fixed num
+        '''
+        super().__init__()
+        self.data_root = dataroot
+        self.MAPS = ['boston-seaport', 'singapore-hollandvillage',
+                     'singapore-onenorth', 'singapore-queenstown']
+        self.vec_classes = map_classes
+        self.line_classes = line_classes
+        self.ped_crossing_classes = ped_crossing_classes
+        self.polygon_classes = contour_classes
+        self.nusc_maps = {}
+        self.map_explorer = {}
+        for loc in self.MAPS:
+            self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc)
+            self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc])
+
+        self.patch_size = patch_size
+        self.sample_dist = sample_dist
+        self.num_samples = num_samples
+        self.padding = padding
+        self.fixed_num = fixed_ptsnum_per_line
+        self.padding_value = padding_value
+
+    def gen_vectorized_samples(self, location, lidar2global_translation, lidar2global_rotation):
+        '''
+        use lidar2global to get gt map layers
+        '''
+        
+        map_pose = lidar2global_translation[:2]
+        rotation = Quaternion(lidar2global_rotation)
+
+        patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1])
+        patch_angle = quaternion_yaw(rotation) / np.pi * 180
+        vectors = []
+        for vec_class in self.vec_classes:
+            if vec_class == 'divider':
+                line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location)
+                line_instances_dict = self.line_geoms_to_instances(line_geom)     
+                for line_type, instances in line_instances_dict.items():
+                    for instance in instances:
+                        vectors.append((instance, self.CLASS2LABEL.get(line_type, -1)))
+            elif vec_class == 'ped_crossing':
+                ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location)
+                ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom)
+                for instance in ped_instance_list:
+                    vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1)))
+            elif vec_class == 'boundary':
+                polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location)
+                poly_bound_list = self.poly_geoms_to_instances(polygon_geom)
+                for contour in poly_bound_list:
+                    vectors.append((contour, self.CLASS2LABEL.get('contours', -1)))
+            else:
+                raise ValueError(f'WRONG vec_class: {vec_class}')
+
+        filtered_vectors = []
+        gt_pts_loc_3d = []
+        gt_pts_num_3d = []
+        gt_labels = []
+        gt_instance = []
+        for instance, type in vectors:
+            if type != -1:
+                gt_instance.append(instance)
+                gt_labels.append(type)
+        
+        gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist,
+                        self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size)
+
+        anns_results = dict(
+            gt_vecs_pts_loc=gt_instance,
+            gt_vecs_label=gt_labels,
+
+        )
+        return anns_results
+
+    def get_map_geom(self, patch_box, patch_angle, layer_names, location):
+        map_geom = []
+        for layer_name in layer_names:
+            if layer_name in self.line_classes:
+                geoms = self.get_divider_line(patch_box, patch_angle, layer_name, location)
+                map_geom.append((layer_name, geoms))
+            elif layer_name in self.polygon_classes:
+                geoms = self.get_contour_line(patch_box, patch_angle, layer_name, location)
+                map_geom.append((layer_name, geoms))
+            elif layer_name in self.ped_crossing_classes:
+                geoms = self.get_ped_crossing_line(patch_box, patch_angle, location)
+                map_geom.append((layer_name, geoms))
+        return map_geom
+
+    def _one_type_line_geom_to_vectors(self, line_geom):
+        line_vectors = []
+        
+        for line in line_geom:
+            if not line.is_empty:
+                if line.geom_type == 'MultiLineString':
+                    for single_line in line.geoms:
+                        line_vectors.append(self.sample_pts_from_line(single_line))
+                elif line.geom_type == 'LineString':
+                    line_vectors.append(self.sample_pts_from_line(line))
+                else:
+                    raise NotImplementedError
+        return line_vectors
+
+    def _one_type_line_geom_to_instances(self, line_geom):
+        line_instances = []
+        
+        for line in line_geom:
+            if not line.is_empty:
+                if line.geom_type == 'MultiLineString':
+                    for single_line in line.geoms:
+                        line_instances.append(single_line)
+                elif line.geom_type == 'LineString':
+                    line_instances.append(line)
+                else:
+                    raise NotImplementedError
+        return line_instances
+
+    def poly_geoms_to_vectors(self, polygon_geom):
+        roads = polygon_geom[0][1]
+        lanes = polygon_geom[1][1]
+        union_roads = ops.unary_union(roads)
+        union_lanes = ops.unary_union(lanes)
+        union_segments = ops.unary_union([union_roads, union_lanes])
+        max_x = self.patch_size[1] / 2
+        max_y = self.patch_size[0] / 2
+        local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+        exteriors = []
+        interiors = []
+        if union_segments.geom_type != 'MultiPolygon':
+            union_segments = MultiPolygon([union_segments])
+        for poly in union_segments.geoms:
+            exteriors.append(poly.exterior)
+            for inter in poly.interiors:
+                interiors.append(inter)
+
+        results = []
+        for ext in exteriors:
+            if ext.is_ccw:
+                ext.coords = list(ext.coords)[::-1]
+            lines = ext.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        for inter in interiors:
+            if not inter.is_ccw:
+                inter.coords = list(inter.coords)[::-1]
+            lines = inter.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        return self._one_type_line_geom_to_vectors(results)
+
+    def ped_poly_geoms_to_instances(self, ped_geom):
+        ped = ped_geom[0][1]
+        union_segments = ops.unary_union(ped)
+        max_x = self.patch_size[1] / 2
+        max_y = self.patch_size[0] / 2
+        local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2)
+        exteriors = []
+        interiors = []
+        if union_segments.geom_type != 'MultiPolygon':
+            union_segments = MultiPolygon([union_segments])
+        for poly in union_segments.geoms:
+            exteriors.append(poly.exterior)
+            for inter in poly.interiors:
+                interiors.append(inter)
+
+        results = []
+        for ext in exteriors:
+            if ext.is_ccw:
+                ext.coords = list(ext.coords)[::-1]
+            lines = ext.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        for inter in interiors:
+            if not inter.is_ccw:
+                inter.coords = list(inter.coords)[::-1]
+            lines = inter.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        return self._one_type_line_geom_to_instances(results)
+
+
+    def poly_geoms_to_instances(self, polygon_geom):
+        roads = polygon_geom[0][1]
+        lanes = polygon_geom[1][1]
+        union_roads = ops.unary_union(roads)
+        union_lanes = ops.unary_union(lanes)
+        union_segments = ops.unary_union([union_roads, union_lanes])
+        max_x = self.patch_size[1] / 2
+        max_y = self.patch_size[0] / 2
+        local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+        exteriors = []
+        interiors = []
+        if union_segments.geom_type != 'MultiPolygon':
+            union_segments = MultiPolygon([union_segments])
+        for poly in union_segments.geoms:
+            exteriors.append(poly.exterior)
+            for inter in poly.interiors:
+                interiors.append(inter)
+
+        results = []
+        for ext in exteriors:
+            if ext.is_ccw:
+                ext.coords = list(ext.coords)[::-1]
+            lines = ext.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        for inter in interiors:
+            if not inter.is_ccw:
+                inter.coords = list(inter.coords)[::-1]
+            lines = inter.intersection(local_patch)
+            if isinstance(lines, MultiLineString):
+                lines = ops.linemerge(lines)
+            results.append(lines)
+
+        return self._one_type_line_geom_to_instances(results)
+
+    def line_geoms_to_vectors(self, line_geom):
+        line_vectors_dict = dict()
+        for line_type, a_type_of_lines in line_geom:
+            one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines)
+            line_vectors_dict[line_type] = one_type_vectors
+
+        return line_vectors_dict
+    def line_geoms_to_instances(self, line_geom):
+        line_instances_dict = dict()
+        for line_type, a_type_of_lines in line_geom:
+            one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines)
+            line_instances_dict[line_type] = one_type_instances
+
+        return line_instances_dict
+
+    def ped_geoms_to_vectors(self, ped_geom):
+        ped_geom = ped_geom[0][1]
+        union_ped = ops.unary_union(ped_geom)
+        if union_ped.geom_type != 'MultiPolygon':
+            union_ped = MultiPolygon([union_ped])
+
+        max_x = self.patch_size[1] / 2
+        max_y = self.patch_size[0] / 2
+        local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2)
+        results = []
+        for ped_poly in union_ped:
+            # rect = ped_poly.minimum_rotated_rectangle
+            ext = ped_poly.exterior
+            if not ext.is_ccw:
+                ext.coords = list(ext.coords)[::-1]
+            lines = ext.intersection(local_patch)
+            results.append(lines)
+
+        return self._one_type_line_geom_to_vectors(results)
+
+    def get_contour_line(self,patch_box,patch_angle,layer_name,location):
+        if layer_name not in self.map_explorer[location].map_api.non_geometric_polygon_layers:
+            raise ValueError('{} is not a polygonal layer'.format(layer_name))
+
+        patch_x = patch_box[0]
+        patch_y = patch_box[1]
+
+        patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle)
+
+        records = getattr(self.map_explorer[location].map_api, layer_name)
+
+        polygon_list = []
+        if layer_name == 'drivable_area':
+            for record in records:
+                polygons = [self.map_explorer[location].map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']]
+
+                for polygon in polygons:
+                    new_polygon = polygon.intersection(patch)
+                    if not new_polygon.is_empty:
+                        new_polygon = affinity.rotate(new_polygon, -patch_angle,
+                                                      origin=(patch_x, patch_y), use_radians=False)
+                        new_polygon = affinity.affine_transform(new_polygon,
+                                                                [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+                        if new_polygon.geom_type == 'Polygon':
+                            new_polygon = MultiPolygon([new_polygon])
+                        polygon_list.append(new_polygon)
+
+        else:
+            for record in records:
+                polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token'])
+
+                if polygon.is_valid:
+                    new_polygon = polygon.intersection(patch)
+                    if not new_polygon.is_empty:
+                        new_polygon = affinity.rotate(new_polygon, -patch_angle,
+                                                      origin=(patch_x, patch_y), use_radians=False)
+                        new_polygon = affinity.affine_transform(new_polygon,
+                                                                [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+                        if new_polygon.geom_type == 'Polygon':
+                            new_polygon = MultiPolygon([new_polygon])
+                        polygon_list.append(new_polygon)
+
+        return polygon_list
+
+    def get_divider_line(self,patch_box,patch_angle,layer_name,location):
+        if layer_name not in self.map_explorer[location].map_api.non_geometric_line_layers:
+            raise ValueError("{} is not a line layer".format(layer_name))
+
+        if layer_name == 'traffic_light':
+            return None
+
+        patch_x = patch_box[0]
+        patch_y = patch_box[1]
+
+        patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle)
+
+        line_list = []
+        records = getattr(self.map_explorer[location].map_api, layer_name)
+        for record in records:
+            line = self.map_explorer[location].map_api.extract_line(record['line_token'])
+            if line.is_empty:  # Skip lines without nodes.
+                continue
+
+            new_line = line.intersection(patch)
+            if not new_line.is_empty:
+                new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False)
+                new_line = affinity.affine_transform(new_line,
+                                                     [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+                line_list.append(new_line)
+
+        return line_list
+
+    def get_ped_crossing_line(self, patch_box, patch_angle, location):
+        patch_x = patch_box[0]
+        patch_y = patch_box[1]
+
+        patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle)
+        polygon_list = []
+        records = getattr(self.map_explorer[location].map_api, 'ped_crossing')
+        # records = getattr(self.nusc_maps[location], 'ped_crossing')
+        for record in records:
+            polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token'])
+            if polygon.is_valid:
+                new_polygon = polygon.intersection(patch)
+                if not new_polygon.is_empty:
+                    new_polygon = affinity.rotate(new_polygon, -patch_angle,
+                                                      origin=(patch_x, patch_y), use_radians=False)
+                    new_polygon = affinity.affine_transform(new_polygon,
+                                                            [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y])
+                    if new_polygon.geom_type == 'Polygon':
+                        new_polygon = MultiPolygon([new_polygon])
+                    polygon_list.append(new_polygon)
+
+        return polygon_list
+
+    def sample_pts_from_line(self, line):
+        if self.fixed_num < 0:
+            distances = np.arange(0, line.length, self.sample_dist)
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+        else:
+            # fixed number of points, so distance is line.length / self.fixed_num
+            distances = np.linspace(0, line.length, self.fixed_num)
+            sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+
+
+        num_valid = len(sampled_points)
+
+        if not self.padding or self.fixed_num > 0:
+            return sampled_points, num_valid
+
+        # fixed distance sampling need padding!
+        num_valid = len(sampled_points)
+
+        if self.fixed_num < 0:
+            if num_valid < self.num_samples:
+                padding = np.zeros((self.num_samples - len(sampled_points), 2))
+                sampled_points = np.concatenate([sampled_points, padding], axis=0)
+            else:
+                sampled_points = sampled_points[:self.num_samples, :]
+                num_valid = self.num_samples
+
+
+        return sampled_points, num_valid
+
+
+@DATASETS.register_module()
+class CustomNuScenesLocalMapDataset(CustomNuScenesDataset):
+    r"""NuScenes Dataset.
+
+    This datset add static map elements
+    """
+    MAPCLASSES = ('divider',)
+    def __init__(self,
+                 map_ann_file=None, 
+                 queue_length=4, 
+                 bev_size=(200, 200), 
+                 pc_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
+                 overlap_test=False, 
+                 fixed_ptsnum_per_line=-1,
+                 eval_use_same_gt_sample_num_flag=False,
+                 padding_value=-10000,
+                 map_classes=None,
+                 noise='None',
+                 noise_std=0,
+                 *args, 
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.map_ann_file = map_ann_file
+
+        self.queue_length = queue_length
+        self.overlap_test = overlap_test
+        self.bev_size = bev_size
+
+        self.MAPCLASSES = self.get_map_classes(map_classes)
+        self.NUM_MAPCLASSES = len(self.MAPCLASSES)
+        self.pc_range = pc_range
+        patch_h = pc_range[4]-pc_range[1]
+        patch_w = pc_range[3]-pc_range[0]
+        self.patch_size = (patch_h, patch_w)
+        self.padding_value = padding_value
+        self.fixed_num = fixed_ptsnum_per_line
+        self.eval_use_same_gt_sample_num_flag = eval_use_same_gt_sample_num_flag
+        self.vector_map = VectorizedLocalMap(kwargs['data_root'], 
+                            patch_size=self.patch_size, map_classes=self.MAPCLASSES, 
+                            fixed_ptsnum_per_line=fixed_ptsnum_per_line,
+                            padding_value=self.padding_value)
+        self.is_vis_on_test = False
+        self.noise = noise
+        self.noise_std = noise_std
+    @classmethod
+    def get_map_classes(cls, map_classes=None):
+        """Get class names of current dataset.
+
+        Args:
+            classes (Sequence[str] | str | None): If classes is None, use
+                default CLASSES defined by builtin dataset. If classes is a
+                string, take it as a file name. The file contains the name of
+                classes where each line contains one class name. If classes is
+                a tuple or list, override the CLASSES defined by the dataset.
+
+        Return:
+            list[str]: A list of class names.
+        """
+        if map_classes is None:
+            return cls.MAPCLASSES
+
+        if isinstance(map_classes, str):
+            # take it as a file path
+            class_names = mmcv.list_from_file(map_classes)
+        elif isinstance(map_classes, (tuple, list)):
+            class_names = map_classes
+        else:
+            raise ValueError(f'Unsupported type {type(map_classes)} of map classes.')
+
+        return class_names
+    def vectormap_pipeline(self, example, input_dict):
+        '''
+        `example` type: <class 'dict'>
+            keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img';
+                  all keys type is 'DataContainer';
+                  'img_metas' cpu_only=True, type is dict, others are false;
+                  'gt_labels_3d' shape torch.size([num_samples]), stack=False,
+                                padding_value=0, cpu_only=False
+                  'gt_bboxes_3d': stack=False, cpu_only=True
+        '''
+        # import pdb;pdb.set_trace()
+        lidar2ego = np.eye(4)
+        lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix
+        lidar2ego[:3, 3] = input_dict['lidar2ego_translation']
+        ego2global = np.eye(4)
+        ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix
+        ego2global[:3, 3] = input_dict['ego2global_translation']
+
+        lidar2global = ego2global @ lidar2ego
+
+        lidar2global_translation = list(lidar2global[:3,3])
+        lidar2global_rotation = list(Quaternion(matrix=lidar2global).q)
+
+        location = input_dict['map_location']
+        ego2global_translation = input_dict['ego2global_translation']
+        ego2global_rotation = input_dict['ego2global_rotation']
+        anns_results = self.vector_map.gen_vectorized_samples(location, lidar2global_translation, lidar2global_rotation)
+        
+        '''
+        anns_results, type: dict
+            'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates
+            'gt_vecs_pts_num': list[num_vecs], vec with num_points
+            'gt_vecs_label': list[num_vecs], vec with cls index
+        '''
+        gt_vecs_label = to_tensor(anns_results['gt_vecs_label'])
+        if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines):
+            gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc']
+        else:
+            gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc'])
+            try:
+                gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32)
+            except Exception as e:
+                # empty tensor, will be passed in train, 
+                # but we preserve it for test
+                gt_vecs_pts_loc = gt_vecs_pts_loc
+        example['gt_labels_3d'] = DC(gt_vecs_label, cpu_only=False)
+        example['gt_bboxes_3d'] = DC(gt_vecs_pts_loc, cpu_only=True)
+        return example
+
+    def prepare_train_data(self, index):
+        """
+        Training data preparation.
+        Args:
+            index (int): Index for accessing the target data.
+        Returns:
+            dict: Training data dict of the corresponding index.
+        """
+        queue = []
+
+        index_list = list(range(index-self.queue_length, index))
+        random.shuffle(index_list)
+        index_list = sorted(index_list[1:])
+        index_list.append(index)
+
+        for i in index_list:
+            i = max(0, i)
+            input_dict = self.get_data_info(i)
+            if input_dict is None:
+                return None
+            self.pre_pipeline(input_dict)
+            example = self.pipeline(input_dict)
+            example = self.vectormap_pipeline(example,input_dict)
+            if self.filter_empty_gt and \
+                    (example is None or ~(example['gt_labels_3d']._data != -1).any()):
+                return None
+            queue.append(example)
+        return self.union2one(queue)
+
+    def union2one(self, queue):
+        """
+        convert sample queue into one single sample.
+        """
+        imgs_list = [each['img'].data for each in queue]
+        metas_map = {}
+        prev_scene_token = None
+        prev_pos = None
+        prev_angle = None
+        for i, each in enumerate(queue):
+            metas_map[i] = each['img_metas'].data
+            if metas_map[i]['scene_token'] != prev_scene_token:
+                metas_map[i]['prev_bev_exists'] = False
+                prev_scene_token = metas_map[i]['scene_token']
+                prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+                prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+                metas_map[i]['can_bus'][:3] = 0
+                metas_map[i]['can_bus'][-1] = 0
+            else:
+                metas_map[i]['prev_bev_exists'] = True
+                tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
+                tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
+                metas_map[i]['can_bus'][:3] -= prev_pos
+                metas_map[i]['can_bus'][-1] -= prev_angle
+                prev_pos = copy.deepcopy(tmp_pos)
+                prev_angle = copy.deepcopy(tmp_angle)
+
+        queue[-1]['img'] = DC(torch.stack(imgs_list),
+                              cpu_only=False, stack=True)
+        queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
+        queue = queue[-1]
+        return queue
+
+    def get_data_info(self, index):
+        """Get data info according to the given index.
+
+        Args:
+            index (int): Index of the sample data to get.
+
+        Returns:
+            dict: Data information that will be passed to the data \
+                preprocessing pipelines. It includes the following keys:
+
+                - sample_idx (str): Sample index.
+                - pts_filename (str): Filename of point clouds.
+                - sweeps (list[dict]): Infos of sweeps.
+                - timestamp (float): Sample timestamp.
+                - img_filename (str, optional): Image filename.
+                - lidar2img (list[np.ndarray], optional): Transformations \
+                    from lidar to different cameras.
+                - ann_info (dict): Annotation info.
+        """
+        info = self.data_infos[index]
+        # standard protocal modified from SECOND.Pytorch
+        input_dict = dict(
+            sample_idx=info['token'],
+            pts_filename=info['lidar_path'],
+            lidar_path=info["lidar_path"],
+            sweeps=info['sweeps'],
+            ego2global_translation=info['ego2global_translation'],
+            ego2global_rotation=info['ego2global_rotation'],
+            lidar2ego_translation=info['lidar2ego_translation'],
+            lidar2ego_rotation=info['lidar2ego_rotation'],
+            prev_idx=info['prev'],
+            next_idx=info['next'],
+            scene_token=info['scene_token'],
+            can_bus=info['can_bus'],
+            frame_idx=info['frame_idx'],
+            timestamp=info['timestamp'],
+            map_location = info['map_location'],
+        )
+        # lidar to ego transform
+        lidar2ego = np.eye(4).astype(np.float32)
+        lidar2ego[:3, :3] = Quaternion(info["lidar2ego_rotation"]).rotation_matrix
+        lidar2ego[:3, 3] = info["lidar2ego_translation"]
+        input_dict["lidar2ego"] = lidar2ego
+        if self.modality['use_camera']:
+            image_paths = []
+            lidar2img_rts = []
+            lidar2cam_rts = []
+            cam_intrinsics = []
+            input_dict["camera2ego"] = []
+            input_dict["camera_intrinsics"] = []
+            for cam_type, cam_info in info['cams'].items():
+                image_paths.append(cam_info['data_path'])
+                # obtain lidar to image transformation matrix
+                lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
+                lidar2cam_t = cam_info[
+                    'sensor2lidar_translation'] @ lidar2cam_r.T
+                lidar2cam_rt = np.eye(4)
+                lidar2cam_rt[:3, :3] = lidar2cam_r.T
+                lidar2cam_rt[3, :3] = -lidar2cam_t
+                lidar2cam_rt_t = lidar2cam_rt.T
+
+                if self.noise == 'rotation':
+                    lidar2cam_rt_t = add_rotation_noise(lidar2cam_rt_t, std=self.noise_std)
+                elif self.noise == 'translation':
+                    lidar2cam_rt_t = add_translation_noise(
+                        lidar2cam_rt_t, std=self.noise_std)
+
+                intrinsic = cam_info['cam_intrinsic']
+                viewpad = np.eye(4)
+                viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
+                lidar2img_rt = (viewpad @ lidar2cam_rt_t)
+                lidar2img_rts.append(lidar2img_rt)
+
+                cam_intrinsics.append(viewpad)
+                lidar2cam_rts.append(lidar2cam_rt_t)
+
+                # camera to ego transform
+                camera2ego = np.eye(4).astype(np.float32)
+                camera2ego[:3, :3] = Quaternion(
+                    cam_info["sensor2ego_rotation"]
+                ).rotation_matrix
+                camera2ego[:3, 3] = cam_info["sensor2ego_translation"]
+                input_dict["camera2ego"].append(camera2ego)
+
+                # camera intrinsics
+                camera_intrinsics = np.eye(4).astype(np.float32)
+                camera_intrinsics[:3, :3] = cam_info["cam_intrinsic"]
+                input_dict["camera_intrinsics"].append(camera_intrinsics)
+
+            input_dict.update(
+                dict(
+                    img_filename=image_paths,
+                    lidar2img=lidar2img_rts,
+                    cam_intrinsic=cam_intrinsics,
+                    lidar2cam=lidar2cam_rts,
+                ))
+
+        if not self.test_mode:
+            annos = self.get_ann_info(index)
+            input_dict['ann_info'] = annos
+
+        rotation = Quaternion(input_dict['ego2global_rotation'])
+        translation = input_dict['ego2global_translation']
+        can_bus = input_dict['can_bus']
+        can_bus[:3] = translation
+        can_bus[3:7] = rotation
+        patch_angle = quaternion_yaw(rotation) / np.pi * 180
+        if patch_angle < 0:
+            patch_angle += 360
+        can_bus[-2] = patch_angle / 180 * np.pi
+        can_bus[-1] = patch_angle
+
+
+        lidar2ego = np.eye(4)
+        lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix
+        lidar2ego[:3, 3] = input_dict['lidar2ego_translation']
+        ego2global = np.eye(4)
+        ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix
+        ego2global[:3, 3] = input_dict['ego2global_translation']
+        lidar2global = ego2global @ lidar2ego
+        input_dict['lidar2global'] = lidar2global
+        return input_dict
+
+    def prepare_test_data(self, index):
+        """Prepare data for testing.
+
+        Args:
+            index (int): Index for accessing the target data.
+
+        Returns:
+            dict: Testing data dict of the corresponding index.
+        """
+        input_dict = self.get_data_info(index)
+        self.pre_pipeline(input_dict)
+        example = self.pipeline(input_dict)
+        if self.is_vis_on_test:
+            example = self.vectormap_pipeline(example, input_dict)
+        return example
+
+    def __getitem__(self, idx):
+        """Get item from infos according to the given index.
+        Returns:
+            dict: Data dictionary of the corresponding index.
+        """
+        if self.test_mode:
+            return self.prepare_test_data(idx)
+        while True:
+
+            data = self.prepare_train_data(idx)
+            if data is None:
+                idx = self._rand_another(idx)
+                continue
+            return data
+    def _format_gt(self):
+        gt_annos = []
+        print('Start to convert gt map format...')
+        assert self.map_ann_file is not None
+        if (not os.path.exists(self.map_ann_file)) :
+            dataset_length = len(self)
+            prog_bar = mmcv.ProgressBar(dataset_length)
+            mapped_class_names = self.MAPCLASSES
+            for sample_id in range(dataset_length):
+                sample_token = self.data_infos[sample_id]['token']
+                gt_anno = {}
+                gt_anno['sample_token'] = sample_token
+                # gt_sample_annos = []
+                gt_sample_dict = {}
+                gt_sample_dict = self.vectormap_pipeline(gt_sample_dict, self.data_infos[sample_id])
+                gt_labels = gt_sample_dict['gt_labels_3d'].data.numpy()
+                gt_vecs = gt_sample_dict['gt_bboxes_3d'].data.instance_list
+                gt_vec_list = []
+                for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)):
+                    name = mapped_class_names[gt_label]
+                    anno = dict(
+                        pts=np.array(list(gt_vec.coords)),
+                        pts_num=len(list(gt_vec.coords)),
+                        cls_name=name,
+                        type=gt_label,
+                    )
+                    gt_vec_list.append(anno)
+                gt_anno['vectors']=gt_vec_list
+                gt_annos.append(gt_anno)
+
+                prog_bar.update()
+            nusc_submissions = {
+                'GTs': gt_annos
+            }
+            print('\n GT anns writes to', self.map_ann_file)
+            mmcv.dump(nusc_submissions, self.map_ann_file)
+        else:
+            print(f'{self.map_ann_file} exist, not update')
+
+    def _format_bbox(self, results, jsonfile_prefix=None):
+        """Convert the results to the standard format.
+
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            jsonfile_prefix (str): The prefix of the output jsonfile.
+                You can specify the output directory/filename by
+                modifying the jsonfile_prefix. Default: None.
+
+        Returns:
+            str: Path of the output json file.
+        """
+        assert self.map_ann_file is not None
+        pred_annos = []
+        mapped_class_names = self.MAPCLASSES
+        # import pdb;pdb.set_trace()
+        print('Start to convert map detection format...')
+        for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
+            pred_anno = {}
+            vecs = output_to_vecs(det)
+            sample_token = self.data_infos[sample_id]['token']
+            pred_anno['sample_token'] = sample_token
+            pred_vec_list=[]
+            for i, vec in enumerate(vecs):
+                name = mapped_class_names[vec['label']]
+                anno = dict(
+                    pts=vec['pts'],
+                    pts_num=len(vec['pts']),
+                    cls_name=name,
+                    type=vec['label'],
+                    confidence_level=vec['score'])
+                pred_vec_list.append(anno)
+
+            pred_anno['vectors'] = pred_vec_list
+            pred_annos.append(pred_anno)
+
+
+        if not os.path.exists(self.map_ann_file):
+            self._format_gt()
+        else:
+            print(f'{self.map_ann_file} exist, not update')
+
+        nusc_submissions = {
+            'meta': self.modality,
+            'results': pred_annos,
+
+        }
+
+        mmcv.mkdir_or_exist(jsonfile_prefix)
+        res_path = osp.join(jsonfile_prefix, 'nuscmap_results.json')
+        print('Results writes to', res_path)
+        mmcv.dump(nusc_submissions, res_path)
+        return res_path
+
+    def to_gt_vectors(self,
+                      gt_dict):
+        # import pdb;pdb.set_trace()
+        gt_labels = gt_dict['gt_labels_3d'].data
+        gt_instances = gt_dict['gt_bboxes_3d'].data.instance_list
+
+        gt_vectors = []
+
+        for gt_instance, gt_label in zip(gt_instances, gt_labels):
+            pts, pts_num = sample_pts_from_line(gt_instance, patch_size=self.patch_size)
+            gt_vectors.append({
+                'pts': pts,
+                'pts_num': pts_num,
+                'type': int(gt_label)
+            })
+        vector_num_list = {}
+        for i in range(self.NUM_MAPCLASSES):
+            vector_num_list[i] = []
+        for vec in gt_vectors:
+            if vector['pts_num'] >= 2:
+                vector_num_list[vector['type']].append((LineString(vector['pts'][:vector['pts_num']]), vector.get('confidence_level', 1)))
+        return gt_vectors
+
+    def _evaluate_single(self,
+                         result_path,
+                         logger=None,
+                         metric='chamfer',
+                         result_name='pts_bbox'):
+        """Evaluation for a single model in nuScenes protocol.
+
+        Args:
+            result_path (str): Path of the result file.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            metric (str): Metric name used for evaluation. Default: 'bbox'.
+            result_name (str): Result name in the metric prefix.
+                Default: 'pts_bbox'.
+
+        Returns:
+            dict: Dictionary of evaluation details.
+        """
+        from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import eval_map
+        from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import format_res_gt_by_classes
+        result_path = osp.abspath(result_path)
+        detail = dict()
+        
+        print('Formating results & gts by classes')
+        with open(result_path,'r') as f:
+            pred_results = json.load(f)
+        gen_results = pred_results['results']
+        with open(self.map_ann_file,'r') as ann_f:
+            gt_anns = json.load(ann_f)
+        annotations = gt_anns['GTs']
+        cls_gens, cls_gts = format_res_gt_by_classes(result_path,
+                                                     gen_results,
+                                                     annotations,
+                                                     cls_names=self.MAPCLASSES,
+                                                     num_pred_pts_per_instance=self.fixed_num,
+                                                     eval_use_same_gt_sample_num_flag=self.eval_use_same_gt_sample_num_flag,
+                                                     pc_range=self.pc_range)
+
+        metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['chamfer', 'iou']
+        for metric in metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f'metric {metric} is not supported')
+
+        for metric in metrics:
+            print('-*'*10+f'use metric:{metric}'+'-*'*10)
+
+            if metric == 'chamfer':
+                thresholds = [0.5,1.0,1.5]
+            elif metric == 'iou':
+                thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+            cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES))
+
+            for i, thr in enumerate(thresholds):
+                print('-*'*10+f'threshhold:{thr}'+'-*'*10)
+                mAP, cls_ap = eval_map(
+                                gen_results,
+                                annotations,
+                                cls_gens,
+                                cls_gts,
+                                threshold=thr,
+                                cls_names=self.MAPCLASSES,
+                                logger=logger,
+                                num_pred_pts_per_instance=self.fixed_num,
+                                pc_range=self.pc_range,
+                                metric=metric)
+                for j in range(self.NUM_MAPCLASSES):
+                    cls_aps[i, j] = cls_ap[j]['ap']
+
+            for i, name in enumerate(self.MAPCLASSES):
+                print('{}: {}'.format(name, cls_aps.mean(0)[i]))
+                detail['NuscMap_{}/{}_AP'.format(metric,name)] =  cls_aps.mean(0)[i]
+            print('map: {}'.format(cls_aps.mean(0).mean()))
+            detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean()
+
+            for i, name in enumerate(self.MAPCLASSES):
+                for j, thr in enumerate(thresholds):
+                    if metric == 'chamfer':
+                        detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+                    elif metric == 'iou':
+                        if thr == 0.5 or thr == 0.75:
+                            detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i]
+
+        return detail
+
+
+    def evaluate(self,
+                 results,
+                 metric='bbox',
+                 logger=None,
+                 jsonfile_prefix=None,
+                 result_names=['pts_bbox'],
+                 show=False,
+                 out_dir=None,
+                 pipeline=None):
+        """Evaluation in nuScenes protocol.
+
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+            show (bool): Whether to visualize.
+                Default: False.
+            out_dir (str): Path to save the visualization results.
+                Default: None.
+            pipeline (list[dict], optional): raw data loading for showing.
+                Default: None.
+
+        Returns:
+            dict[str, float]: Results of each evaluation metric.
+        """
+        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+        if isinstance(result_files, dict):
+            results_dict = dict()
+            for name in result_names:
+                print('Evaluating bboxes of {}'.format(name))
+                ret_dict = self._evaluate_single(result_files[name], metric=metric)
+            results_dict.update(ret_dict)
+        elif isinstance(result_files, str):
+            results_dict = self._evaluate_single(result_files, metric=metric)
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+
+        if show:
+            self.show(results, out_dir, pipeline=pipeline)
+        return results_dict
+
+
+def output_to_vecs(detection):
+    box3d = detection['boxes_3d'].numpy()
+    scores = detection['scores_3d'].numpy()
+    labels = detection['labels_3d'].numpy()
+    pts = detection['pts_3d'].numpy()
+
+    vec_list = []
+    for i in range(box3d.shape[0]):
+        vec = dict(
+            bbox = box3d[i], # xyxy
+            label=labels[i],
+            score=scores[i],
+            pts=pts[i],
+        )
+        vec_list.append(vec)
+    return vec_list
+
+def sample_pts_from_line(line, 
+                         fixed_num=-1,
+                         sample_dist=1,
+                         normalize=False,
+                         patch_size=None,
+                         padding=False,
+                         num_samples=250,):
+    if fixed_num < 0:
+        distances = np.arange(0, line.length, sample_dist)
+        sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+    else:
+        # fixed number of points, so distance is line.length / fixed_num
+        distances = np.linspace(0, line.length, fixed_num)
+        sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2)
+
+    if normalize:
+        sampled_points = sampled_points / np.array([patch_size[1], patch_size[0]])
+
+    num_valid = len(sampled_points)
+
+    if not padding or fixed_num > 0:
+        # fixed num sample can return now!
+        return sampled_points, num_valid
+
+    # fixed distance sampling need padding!
+    num_valid = len(sampled_points)
+
+    if fixed_num < 0:
+        if num_valid < num_samples:
+            padding = np.zeros((num_samples - len(sampled_points), 2))
+            sampled_points = np.concatenate([sampled_points, padding], axis=0)
+        else:
+            sampled_points = sampled_points[:num_samples, :]
+            num_valid = num_samples
+
+        if normalize:
+            sampled_points = sampled_points / np.array([patch_size[1], patch_size[0]])
+            num_valid = len(sampled_points)
+
+    return sampled_points, num_valid
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_mono_dataset.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_mono_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c76b9998b60e523b1b42e1603d6669faca7681b
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscenes_mono_dataset.py
@@ -0,0 +1,777 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import mmcv
+import numpy as np
+import pyquaternion
+import tempfile
+import torch
+import warnings
+from nuscenes.utils.data_classes import Box as NuScenesBox
+from os import path as osp
+
+from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
+from mmdet.datasets import DATASETS, CocoDataset
+from mmdet3d.core import show_multi_modality_result
+from mmdet3d.core.bbox import CameraInstance3DBoxes, get_box_type
+from mmdet3d.datasets.pipelines import Compose
+from mmdet3d.datasets.utils import extract_result_dict, get_loading_pipeline
+
+
+@DATASETS.register_module()
+class CustomNuScenesMonoDataset(CocoDataset):
+    r"""Monocular 3D detection on NuScenes Dataset.
+    This class serves as the API for experiments on the NuScenes Dataset.
+    Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
+    for data downloading.
+    Args:
+        ann_file (str): Path of annotation file.
+        data_root (str): Path of dataset root.
+        load_interval (int, optional): Interval of loading the dataset. It is
+            used to uniformly sample the dataset. Defaults to 1.
+        with_velocity (bool, optional): Whether include velocity prediction
+            into the experiments. Defaults to True.
+        modality (dict, optional): Modality to specify the sensor data used
+            as input. Defaults to None.
+        box_type_3d (str, optional): Type of 3D box of this dataset.
+            Based on the `box_type_3d`, the dataset will encapsulate the box
+            to its original format then converted them to `box_type_3d`.
+            Defaults to 'Camera' in this class. Available options includes.
+            - 'LiDAR': Box in LiDAR coordinates.
+            - 'Depth': Box in depth coordinates, usually for indoor dataset.
+            - 'Camera': Box in camera coordinates.
+        eval_version (str, optional): Configuration version of evaluation.
+            Defaults to  'detection_cvpr_2019'.
+        use_valid_flag (bool): Whether to use `use_valid_flag` key in the info
+            file as mask to filter gt_boxes and gt_names. Defaults to False.
+        version (str, optional): Dataset version. Defaults to 'v1.0-trainval'.
+    """
+    CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+               'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+               'barrier')
+    DefaultAttribute = {
+        'car': 'vehicle.parked',
+        'pedestrian': 'pedestrian.moving',
+        'trailer': 'vehicle.parked',
+        'truck': 'vehicle.parked',
+        'bus': 'vehicle.moving',
+        'motorcycle': 'cycle.without_rider',
+        'construction_vehicle': 'vehicle.parked',
+        'bicycle': 'cycle.without_rider',
+        'barrier': '',
+        'traffic_cone': '',
+    }
+    # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa
+    ErrNameMapping = {
+        'trans_err': 'mATE',
+        'scale_err': 'mASE',
+        'orient_err': 'mAOE',
+        'vel_err': 'mAVE',
+        'attr_err': 'mAAE'
+    }
+
+    def __init__(self,
+                 data_root,
+                 load_interval=1,
+                 with_velocity=True,
+                 modality=None,
+                 box_type_3d='Camera',
+                 eval_version='detection_cvpr_2019',
+                 use_valid_flag=False,
+                 overlap_test=False,
+                 version='v1.0-trainval',
+                 **kwargs):
+        super().__init__(**kwargs)
+        # overlap_test = True
+        self.data_root = data_root
+        self.overlap_test = overlap_test
+        self.load_interval = load_interval
+        self.with_velocity = with_velocity
+        self.modality = modality
+        self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
+        self.eval_version = eval_version
+        self.use_valid_flag = use_valid_flag
+        self.bbox_code_size = 9
+        self.version = version
+        if self.eval_version is not None:
+            from nuscenes.eval.detection.config import config_factory
+            self.eval_detection_configs = config_factory(self.eval_version)
+        if self.modality is None:
+            self.modality = dict(
+                use_camera=True,
+                use_lidar=False,
+                use_radar=False,
+                use_map=False,
+                use_external=False)
+
+    def pre_pipeline(self, results):
+        """Initialization before data preparation.
+        Args:
+            results (dict): Dict before data preprocessing.
+                - img_fields (list): Image fields.
+                - bbox3d_fields (list): 3D bounding boxes fields.
+                - pts_mask_fields (list): Mask fields of points.
+                - pts_seg_fields (list): Mask fields of point segments.
+                - bbox_fields (list): Fields of bounding boxes.
+                - mask_fields (list): Fields of masks.
+                - seg_fields (list): Segment fields.
+                - box_type_3d (str): 3D box type.
+                - box_mode_3d (str): 3D box mode.
+        """
+        results['img_prefix'] = ''  # self.img_prefix
+        # print('img_prefix', self.img_prefix)
+        results['seg_prefix'] = self.seg_prefix
+        results['proposal_file'] = self.proposal_file
+        results['img_fields'] = []
+        results['bbox3d_fields'] = []
+        results['pts_mask_fields'] = []
+        results['pts_seg_fields'] = []
+        results['bbox_fields'] = []
+        results['mask_fields'] = []
+        results['seg_fields'] = []
+        results['box_type_3d'] = self.box_type_3d
+        results['box_mode_3d'] = self.box_mode_3d
+
+    def _parse_ann_info(self, img_info, ann_info):
+        """Parse bbox annotation.
+        Args:
+            img_info (list[dict]): Image info.
+            ann_info (list[dict]): Annotation info of an image.
+        Returns:
+            dict: A dict containing the following keys: bboxes, labels, \
+                gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \
+                depths, bboxes_ignore, masks, seg_map
+        """
+        gt_bboxes = []
+        gt_labels = []
+        attr_labels = []
+        gt_bboxes_ignore = []
+        gt_masks_ann = []
+        gt_bboxes_cam3d = []
+        centers2d = []
+        depths = []
+        for i, ann in enumerate(ann_info):
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            if ann['category_id'] not in self.cat_ids:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+            if ann.get('iscrowd', False):
+                gt_bboxes_ignore.append(bbox)
+            else:
+                gt_bboxes.append(bbox)
+                gt_labels.append(self.cat2label[ann['category_id']])
+                attr_labels.append(ann['attribute_id'])
+                gt_masks_ann.append(ann.get('segmentation', None))
+                # 3D annotations in camera coordinates
+                bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1)
+                velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2)
+                nan_mask = np.isnan(velo_cam3d[:, 0])
+                velo_cam3d[nan_mask] = [0.0, 0.0]
+                bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1)
+                gt_bboxes_cam3d.append(bbox_cam3d.squeeze())
+                # 2.5D annotations in camera coordinates
+                center2d = ann['center2d'][:2]
+                depth = ann['center2d'][2]
+                centers2d.append(center2d)
+                depths.append(depth)
+
+        if gt_bboxes:
+            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+            gt_labels = np.array(gt_labels, dtype=np.int64)
+            attr_labels = np.array(attr_labels, dtype=np.int64)
+        else:
+            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+            gt_labels = np.array([], dtype=np.int64)
+            attr_labels = np.array([], dtype=np.int64)
+
+        if gt_bboxes_cam3d:
+            gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32)
+            centers2d = np.array(centers2d, dtype=np.float32)
+            depths = np.array(depths, dtype=np.float32)
+        else:
+            gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size),
+                                       dtype=np.float32)
+            centers2d = np.zeros((0, 2), dtype=np.float32)
+            depths = np.zeros((0), dtype=np.float32)
+
+        gt_bboxes_cam3d = CameraInstance3DBoxes(
+            gt_bboxes_cam3d,
+            box_dim=gt_bboxes_cam3d.shape[-1],
+            origin=(0.5, 0.5, 0.5))
+        gt_labels_3d = copy.deepcopy(gt_labels)
+
+        if gt_bboxes_ignore:
+            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+        else:
+            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+        seg_map = img_info['filename'].replace('jpg', 'png')
+
+        ann = dict(
+            bboxes=gt_bboxes,
+            labels=gt_labels,
+            gt_bboxes_3d=gt_bboxes_cam3d,
+            gt_labels_3d=gt_labels_3d,
+            attr_labels=attr_labels,
+            centers2d=centers2d,
+            depths=depths,
+            bboxes_ignore=gt_bboxes_ignore,
+            masks=gt_masks_ann,
+            seg_map=seg_map)
+
+        return ann
+
+    def get_attr_name(self, attr_idx, label_name):
+        """Get attribute from predicted index.
+        This is a workaround to predict attribute when the predicted velocity
+        is not reliable. We map the predicted attribute index to the one
+        in the attribute set. If it is consistent with the category, we will
+        keep it. Otherwise, we will use the default attribute.
+        Args:
+            attr_idx (int): Attribute index.
+            label_name (str): Predicted category name.
+        Returns:
+            str: Predicted attribute name.
+        """
+        # TODO: Simplify the variable name
+        AttrMapping_rev2 = [
+            'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
+            'pedestrian.standing', 'pedestrian.sitting_lying_down',
+            'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
+        ]
+        if label_name == 'car' or label_name == 'bus' \
+            or label_name == 'truck' or label_name == 'trailer' \
+                or label_name == 'construction_vehicle':
+            if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \
+                AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \
+                    AttrMapping_rev2[attr_idx] == 'vehicle.stopped':
+                return AttrMapping_rev2[attr_idx]
+            else:
+                return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+        elif label_name == 'pedestrian':
+            if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \
+                AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \
+                    AttrMapping_rev2[attr_idx] == \
+                    'pedestrian.sitting_lying_down':
+                return AttrMapping_rev2[attr_idx]
+            else:
+                return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+        elif label_name == 'bicycle' or label_name == 'motorcycle':
+            if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \
+                    AttrMapping_rev2[attr_idx] == 'cycle.without_rider':
+                return AttrMapping_rev2[attr_idx]
+            else:
+                return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+        else:
+            return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
+
+    def _format_bbox(self, results, jsonfile_prefix=None):
+        """Convert the results to the standard format.
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            jsonfile_prefix (str): The prefix of the output jsonfile.
+                You can specify the output directory/filename by
+                modifying the jsonfile_prefix. Default: None.
+        Returns:
+            str: Path of the output json file.
+        """
+        nusc_annos = {}
+        mapped_class_names = self.CLASSES
+
+        print('Start to convert detection format...')
+
+        CAM_NUM = 6
+
+        for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
+
+            if sample_id % CAM_NUM == 0:
+                boxes_per_frame = []
+                attrs_per_frame = []
+
+            # need to merge results from images of the same sample
+            annos = []
+            boxes, attrs = output_to_nusc_box(det)
+            sample_token = self.data_infos[sample_id]['token']
+            boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id],
+                                                  boxes, attrs,
+                                                  mapped_class_names,
+                                                  self.eval_detection_configs,
+                                                  self.eval_version)
+
+            boxes_per_frame.extend(boxes)
+            attrs_per_frame.extend(attrs)
+            # Remove redundant predictions caused by overlap of images
+            if (sample_id + 1) % CAM_NUM != 0:
+                continue
+            boxes = global_nusc_box_to_cam(
+                self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
+                mapped_class_names, self.eval_detection_configs,
+                self.eval_version)
+            cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
+            # box nms 3d over 6 images in a frame
+            # TODO: move this global setting into config
+            nms_cfg = dict(
+                use_rotate_nms=True,
+                nms_across_levels=False,
+                nms_pre=4096,
+                nms_thr=0.05,
+                score_thr=0.01,
+                min_bbox_size=0,
+                max_per_frame=500)
+            from mmcv import Config
+            nms_cfg = Config(nms_cfg)
+            cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
+            boxes3d = cam_boxes3d.tensor
+            # generate attr scores from attr labels
+            attrs = labels.new_tensor([attr for attr in attrs_per_frame])
+            boxes3d, scores, labels, attrs = box3d_multiclass_nms(
+                boxes3d,
+                cam_boxes3d_for_nms,
+                scores,
+                nms_cfg.score_thr,
+                nms_cfg.max_per_frame,
+                nms_cfg,
+                mlvl_attr_scores=attrs)
+            cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
+            det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
+            boxes, attrs = output_to_nusc_box(det)
+            boxes, attrs = cam_nusc_box_to_global(
+                self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
+                mapped_class_names, self.eval_detection_configs,
+                self.eval_version)
+
+            for i, box in enumerate(boxes):
+                name = mapped_class_names[box.label]
+                attr = self.get_attr_name(attrs[i], name)
+                nusc_anno = dict(
+                    sample_token=sample_token,
+                    translation=box.center.tolist(),
+                    size=box.wlh.tolist(),
+                    rotation=box.orientation.elements.tolist(),
+                    velocity=box.velocity[:2].tolist(),
+                    detection_name=name,
+                    detection_score=box.score,
+                    attribute_name=attr)
+                annos.append(nusc_anno)
+            # other views results of the same frame should be concatenated
+            if sample_token in nusc_annos:
+                nusc_annos[sample_token].extend(annos)
+            else:
+                nusc_annos[sample_token] = annos
+
+        nusc_submissions = {
+            'meta': self.modality,
+            'results': nusc_annos,
+        }
+
+        mmcv.mkdir_or_exist(jsonfile_prefix)
+        res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
+        print('Results writes to', res_path)
+        mmcv.dump(nusc_submissions, res_path)
+        return res_path
+
+    def _evaluate_single(self,
+                         result_path,
+                         logger=None,
+                         metric='bbox',
+                         result_name='img_bbox'):
+        """Evaluation for a single model in nuScenes protocol.
+        Args:
+            result_path (str): Path of the result file.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            metric (str): Metric name used for evaluation. Default: 'bbox'.
+            result_name (str): Result name in the metric prefix.
+                Default: 'img_bbox'.
+        Returns:
+            dict: Dictionary of evaluation details.
+        """
+        from nuscenes import NuScenes
+        #from nuscenes.eval.detection.evaluate import NuScenesEval
+        from .nuscnes_eval import NuScenesEval_custom
+        output_dir = osp.join(*osp.split(result_path)[:-1])
+        self.nusc = NuScenes(
+            version=self.version, dataroot=self.data_root, verbose=False)
+        eval_set_map = {
+            'v1.0-mini': 'mini_val',
+            'v1.0-trainval': 'val',
+        }
+        # nusc_eval = NuScenesEval(
+        #     nusc,
+        #     config=self.eval_detection_configs,
+        #     result_path=result_path,
+        #     eval_set=eval_set_map[self.version],
+        #     output_dir=output_dir,
+        #     verbose=False)
+        self.nusc_eval = NuScenesEval_custom(
+            self.nusc,
+            config=self.eval_detection_configs,
+            result_path=result_path,
+            eval_set=eval_set_map[self.version],
+            output_dir=output_dir,
+            verbose=True,
+            overlap_test=self.overlap_test,
+            data_infos=self.data_infos
+            )
+
+        self.nusc_eval.main(render_curves=True)
+
+        # record metrics
+        metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
+        detail = dict()
+        metric_prefix = f'{result_name}_NuScenes'
+        for name in self.CLASSES:
+            for k, v in metrics['label_aps'][name].items():
+                val = float('{:.4f}'.format(v))
+                detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
+            for k, v in metrics['label_tp_errors'][name].items():
+                val = float('{:.4f}'.format(v))
+                detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
+            for k, v in metrics['tp_errors'].items():
+                val = float('{:.4f}'.format(v))
+                detail['{}/{}'.format(metric_prefix,
+                                      self.ErrNameMapping[k])] = val
+
+        detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
+        detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
+        return detail
+
+    def format_results(self, results, jsonfile_prefix=None, **kwargs):
+        """Format the results to json (standard format for COCO evaluation).
+        Args:
+            results (list[tuple | numpy.ndarray]): Testing results of the
+                dataset.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+        Returns:
+            tuple: (result_files, tmp_dir), result_files is a dict containing \
+                the json filepaths, tmp_dir is the temporal directory created \
+                for saving json files when jsonfile_prefix is not specified.
+        """
+        assert isinstance(results, list), 'results must be a list'
+        assert len(results) == len(self), (
+            'The length of results is not equal to the dataset len: {} != {}'.
+            format(len(results), len(self)))
+
+        if jsonfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            jsonfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            tmp_dir = None
+
+        # currently the output prediction results could be in two formats
+        # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)
+        # 2. list of dict('pts_bbox' or 'img_bbox':
+        #     dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...))
+        # this is a workaround to enable evaluation of both formats on nuScenes
+        # refer to https://github.com/open-mmlab/mmdetection3d/issues/449
+        if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
+            result_files = self._format_bbox(results, jsonfile_prefix)
+        else:
+            # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
+            result_files = dict()
+            for name in results[0]:
+                # not evaluate 2D predictions on nuScenes
+                if '2d' in name:
+                    continue
+                print(f'\nFormating bboxes of {name}')
+                results_ = [out[name] for out in results]
+                tmp_file_ = osp.join(jsonfile_prefix, name)
+                result_files.update(
+                    {name: self._format_bbox(results_, tmp_file_)})
+
+        return result_files, tmp_dir
+
+    def evaluate(self,
+                 results,
+                 metric='bbox',
+                 logger=None,
+                 jsonfile_prefix=None,
+                 result_names=['img_bbox'],
+                 show=False,
+                 out_dir=None,
+                 pipeline=None):
+        """Evaluation in nuScenes protocol.
+        Args:
+            results (list[dict]): Testing results of the dataset.
+            metric (str | list[str]): Metrics to be evaluated.
+            logger (logging.Logger | str | None): Logger used for printing
+                related information during evaluation. Default: None.
+            jsonfile_prefix (str | None): The prefix of json files. It includes
+                the file path and the prefix of filename, e.g., "a/b/prefix".
+                If not specified, a temp file will be created. Default: None.
+            show (bool): Whether to visualize.
+                Default: False.
+            out_dir (str): Path to save the visualization results.
+                Default: None.
+            pipeline (list[dict], optional): raw data loading for showing.
+                Default: None.
+        Returns:
+            dict[str, float]: Results of each evaluation metric.
+        """
+
+        result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
+
+        if isinstance(result_files, dict):
+            results_dict = dict()
+            for name in result_names:
+                print('Evaluating bboxes of {}'.format(name))
+                ret_dict = self._evaluate_single(result_files[name])
+            results_dict.update(ret_dict)
+        elif isinstance(result_files, str):
+            results_dict = self._evaluate_single(result_files)
+
+        if tmp_dir is not None:
+            tmp_dir.cleanup()
+
+        if show:
+            self.show(results, out_dir, pipeline=pipeline)
+        return results_dict
+
+    def _extract_data(self, index, pipeline, key, load_annos=False):
+        """Load data using input pipeline and extract data according to key.
+        Args:
+            index (int): Index for accessing the target data.
+            pipeline (:obj:`Compose`): Composed data loading pipeline.
+            key (str | list[str]): One single or a list of data key.
+            load_annos (bool): Whether to load data annotations.
+                If True, need to set self.test_mode as False before loading.
+        Returns:
+            np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]:
+                A single or a list of loaded data.
+        """
+        assert pipeline is not None, 'data loading pipeline is not provided'
+        img_info = self.data_infos[index]
+        input_dict = dict(img_info=img_info)
+
+        if load_annos:
+            ann_info = self.get_ann_info(index)
+            input_dict.update(dict(ann_info=ann_info))
+
+        self.pre_pipeline(input_dict)
+        example = pipeline(input_dict)
+
+        # extract data items according to keys
+        if isinstance(key, str):
+            data = extract_result_dict(example, key)
+        else:
+            data = [extract_result_dict(example, k) for k in key]
+
+        return data
+
+    def _get_pipeline(self, pipeline):
+        """Get data loading pipeline in self.show/evaluate function.
+        Args:
+            pipeline (list[dict] | None): Input pipeline. If None is given, \
+                get from self.pipeline.
+        """
+        if pipeline is None:
+            if not hasattr(self, 'pipeline') or self.pipeline is None:
+                warnings.warn(
+                    'Use default pipeline for data loading, this may cause '
+                    'errors when data is on ceph')
+                return self._build_default_pipeline()
+            loading_pipeline = get_loading_pipeline(self.pipeline.transforms)
+            return Compose(loading_pipeline)
+        return Compose(pipeline)
+
+    def _build_default_pipeline(self):
+        """Build the default pipeline for this dataset."""
+        pipeline = [
+            dict(type='LoadImageFromFileMono3D'),
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=self.CLASSES,
+                with_label=False),
+            dict(type='Collect3D', keys=['img'])
+        ]
+        return Compose(pipeline)
+
+    def show(self, results, out_dir, show=True, pipeline=None):
+        """Results visualization.
+        Args:
+            results (list[dict]): List of bounding boxes results.
+            out_dir (str): Output directory of visualization result.
+            show (bool): Visualize the results online.
+            pipeline (list[dict], optional): raw data loading for showing.
+                Default: None.
+        """
+        assert out_dir is not None, 'Expect out_dir, got none.'
+        pipeline = self._get_pipeline(pipeline)
+        for i, result in enumerate(results):
+            if 'img_bbox' in result.keys():
+                result = result['img_bbox']
+            data_info = self.data_infos[i]
+            img_path = data_info['file_name']
+            file_name = osp.split(img_path)[-1].split('.')[0]
+            img, img_metas = self._extract_data(i, pipeline,
+                                                ['img', 'img_metas'])
+            # need to transpose channel to first dim
+            img = img.numpy().transpose(1, 2, 0)
+            gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d']
+            pred_bboxes = result['boxes_3d']
+            show_multi_modality_result(
+                img,
+                gt_bboxes,
+                pred_bboxes,
+                img_metas['cam2img'],
+                out_dir,
+                file_name,
+                box_mode='camera',
+                show=show)
+
+
+def output_to_nusc_box(detection):
+    """Convert the output to the box class in the nuScenes.
+    Args:
+        detection (dict): Detection results.
+            - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
+            - scores_3d (torch.Tensor): Detection scores.
+            - labels_3d (torch.Tensor): Predicted box labels.
+            - attrs_3d (torch.Tensor, optional): Predicted attributes.
+    Returns:
+        list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
+    """
+    box3d = detection['boxes_3d']
+    scores = detection['scores_3d'].numpy()
+    labels = detection['labels_3d'].numpy()
+    attrs = None
+    if 'attrs_3d' in detection:
+        attrs = detection['attrs_3d'].numpy()
+
+    box_gravity_center = box3d.gravity_center.numpy()
+    box_dims = box3d.dims.numpy()
+    box_yaw = box3d.yaw.numpy()
+
+    # convert the dim/rot to nuscbox convention
+    box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]]
+    box_yaw = -box_yaw
+
+    box_list = []
+    for i in range(len(box3d)):
+        q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
+        q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
+        quat = q2 * q1
+        velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8])
+        box = NuScenesBox(
+            box_gravity_center[i],
+            box_dims[i],
+            quat,
+            label=labels[i],
+            score=scores[i],
+            velocity=velocity)
+        box_list.append(box)
+    return box_list, attrs
+
+
+def cam_nusc_box_to_global(info,
+                           boxes,
+                           attrs,
+                           classes,
+                           eval_configs,
+                           eval_version='detection_cvpr_2019'):
+    """Convert the box from camera to global coordinate.
+    Args:
+        info (dict): Info for a specific sample data, including the
+            calibration information.
+        boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+        classes (list[str]): Mapped classes in the evaluation.
+        eval_configs (object): Evaluation configuration object.
+        eval_version (str): Evaluation version.
+            Default: 'detection_cvpr_2019'
+    Returns:
+        list: List of standard NuScenesBoxes in the global
+            coordinate.
+    """
+    box_list = []
+    attr_list = []
+    for (box, attr) in zip(boxes, attrs):
+        # Move box to ego vehicle coord system
+        box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']))
+        box.translate(np.array(info['cam2ego_translation']))
+        # filter det in ego.
+        cls_range_map = eval_configs.class_range
+        radius = np.linalg.norm(box.center[:2], 2)
+        det_range = cls_range_map[classes[box.label]]
+        if radius > det_range:
+            continue
+        # Move box to global coord system
+        box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
+        box.translate(np.array(info['ego2global_translation']))
+        box_list.append(box)
+        attr_list.append(attr)
+    return box_list, attr_list
+
+
+def global_nusc_box_to_cam(info,
+                           boxes,
+                           classes,
+                           eval_configs,
+                           eval_version='detection_cvpr_2019'):
+    """Convert the box from global to camera coordinate.
+    Args:
+        info (dict): Info for a specific sample data, including the
+            calibration information.
+        boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+        classes (list[str]): Mapped classes in the evaluation.
+        eval_configs (object): Evaluation configuration object.
+        eval_version (str): Evaluation version.
+            Default: 'detection_cvpr_2019'
+    Returns:
+        list: List of standard NuScenesBoxes in the global
+            coordinate.
+    """
+    box_list = []
+    for box in boxes:
+        # Move box to ego vehicle coord system
+        box.translate(-np.array(info['ego2global_translation']))
+        box.rotate(
+            pyquaternion.Quaternion(info['ego2global_rotation']).inverse)
+        # filter det in ego.
+        cls_range_map = eval_configs.class_range
+        radius = np.linalg.norm(box.center[:2], 2)
+        det_range = cls_range_map[classes[box.label]]
+        if radius > det_range:
+            continue
+        # Move box to camera coord system
+        box.translate(-np.array(info['cam2ego_translation']))
+        box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse)
+        box_list.append(box)
+    return box_list
+
+
+def nusc_box_to_cam_box3d(boxes):
+    """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
+    Args:
+        boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
+    Returns:
+        tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \
+            Converted 3D bounding boxes, scores and labels.
+    """
+    locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
+    dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
+    rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
+                         for b in boxes]).view(-1, 1)
+    velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2)
+
+    # convert nusbox to cambox convention
+    dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]]
+    rots = -rots
+
+    boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
+    cam_boxes3d = CameraInstance3DBoxes(
+        boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
+    scores = torch.Tensor([b.score for b in boxes]).cuda()
+    labels = torch.LongTensor([b.label for b in boxes]).cuda()
+    nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
+    indices = labels.new_tensor(list(range(scores.shape[0])))
+    nms_scores[indices, labels] = scores
+    return cam_boxes3d, nms_scores, labels
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscnes_eval.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscnes_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2ae34cde3fdc48981619f3fac4e65f599c8295f
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/nuscnes_eval.py
@@ -0,0 +1,751 @@
+import argparse
+import copy
+import json
+import os
+import time
+from typing import Tuple, Dict, Any
+import torch
+import numpy as np
+
+from nuscenes import NuScenes
+from nuscenes.eval.common.config import config_factory
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.detection.data_classes import DetectionConfig
+from nuscenes.eval.detection.evaluate import NuScenesEval
+from pyquaternion import Quaternion
+
+from nuscenes import NuScenes
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.detection.data_classes import DetectionBox
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.eval.tracking.data_classes import TrackingBox
+from nuscenes.utils.data_classes import Box
+from nuscenes.utils.geometry_utils import points_in_box
+from nuscenes.utils.splits import create_splits_scenes
+from nuscenes.eval.common.loaders import load_prediction, add_center_dist, filter_eval_boxes
+import tqdm
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from torchvision.transforms.functional import rotate
+import pycocotools.mask as mask_util
+# from projects.mmdet3d_plugin.models.utils.visual import save_tensor
+from torchvision.transforms.functional import rotate
+import cv2
+import argparse
+import json
+import os
+import random
+import time
+from typing import Tuple, Dict, Any
+
+import numpy as np
+
+from nuscenes import NuScenes
+from nuscenes.eval.common.config import config_factory
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.common.loaders import load_prediction, load_gt, add_center_dist, filter_eval_boxes
+from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp
+from nuscenes.eval.detection.constants import TP_METRICS
+from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \
+    DetectionMetricDataList
+from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample
+from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
+from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D
+from IPython import embed
+import json
+from typing import Any
+
+import numpy as np
+from matplotlib import pyplot as plt
+
+from nuscenes import NuScenes
+from nuscenes.eval.common.data_classes import EvalBoxes
+from nuscenes.eval.common.render import setup_axis
+from nuscenes.eval.common.utils import boxes_to_sensor
+from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \
+    PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS
+from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList
+from nuscenes.utils.data_classes import LidarPointCloud
+from nuscenes.utils.geometry_utils import view_points
+
+
+
+Axis = Any
+
+def class_tp_curve(md_list: DetectionMetricDataList,
+                   metrics: DetectionMetrics,
+                   detection_name: str,
+                   min_recall: float,
+                   dist_th_tp: float,
+                   savepath: str = None,
+                   ax: Axis = None) -> None:
+    """
+    Plot the true positive curve for the specified class.
+    :param md_list: DetectionMetricDataList instance.
+    :param metrics: DetectionMetrics instance.
+    :param detection_name:
+    :param min_recall: Minimum recall value.
+    :param dist_th_tp: The distance threshold used to determine matches.
+    :param savepath: If given, saves the the rendering here instead of displaying.
+    :param ax: Axes onto which to render.
+    """
+    # Get metric data for given detection class with tp distance threshold.
+
+    md = md_list[(detection_name, dist_th_tp)]
+    min_recall_ind = round(100 * min_recall)
+    if min_recall_ind <= md.max_recall_ind:
+        # For traffic_cone and barrier only a subset of the metrics are plotted.
+        rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))]
+        ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1
+    else:
+        ylimit = 1.0
+
+    # Prepare axis.
+    if ax is None:
+        ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1,
+                        min_recall=min_recall)
+    ax.set_ylim(0, ylimit)
+
+    # Plot the recall vs. error curve for each tp metric.
+    for metric in TP_METRICS:
+        tp = metrics.get_label_tp(detection_name, metric)
+
+        # Plot only if we have valid data.
+        if tp is not np.nan and min_recall_ind <= md.max_recall_ind:
+            recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1]
+        else:
+            recall, error = [], []
+
+        # Change legend based on tp value
+        if tp is np.nan:
+            label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
+        elif min_recall_ind > md.max_recall_ind:
+            label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
+        else:
+            label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
+        if metric == 'trans_err':
+            label += f' ({md.max_recall_ind})'  # add recall
+            print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
+        ax.plot(recall, error, label=label)
+    ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
+    ax.legend(loc='best')
+
+    if savepath is not None:
+        plt.savefig(savepath)
+        plt.close()
+
+
+class DetectionBox_modified(DetectionBox):
+    def __init__(self, *args, token=None, visibility=None, index=None, **kwargs):
+        '''
+        add annotation token
+        '''
+        super().__init__(*args, **kwargs)
+        self.token = token
+        self.visibility = visibility
+        self.index = index
+
+    def serialize(self) -> dict:
+        """ Serialize instance into json-friendly format. """
+        return {
+            'token': self.token,
+            'sample_token': self.sample_token,
+            'translation': self.translation,
+            'size': self.size,
+            'rotation': self.rotation,
+            'velocity': self.velocity,
+            'ego_translation': self.ego_translation,
+            'num_pts': self.num_pts,
+            'detection_name': self.detection_name,
+            'detection_score': self.detection_score,
+            'attribute_name': self.attribute_name,
+            'visibility': self.visibility,
+            'index': self.index
+
+        }
+
+    @classmethod
+    def deserialize(cls, content: dict):
+        """ Initialize from serialized content. """
+        return cls(
+            token=content['token'],
+            sample_token=content['sample_token'],
+            translation=tuple(content['translation']),
+            size=tuple(content['size']),
+            rotation=tuple(content['rotation']),
+            velocity=tuple(content['velocity']),
+            ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+            else tuple(content['ego_translation']),
+            num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+            detection_name=content['detection_name'],
+            detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+            attribute_name=content['attribute_name'],
+            visibility=content['visibility'],
+            index=content['index'],
+        )
+
+
+def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool:
+    """
+    Check if a box is visible inside an image without accounting for occlusions.
+    :param box: The box to be checked.
+    :param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of <BoxVisibility>.
+    :return True if visibility condition is satisfied.
+    """
+
+    center_3d = box.center.reshape(3, 1)
+    center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :]
+
+    visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0])
+    visible = np.logical_and(visible, center_img[1, :] < imsize[1])
+    visible = np.logical_and(visible, center_img[1, :] > 0)
+    visible = np.logical_and(visible, center_3d[2, :] > 1)
+
+    in_front = center_3d[2, :] > 0.1  # True if a corner is at least 0.1 meter in front of the camera.
+
+    if vis_level == BoxVisibility.ALL:
+        return all(visible) and all(in_front)
+    elif vis_level == BoxVisibility.ANY:
+        return any(visible) and all(in_front)
+    elif vis_level == BoxVisibility.NONE:
+        return True
+    else:
+        raise ValueError("vis_level: {} not valid".format(vis_level))
+
+
+def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int],
+                                       vis_level: int = BoxVisibility.ANY) -> bool:
+    """
+    Check if a box is visible in images but not all corners in image .
+    :param box: The box to be checked.
+    :param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
+    :param imsize: (width, height).
+    :param vis_level: One of the enumerations of <BoxVisibility>.
+    :return True if visibility condition is satisfied.
+    """
+
+    corners_3d = box.corners()
+    corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
+
+    visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0])
+    visible = np.logical_and(visible, corners_img[1, :] < imsize[1])
+    visible = np.logical_and(visible, corners_img[1, :] > 0)
+    visible = np.logical_and(visible, corners_3d[2, :] > 1)
+
+    in_front = corners_3d[2, :] > 0.1  # True if a corner is at least 0.1 meter in front of the camera.
+
+    if any(visible) and not all(visible) and all(in_front):
+        return True
+    else:
+        return False
+
+
+def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False):
+    """
+    Loads ground truth boxes from DB.
+    :param nusc: A NuScenes instance.
+    :param eval_split: The evaluation split for which we load GT boxes.
+    :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
+    :param verbose: Whether to print messages to stdout.
+    :return: The GT boxes.
+    """
+
+    # Init.
+    if box_cls == DetectionBox_modified:
+        attribute_map = {a['token']: a['name'] for a in nusc.attribute}
+
+    if verbose:
+        print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version))
+    # Read out all sample_tokens in DB.
+    sample_tokens_all = [s['token'] for s in nusc.sample]
+    assert len(sample_tokens_all) > 0, "Error: Database has no samples!"
+
+    # Only keep samples from this split.
+    splits = create_splits_scenes()
+
+    # Check compatibility of split with nusc_version.
+    version = nusc.version
+    if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
+        assert version.endswith('trainval'), \
+            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+    elif eval_split in {'mini_train', 'mini_val'}:
+        assert version.endswith('mini'), \
+            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+    elif eval_split == 'test':
+        assert version.endswith('test'), \
+            'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
+    else:
+        raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.'
+                         .format(eval_split))
+
+    if eval_split == 'test':
+        # Check that you aren't trying to cheat :).
+        assert len(nusc.sample_annotation) > 0, \
+            'Error: You are trying to evaluate on the test set but you do not have the annotations!'
+    index_map = {}
+    for scene in nusc.scene:
+        first_sample_token = scene['first_sample_token']
+        sample = nusc.get('sample', first_sample_token)
+        index_map[first_sample_token] = 1
+        index = 2
+        while sample['next'] != '':
+            sample = nusc.get('sample', sample['next'])
+            index_map[sample['token']] = index
+            index += 1
+
+    sample_tokens = []
+    for sample_token in sample_tokens_all:
+        scene_token = nusc.get('sample', sample_token)['scene_token']
+        scene_record = nusc.get('scene', scene_token)
+        if scene_record['name'] in splits[eval_split]:
+            sample_tokens.append(sample_token)
+
+    all_annotations = EvalBoxes()
+
+    # Load annotations and filter predictions and annotations.
+    tracking_id_set = set()
+    for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):
+
+        sample = nusc.get('sample', sample_token)
+        sample_annotation_tokens = sample['anns']
+
+        sample_boxes = []
+        for sample_annotation_token in sample_annotation_tokens:
+
+            sample_annotation = nusc.get('sample_annotation', sample_annotation_token)
+            if box_cls == DetectionBox_modified:
+                # Get label name in detection task and filter unused labels.
+                detection_name = category_to_detection_name(sample_annotation['category_name'])
+                if detection_name is None:
+                    continue
+
+                # Get attribute_name.
+                attr_tokens = sample_annotation['attribute_tokens']
+                attr_count = len(attr_tokens)
+                if attr_count == 0:
+                    attribute_name = ''
+                elif attr_count == 1:
+                    attribute_name = attribute_map[attr_tokens[0]]
+                else:
+                    raise Exception('Error: GT annotations must not have more than one attribute!')
+
+                sample_boxes.append(
+                    box_cls(
+                        token=sample_annotation_token,
+                        sample_token=sample_token,
+                        translation=sample_annotation['translation'],
+                        size=sample_annotation['size'],
+                        rotation=sample_annotation['rotation'],
+                        velocity=nusc.box_velocity(sample_annotation['token'])[:2],
+                        num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'],
+                        detection_name=detection_name,
+                        detection_score=-1.0,  # GT samples do not have a score.
+                        attribute_name=attribute_name,
+                        visibility=sample_annotation['visibility_token'],
+                        index=index_map[sample_token]
+                    )
+                )
+            elif box_cls == TrackingBox:
+                assert False
+            else:
+                raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls)
+
+        all_annotations.add_boxes(sample_token, sample_boxes)
+
+    if verbose:
+        print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens)))
+
+    return all_annotations
+
+
+def filter_eval_boxes_by_id(nusc: NuScenes,
+                            eval_boxes: EvalBoxes,
+                            id=None,
+                            verbose: bool = False) -> EvalBoxes:
+    """
+    Applies filtering to boxes. Distance, bike-racks and points per box.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param is: the anns token set that used to keep bboxes.
+    :param verbose: Whether to print to stdout.
+    """
+
+    # Accumulators for number of filtered boxes.
+    total, anns_filter = 0, 0
+    for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+        # Filter on anns
+        total += len(eval_boxes[sample_token])
+        filtered_boxes = []
+        for box in eval_boxes[sample_token]:
+            if box.token in id:
+                filtered_boxes.append(box)
+        anns_filter += len(filtered_boxes)
+        eval_boxes.boxes[sample_token] = filtered_boxes
+
+    if verbose:
+        print("=> Original number of boxes: %d" % total)
+        print("=> After anns based filtering: %d" % anns_filter)
+
+    return eval_boxes
+
+
+def filter_eval_boxes_by_visibility(
+        ori_eval_boxes: EvalBoxes,
+        visibility=None,
+        verbose: bool = False) -> EvalBoxes:
+    """
+    Applies filtering to boxes. Distance, bike-racks and points per box.
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param is: the anns token set that used to keep bboxes.
+    :param verbose: Whether to print to stdout.
+    """
+
+    # Accumulators for number of filtered boxes.
+    eval_boxes = copy.deepcopy(ori_eval_boxes)
+    total, anns_filter = 0, 0
+    for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+        # Filter on anns
+        total += len(eval_boxes[sample_token])
+        filtered_boxes = []
+        for box in eval_boxes[sample_token]:
+            if box.visibility == visibility:
+                filtered_boxes.append(box)
+        anns_filter += len(filtered_boxes)
+        eval_boxes.boxes[sample_token] = filtered_boxes
+
+    if verbose:
+        print("=> Original number of boxes: %d" % total)
+        print("=> After visibility based filtering: %d" % anns_filter)
+
+    return eval_boxes
+
+
+def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[],  verbose=False):
+    eval_boxes = copy.deepcopy(ori_eval_boxes)
+    for sample_token in eval_boxes.sample_tokens:
+        if sample_token not in valid_sample_tokens:
+            eval_boxes.boxes.pop(sample_token)
+    return eval_boxes
+
+
+def filter_eval_boxes_by_overlap(nusc: NuScenes,
+                                 eval_boxes: EvalBoxes,
+                                 verbose: bool = False) -> EvalBoxes:
+    """
+    Applies filtering to boxes. basedon overlap .
+    :param nusc: An instance of the NuScenes class.
+    :param eval_boxes: An instance of the EvalBoxes class.
+    :param verbose: Whether to print to stdout.
+    """
+
+    # Accumulators for number of filtered boxes.
+    cams = ['CAM_FRONT',
+            'CAM_FRONT_RIGHT',
+            'CAM_BACK_RIGHT',
+            'CAM_BACK',
+            'CAM_BACK_LEFT',
+            'CAM_FRONT_LEFT']
+
+    total, anns_filter = 0, 0
+    for ind, sample_token in enumerate(eval_boxes.sample_tokens):
+
+        # Filter on anns
+        total += len(eval_boxes[sample_token])
+        sample_record = nusc.get('sample', sample_token)
+        filtered_boxes = []
+        for box in eval_boxes[sample_token]:
+            count = 0
+            for cam in cams:
+                '''
+                copy-paste form nuscens
+                '''
+                sample_data_token = sample_record['data'][cam]
+                sd_record = nusc.get('sample_data', sample_data_token)
+                cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+                sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+                pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+                cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+                imsize = (sd_record['width'], sd_record['height'])
+                new_box = Box(box.translation, box.size, Quaternion(box.rotation),
+                              name=box.detection_name, token='')
+
+                # Move box to ego vehicle coord system.
+                new_box.translate(-np.array(pose_record['translation']))
+                new_box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+                #  Move box to sensor coord system.
+                new_box.translate(-np.array(cs_record['translation']))
+                new_box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+                if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+                    count += 1
+                # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+                #    count += 1
+
+            if count > 1:
+                with open('center_overlap.txt', 'a') as f:
+                    try:
+                        f.write(box.token + '\n')
+                    except:
+                        pass
+                filtered_boxes.append(box)
+        anns_filter += len(filtered_boxes)
+        eval_boxes.boxes[sample_token] = filtered_boxes
+
+    verbose = True
+
+    if verbose:
+        print("=> Original number of boxes: %d" % total)
+        print("=> After anns based filtering: %d" % anns_filter)
+
+    return eval_boxes
+
+
+class NuScenesEval_custom(NuScenesEval):
+    """
+    Dummy class for backward-compatibility. Same as DetectionEval.
+    """
+
+    def __init__(self,
+                 nusc: NuScenes,
+                 config: DetectionConfig,
+                 result_path: str,
+                 eval_set: str,
+                 output_dir: str = None,
+                 verbose: bool = True,
+                 overlap_test=False,
+                 eval_mask=False,
+                 data_infos=None
+                 ):
+        """
+        Initialize a DetectionEval object.
+        :param nusc: A NuScenes object.
+        :param config: A DetectionConfig object.
+        :param result_path: Path of the nuScenes JSON result file.
+        :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
+        :param output_dir: Folder to save plots and results to.
+        :param verbose: Whether to print to stdout.
+        """
+
+        self.nusc = nusc
+        self.result_path = result_path
+        self.eval_set = eval_set
+        self.output_dir = output_dir
+        self.verbose = verbose
+        self.cfg = config
+        self.overlap_test = overlap_test
+        self.eval_mask = eval_mask
+        self.data_infos = data_infos
+        # Check result file exists.
+        assert os.path.exists(result_path), 'Error: The result file does not exist!'
+
+        # Make dirs.
+        self.plot_dir = os.path.join(self.output_dir, 'plots')
+        if not os.path.isdir(self.output_dir):
+            os.makedirs(self.output_dir)
+        if not os.path.isdir(self.plot_dir):
+            os.makedirs(self.plot_dir)
+
+        # Load data.
+        if verbose:
+            print('Initializing nuScenes detection evaluation')
+        self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox,
+                                                     verbose=verbose)
+        self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)
+
+        assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
+            "Samples in split doesn't match samples in predictions."
+
+        # Add center distances.
+        self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
+        self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
+
+        # Filter boxes (distance, points per box, etc.).
+
+        if verbose:
+            print('Filtering predictions')
+        self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose)
+        if verbose:
+            print('Filtering ground truth annotations')
+        self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose)
+
+        if self.overlap_test:
+            self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes)
+
+            self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True)
+
+        self.all_gt = copy.deepcopy(self.gt_boxes)
+        self.all_preds = copy.deepcopy(self.pred_boxes)
+        self.sample_tokens = self.gt_boxes.sample_tokens
+
+        self.index_map = {}
+        for scene in nusc.scene:
+            first_sample_token = scene['first_sample_token']
+            sample = nusc.get('sample', first_sample_token)
+            self.index_map[first_sample_token] = 1
+            index = 2
+            while sample['next'] != '':
+                sample = nusc.get('sample', sample['next'])
+                self.index_map[sample['token']] = index
+                index += 1
+
+    def update_gt(self, type_='vis', visibility='1', index=1):
+        if type_ == 'vis':
+            self.visibility_test = True
+            if self.visibility_test:
+                '''[{'description': 'visibility of whole object is between 0 and 40%',
+                'token': '1',
+                'level': 'v0-40'},
+                {'description': 'visibility of whole object is between 40 and 60%',
+                'token': '2',
+                'level': 'v40-60'},
+                {'description': 'visibility of whole object is between 60 and 80%',
+                'token': '3',
+                'level': 'v60-80'},
+                {'description': 'visibility of whole object is between 80 and 100%',
+                'token': '4',
+                'level': 'v80-100'}]'''
+
+                self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True)
+
+        elif type_ == 'ord':
+
+            valid_tokens = [key for (key, value) in self.index_map.items() if value == index]
+            # from IPython import embed
+            # embed()
+            self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens)
+            self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens)
+        self.sample_tokens = self.gt_boxes.sample_tokens
+
+
+    def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]:
+        """
+        Performs the actual evaluation.
+        :return: A tuple of high-level and the raw metric data.
+        """
+        start_time = time.time()
+
+        # -----------------------------------
+        # Step 1: Accumulate metric data for all classes and distance thresholds.
+        # -----------------------------------
+        if self.verbose:
+            print('Accumulating metric data...')
+        metric_data_list = DetectionMetricDataList()
+
+        # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths)
+        # self.cfg.dist_ths = [0.3]
+        # self.cfg.dist_fcn_callable
+        for class_name in self.cfg.class_names:
+            for dist_th in self.cfg.dist_ths:
+                md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
+                metric_data_list.set(class_name, dist_th, md)
+
+        # -----------------------------------
+        # Step 2: Calculate metrics from the data.
+        # -----------------------------------
+        if self.verbose:
+            print('Calculating metrics...')
+        metrics = DetectionMetrics(self.cfg)
+        for class_name in self.cfg.class_names:
+            # Compute APs.
+            for dist_th in self.cfg.dist_ths:
+                metric_data = metric_data_list[(class_name, dist_th)]
+                ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision)
+                metrics.add_label_ap(class_name, dist_th, ap)
+            # Compute TP metrics.
+            for metric_name in TP_METRICS:
+                metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)]
+                if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']:
+                    tp = np.nan
+                elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
+                    tp = np.nan
+                else:
+                    tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
+                metrics.add_label_tp(class_name, metric_name, tp)
+
+        # Compute evaluation time.
+        metrics.add_runtime(time.time() - start_time)
+
+        return metrics, metric_data_list
+
+    def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None:
+        """
+        Renders various PR and TP curves.
+        :param metrics: DetectionMetrics instance.
+        :param md_list: DetectionMetricDataList instance.
+        """
+        if self.verbose:
+            print('Rendering PR and TP curves')
+
+        def savepath(name):
+            return os.path.join(self.plot_dir, name + '.pdf')
+
+        summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall,
+                     dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary'))
+
+        for detection_name in self.cfg.class_names:
+            class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall,
+                           savepath=savepath(detection_name + '_pr'))
+
+            class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp,
+                           savepath=savepath(detection_name + '_tp'))
+
+        for dist_th in self.cfg.dist_ths:
+            dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall,
+                          savepath=savepath('dist_pr_' + str(dist_th)))
+
+
+if __name__ == "__main__":
+
+    # Settings.
+    parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('result_path', type=str, help='The submission as a JSON file.')
+    parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics',
+                        help='Folder to store result metrics, graphs and example visualizations.')
+    parser.add_argument('--eval_set', type=str, default='val',
+                        help='Which dataset split to evaluate on, train, val or test.')
+    parser.add_argument('--dataroot', type=str, default='data/nuscenes',
+                        help='Default nuScenes data directory.')
+    parser.add_argument('--version', type=str, default='v1.0-trainval',
+                        help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.')
+    parser.add_argument('--config_path', type=str, default='',
+                        help='Path to the configuration file.'
+                             'If no path given, the CVPR 2019 configuration will be used.')
+    parser.add_argument('--plot_examples', type=int, default=0,
+                        help='How many example visualizations to write to disk.')
+    parser.add_argument('--render_curves', type=int, default=1,
+                        help='Whether to render PR and TP curves to disk.')
+    parser.add_argument('--verbose', type=int, default=1,
+                        help='Whether to print to stdout.')
+    args = parser.parse_args()
+
+    result_path_ = os.path.expanduser(args.result_path)
+    output_dir_ = os.path.expanduser(args.output_dir)
+    eval_set_ = args.eval_set
+    dataroot_ = args.dataroot
+    version_ = args.version
+    config_path = args.config_path
+    plot_examples_ = args.plot_examples
+    render_curves_ = bool(args.render_curves)
+    verbose_ = bool(args.verbose)
+
+    if config_path == '':
+        cfg_ = config_factory('detection_cvpr_2019')
+    else:
+        with open(config_path, 'r') as _f:
+            cfg_ = DetectionConfig.deserialize(json.load(_f))
+
+    nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
+    nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_,
+                                    output_dir=output_dir_, verbose=verbose_)
+    for vis in ['1', '2', '3', '4']:
+        nusc_eval.update_gt(type_='vis', visibility=vis)
+        print(f'================ {vis} ===============')
+        nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_)
+    #for index in range(1, 41):
+    #    nusc_eval.update_gt(type_='ord', index=index)
+    #
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f6587b06835be928919aa6c4c74981468cc0a3c
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/__init__.py
@@ -0,0 +1,10 @@
+from .transform_3d import (
+    PadMultiViewImage, NormalizeMultiviewImage, 
+    PhotoMetricDistortionMultiViewImage, CustomCollect3D, RandomScaleImageMultiViewImage, CustomPointsRangeFilter)
+from .formating import CustomDefaultFormatBundle3D
+
+from .loading import CustomLoadPointsFromFile, CustomLoadPointsFromMultiSweeps, CustomLoadMultiViewImageFromFiles
+__all__ = [
+    'PadMultiViewImage', 'NormalizeMultiviewImage', 
+    'PhotoMetricDistortionMultiViewImage', 'CustomDefaultFormatBundle3D', 'CustomCollect3D', 'RandomScaleImageMultiViewImage'
+]
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/formating.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/formating.py
new file mode 100644
index 0000000000000000000000000000000000000000..d52a15c0a4a0063909b19907338808c06a820723
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/formating.py
@@ -0,0 +1,39 @@
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from mmcv.parallel import DataContainer as DC
+
+from mmdet3d.core.bbox import BaseInstance3DBoxes
+from mmdet3d.core.points import BasePoints
+from mmdet.datasets.builder import PIPELINES
+from mmdet.datasets.pipelines import to_tensor
+from mmdet3d.datasets.pipelines import DefaultFormatBundle3D
+
+@PIPELINES.register_module()
+class CustomDefaultFormatBundle3D(DefaultFormatBundle3D):
+    """Default formatting bundle.
+    It simplifies the pipeline of formatting common fields for voxels,
+    including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
+    "gt_semantic_seg".
+    These fields are formatted as follows.
+    - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
+    - proposals: (1)to tensor, (2)to DataContainer
+    - gt_bboxes: (1)to tensor, (2)to DataContainer
+    - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+    - gt_labels: (1)to tensor, (2)to DataContainer
+    """
+
+    def __call__(self, results):
+        """Call function to transform and format common fields in results.
+        Args:
+            results (dict): Result dict contains the data to convert.
+        Returns:
+            dict: The result dict contains the data that is formatted with
+                default bundle.
+        """
+        # Format 3D data
+        results = super(CustomDefaultFormatBundle3D, self).__call__(results)
+        results['gt_map_masks'] = DC(
+            to_tensor(results['gt_map_masks']), stack=True)
+
+        return results
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/loading.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6b1b6ae3fe070d80a546d332cb26b3de466d004
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/loading.py
@@ -0,0 +1,368 @@
+import os
+from typing import Any, Dict, Tuple
+
+import mmcv
+import numpy as np
+from nuscenes.map_expansion.map_api import NuScenesMap
+from nuscenes.map_expansion.map_api import locations as LOCATIONS
+from PIL import Image
+
+
+from mmdet3d.core.points import BasePoints, get_points_type
+from mmdet.datasets.builder import PIPELINES
+from mmdet.datasets.pipelines import LoadAnnotations
+
+from .loading_utils import load_augmented_point_cloud, reduce_LiDAR_beams
+
+
+@PIPELINES.register_module()
+class CustomLoadMultiViewImageFromFiles(object):
+    """Load multi channel images from a list of separate channel files.
+
+    Expects results['img_filename'] to be a list of filenames.
+
+    Args:
+        to_float32 (bool): Whether to convert the img to float32.
+            Defaults to False.
+        color_type (str): Color type of the file. Defaults to 'unchanged'.
+    """
+
+    def __init__(self, to_float32=False, padding=True,pad_val=128, color_type='unchanged'):
+        self.to_float32 = to_float32
+        self.color_type = color_type
+        self.padding = padding
+        self.pad_val = pad_val
+
+    def __call__(self, results):
+        """Call function to load multi-view image from files.
+
+        Args:
+            results (dict): Result dict containing multi-view image filenames.
+
+        Returns:
+            dict: The result dict containing the multi-view image data. \
+                Added keys and values are described below.
+
+                - filename (str): Multi-view image filenames.
+                - img (np.ndarray): Multi-view image arrays.
+                - img_shape (tuple[int]): Shape of multi-view image arrays.
+                - ori_shape (tuple[int]): Shape of original image arrays.
+                - pad_shape (tuple[int]): Shape of padded image arrays.
+                - scale_factor (float): Scale factor.
+                - img_norm_cfg (dict): Normalization configuration of images.
+        """
+        filename = results['img_filename']
+        # img is of shape (h, w, c, num_views)
+        # img = np.stack(
+        #     [mmcv.imread(name, self.color_type) for name in filename], axis=-1)
+        img_list = [mmcv.imread(name, self.color_type) for name in filename]
+        img_shape_list = [img.shape for img in img_list]
+        max_h = max([shape[0] for shape in img_shape_list])
+        max_w = max([shape[1] for shape in img_shape_list])
+        size = (max_h, max_w)
+        # import pdb;pdb.set_trace()
+        img_list = [mmcv.impad(
+                    img, shape=size, pad_val=self.pad_val) for img in img_list]
+
+        img = np.stack(img_list,axis=-1)
+        if self.to_float32:
+            img = img.astype(np.float32)
+        results['filename'] = filename
+        # unravel to list, see `DefaultFormatBundle` in formating.py
+        # which will transpose each image separately and then stack into array
+        results['img'] = [img[..., i] for i in range(img.shape[-1])]
+        results['img_shape'] = img.shape
+        results['ori_shape'] = img.shape
+        # Set initial values for default meta_keys
+        results['pad_shape'] = img.shape
+        results['scale_factor'] = 1.0
+        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+        results['img_norm_cfg'] = dict(
+            mean=np.zeros(num_channels, dtype=np.float32),
+            std=np.ones(num_channels, dtype=np.float32),
+            to_rgb=False)
+        return results
+
+    def __repr__(self):
+        """str: Return a string that describes the module."""
+        repr_str = self.__class__.__name__
+        repr_str += f'(to_float32={self.to_float32}, '
+        repr_str += f"color_type='{self.color_type}')"
+        return repr_str
+
+@PIPELINES.register_module()
+class CustomLoadPointsFromMultiSweeps:
+    """Load points from multiple sweeps.
+
+    This is usually used for nuScenes dataset to utilize previous sweeps.
+
+    Args:
+        sweeps_num (int): Number of sweeps. Defaults to 10.
+        load_dim (int): Dimension number of the loaded points. Defaults to 5.
+        use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4].
+        pad_empty_sweeps (bool): Whether to repeat keyframe when
+            sweeps is empty. Defaults to False.
+        remove_close (bool): Whether to remove close points.
+            Defaults to False.
+        test_mode (bool): If test_model=True used for testing, it will not
+            randomly sample sweeps but select the nearest N frames.
+            Defaults to False.
+    """
+
+    def __init__(
+        self,
+        sweeps_num=10,
+        load_dim=5,
+        use_dim=(0, 1, 2, 4),
+        pad_empty_sweeps=False,
+        remove_close=False,
+        test_mode=False,
+        load_augmented=None,
+        reduce_beams=None,
+    ):
+        self.load_dim = load_dim
+        self.sweeps_num = sweeps_num
+        if isinstance(use_dim, int):
+            use_dim = list(range(use_dim))
+        self.use_dim = use_dim
+        self.pad_empty_sweeps = pad_empty_sweeps
+        self.remove_close = remove_close
+        self.test_mode = test_mode
+        self.load_augmented = load_augmented
+        self.reduce_beams = reduce_beams
+
+    def _load_points(self, lidar_path):
+        """Private function to load point clouds data.
+
+        Args:
+            lidar_path (str): Filename of point clouds data.
+
+        Returns:
+            np.ndarray: An array containing point clouds data.
+        """
+        mmcv.check_file_exist(lidar_path)
+        if self.load_augmented:
+            assert self.load_augmented in ["pointpainting", "mvp"]
+            virtual = self.load_augmented == "mvp"
+            points = load_augmented_point_cloud(
+                lidar_path, virtual=virtual, reduce_beams=self.reduce_beams
+            )
+        elif lidar_path.endswith(".npy"):
+            points = np.load(lidar_path)
+        else:
+            points = np.fromfile(lidar_path, dtype=np.float32)
+        return points
+
+    def _remove_close(self, points, radius=1.0):
+        """Removes point too close within a certain radius from origin.
+
+        Args:
+            points (np.ndarray | :obj:`BasePoints`): Sweep points.
+            radius (float): Radius below which points are removed.
+                Defaults to 1.0.
+
+        Returns:
+            np.ndarray: Points after removing.
+        """
+        if isinstance(points, np.ndarray):
+            points_numpy = points
+        elif isinstance(points, BasePoints):
+            points_numpy = points.tensor.numpy()
+        else:
+            raise NotImplementedError
+        x_filt = np.abs(points_numpy[:, 0]) < radius
+        y_filt = np.abs(points_numpy[:, 1]) < radius
+        not_close = np.logical_not(np.logical_and(x_filt, y_filt))
+        return points[not_close]
+
+    def __call__(self, results):
+        """Call function to load multi-sweep point clouds from files.
+
+        Args:
+            results (dict): Result dict containing multi-sweep point cloud \
+                filenames.
+
+        Returns:
+            dict: The result dict containing the multi-sweep points data. \
+                Added key and value are described below.
+
+                - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \
+                    cloud arrays.
+        """
+        points = results["points"]
+        points.tensor[:, 4] = 0
+        sweep_points_list = [points]
+        ts = results["timestamp"] / 1e6
+        if self.pad_empty_sweeps and len(results["sweeps"]) == 0:
+            for i in range(self.sweeps_num):
+                if self.remove_close:
+                    sweep_points_list.append(self._remove_close(points))
+                else:
+                    sweep_points_list.append(points)
+        else:
+            if len(results["sweeps"]) <= self.sweeps_num:
+                choices = np.arange(len(results["sweeps"]))
+            elif self.test_mode:
+                choices = np.arange(self.sweeps_num)
+            else:
+                # NOTE: seems possible to load frame -11?
+                if not self.load_augmented:
+                    choices = np.random.choice(
+                        len(results["sweeps"]), self.sweeps_num, replace=False
+                    )
+                else:
+                    # don't allow to sample the earliest frame, match with Tianwei's implementation.
+                    choices = np.random.choice(
+                        len(results["sweeps"]) - 1, self.sweeps_num, replace=False
+                    )
+            for idx in choices:
+                sweep = results["sweeps"][idx]
+                points_sweep = self._load_points(sweep["data_path"])
+                points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim)
+
+                # TODO: make it more general
+                if self.reduce_beams and self.reduce_beams < 32:
+                    points_sweep = reduce_LiDAR_beams(points_sweep, self.reduce_beams)
+
+                if self.remove_close:
+                    points_sweep = self._remove_close(points_sweep)
+                sweep_ts = sweep["timestamp"] / 1e6
+                points_sweep[:, :3] = (
+                    points_sweep[:, :3] @ sweep["sensor2lidar_rotation"].T
+                )
+                points_sweep[:, :3] += sweep["sensor2lidar_translation"]
+                points_sweep[:, 4] = ts - sweep_ts
+                points_sweep = points.new_point(points_sweep)
+                sweep_points_list.append(points_sweep)
+
+        points = points.cat(sweep_points_list)
+        points = points[:, self.use_dim]
+        results["points"] = points
+        return results
+
+    def __repr__(self):
+        """str: Return a string that describes the module."""
+        return f"{self.__class__.__name__}(sweeps_num={self.sweeps_num})"
+
+
+
+@PIPELINES.register_module()
+class CustomLoadPointsFromFile:
+    """Load Points From File.
+
+    Load sunrgbd and scannet points from file.
+
+    Args:
+        coord_type (str): The type of coordinates of points cloud.
+            Available options includes:
+            - 'LIDAR': Points in LiDAR coordinates.
+            - 'DEPTH': Points in depth coordinates, usually for indoor dataset.
+            - 'CAMERA': Points in camera coordinates.
+        load_dim (int): The dimension of the loaded points.
+            Defaults to 6.
+        use_dim (list[int]): Which dimensions of the points to be used.
+            Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
+            or use_dim=[0, 1, 2, 3] to use the intensity dimension.
+        shift_height (bool): Whether to use shifted height. Defaults to False.
+        use_color (bool): Whether to use color features. Defaults to False.
+    """
+
+    def __init__(
+        self,
+        coord_type,
+        load_dim=6,
+        use_dim=(0, 1, 2),
+        shift_height=False,
+        use_color=False,
+        load_augmented=None,
+        reduce_beams=None,
+    ):
+        self.shift_height = shift_height
+        self.use_color = use_color
+        if isinstance(use_dim, int):
+            use_dim = list(range(use_dim))
+        assert (
+            max(use_dim) < load_dim
+        ), f"Expect all used dimensions < {load_dim}, got {use_dim}"
+        assert coord_type in ["CAMERA", "LIDAR", "DEPTH"]
+
+        self.coord_type = coord_type
+        self.load_dim = load_dim
+        self.use_dim = use_dim
+        self.load_augmented = load_augmented
+        self.reduce_beams = reduce_beams
+
+    def _load_points(self, lidar_path):
+        """Private function to load point clouds data.
+
+        Args:
+            lidar_path (str): Filename of point clouds data.
+
+        Returns:
+            np.ndarray: An array containing point clouds data.
+        """
+        mmcv.check_file_exist(lidar_path)
+        if self.load_augmented:
+            assert self.load_augmented in ["pointpainting", "mvp"]
+            virtual = self.load_augmented == "mvp"
+            points = load_augmented_point_cloud(
+                lidar_path, virtual=virtual, reduce_beams=self.reduce_beams
+            )
+        elif lidar_path.endswith(".npy"):
+            points = np.load(lidar_path)
+        else:
+            points = np.fromfile(lidar_path, dtype=np.float32)
+
+        return points
+
+    def __call__(self, results):
+        """Call function to load points data from file.
+
+        Args:
+            results (dict): Result dict containing point clouds data.
+
+        Returns:
+            dict: The result dict containing the point clouds data. \
+                Added key and value are described below.
+
+                - points (:obj:`BasePoints`): Point clouds data.
+        """
+        lidar_path = results["lidar_path"]
+        points = self._load_points(lidar_path)
+        points = points.reshape(-1, self.load_dim)
+        # TODO: make it more general
+        if self.reduce_beams and self.reduce_beams < 32:
+            points = reduce_LiDAR_beams(points, self.reduce_beams)
+        points = points[:, self.use_dim]
+        attribute_dims = None
+
+        if self.shift_height:
+            floor_height = np.percentile(points[:, 2], 0.99)
+            height = points[:, 2] - floor_height
+            points = np.concatenate(
+                [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1
+            )
+            attribute_dims = dict(height=3)
+
+        if self.use_color:
+            assert len(self.use_dim) >= 6
+            if attribute_dims is None:
+                attribute_dims = dict()
+            attribute_dims.update(
+                dict(
+                    color=[
+                        points.shape[1] - 3,
+                        points.shape[1] - 2,
+                        points.shape[1] - 1,
+                    ]
+                )
+            )
+
+        points_class = get_points_type(self.coord_type)
+        points = points_class(
+            points, points_dim=points.shape[-1], attribute_dims=attribute_dims
+        )
+        results["points"] = points
+
+        return results
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/loading_utils.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/loading_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c66dbb2d8cb0865d189d22ce17b91d73da342ffb
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/loading_utils.py
@@ -0,0 +1,107 @@
+import os
+
+import numpy as np
+import torch
+
+__all__ = ["load_augmented_point_cloud", "reduce_LiDAR_beams"]
+
+
+def load_augmented_point_cloud(path, virtual=False, reduce_beams=32):
+    # NOTE: following Tianwei's implementation, it is hard coded for nuScenes
+    points = np.fromfile(path, dtype=np.float32).reshape(-1, 5)
+    # NOTE: path definition different from Tianwei's implementation.
+    tokens = path.split("/")
+    vp_dir = "_VIRTUAL" if reduce_beams == 32 else f"_VIRTUAL_{reduce_beams}BEAMS"
+    seg_path = os.path.join(
+        *tokens[:-3],
+        "virtual_points",
+        tokens[-3],
+        tokens[-2] + vp_dir,
+        tokens[-1] + ".pkl.npy",
+    )
+    assert os.path.exists(seg_path)
+    data_dict = np.load(seg_path, allow_pickle=True).item()
+
+    virtual_points1 = data_dict["real_points"]
+    # NOTE: add zero reflectance to virtual points instead of removing them from real points
+    virtual_points2 = np.concatenate(
+        [
+            data_dict["virtual_points"][:, :3],
+            np.zeros([data_dict["virtual_points"].shape[0], 1]),
+            data_dict["virtual_points"][:, 3:],
+        ],
+        axis=-1,
+    )
+
+    points = np.concatenate(
+        [
+            points,
+            np.ones([points.shape[0], virtual_points1.shape[1] - points.shape[1] + 1]),
+        ],
+        axis=1,
+    )
+    virtual_points1 = np.concatenate(
+        [virtual_points1, np.zeros([virtual_points1.shape[0], 1])], axis=1
+    )
+    # note: this part is different from Tianwei's implementation, we don't have duplicate foreground real points.
+    if len(data_dict["real_points_indice"]) > 0:
+        points[data_dict["real_points_indice"]] = virtual_points1
+    if virtual:
+        virtual_points2 = np.concatenate(
+            [virtual_points2, -1 * np.ones([virtual_points2.shape[0], 1])], axis=1
+        )
+        points = np.concatenate([points, virtual_points2], axis=0).astype(np.float32)
+    return points
+
+
+def reduce_LiDAR_beams(pts, reduce_beams_to=32):
+    # print(pts.size())
+    if isinstance(pts, np.ndarray):
+        pts = torch.from_numpy(pts)
+    radius = torch.sqrt(pts[:, 0].pow(2) + pts[:, 1].pow(2) + pts[:, 2].pow(2))
+    sine_theta = pts[:, 2] / radius
+    # [-pi/2, pi/2]
+    theta = torch.asin(sine_theta)
+    phi = torch.atan2(pts[:, 1], pts[:, 0])
+
+    top_ang = 0.1862
+    down_ang = -0.5353
+
+    beam_range = torch.zeros(32)
+    beam_range[0] = top_ang
+    beam_range[31] = down_ang
+
+    for i in range(1, 31):
+        beam_range[i] = beam_range[i - 1] - 0.023275
+    # beam_range = [1, 0.18, 0.15, 0.13, 0.11, 0.085, 0.065, 0.03, 0.01, -0.01, -0.03, -0.055, -0.08, -0.105, -0.13, -0.155, -0.18, -0.205, -0.228, -0.251, -0.275,
+    #                -0.295, -0.32, -0.34, -0.36, -0.38, -0.40, -0.425, -0.45, -0.47, -0.49, -0.52, -0.54]
+
+    num_pts, _ = pts.size()
+    mask = torch.zeros(num_pts)
+    if reduce_beams_to == 16:
+        for id in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]:
+            beam_mask = (theta < (beam_range[id - 1] - 0.012)) * (
+                theta > (beam_range[id] - 0.012)
+            )
+            mask = mask + beam_mask
+        mask = mask.bool()
+    elif reduce_beams_to == 4:
+        for id in [7, 9, 11, 13]:
+            beam_mask = (theta < (beam_range[id - 1] - 0.012)) * (
+                theta > (beam_range[id] - 0.012)
+            )
+            mask = mask + beam_mask
+        mask = mask.bool()
+    # [?] pick the 14th beam
+    elif reduce_beams_to == 1:
+        chosen_beam_id = 9
+        mask = (theta < (beam_range[chosen_beam_id - 1] - 0.012)) * (
+            theta > (beam_range[chosen_beam_id] - 0.012)
+        )
+    else:
+        raise NotImplementedError
+    # points = copy.copy(pts)
+    points = pts[mask]
+    # print(points.size())
+    return points.numpy()
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..68a4e2ce5822a06bc7e475c1657a3eed646f5285
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py
@@ -0,0 +1,355 @@
+import numpy as np
+from numpy import random
+import mmcv
+from mmdet.datasets.builder import PIPELINES
+from mmcv.parallel import DataContainer as DC
+
+@PIPELINES.register_module()
+class PadMultiViewImage(object):
+    """Pad the multi-view image.
+    There are two padding modes: (1) pad to a fixed size and (2) pad to the
+    minimum size that is divisible by some number.
+    Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
+    Args:
+        size (tuple, optional): Fixed padding size.
+        size_divisor (int, optional): The divisor of padded size.
+        pad_val (float, optional): Padding value, 0 by default.
+    """
+
+    def __init__(self, size=None, size_divisor=None, pad_val=0):
+        self.size = size
+        self.size_divisor = size_divisor
+        self.pad_val = pad_val
+        # only one of size and size_divisor should be valid
+        assert size is not None or size_divisor is not None
+        assert size is None or size_divisor is None
+
+    def _pad_img(self, results):
+        """Pad images according to ``self.size``."""
+        if self.size is not None:
+            padded_img = [mmcv.impad(
+                img, shape=self.size, pad_val=self.pad_val) for img in results['img']]
+        elif self.size_divisor is not None:
+            padded_img = [mmcv.impad_to_multiple(
+                img, self.size_divisor, pad_val=self.pad_val) for img in results['img']]
+        
+        results['ori_shape'] = [img.shape for img in results['img']]
+        results['img'] = padded_img
+        results['img_shape'] = [img.shape for img in padded_img]
+        results['pad_shape'] = [img.shape for img in padded_img]
+        results['pad_fixed_size'] = self.size
+        results['pad_size_divisor'] = self.size_divisor
+
+    def __call__(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Updated result dict.
+        """
+        self._pad_img(results)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(size={self.size}, '
+        repr_str += f'size_divisor={self.size_divisor}, '
+        repr_str += f'pad_val={self.pad_val})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class NormalizeMultiviewImage(object):
+    """Normalize the image.
+    Added key is "img_norm_cfg".
+    Args:
+        mean (sequence): Mean values of 3 channels.
+        std (sequence): Std values of 3 channels.
+        to_rgb (bool): Whether to convert the image from BGR to RGB,
+            default is true.
+    """
+
+    def __init__(self, mean, std, to_rgb=True):
+        self.mean = np.array(mean, dtype=np.float32)
+        self.std = np.array(std, dtype=np.float32)
+        self.to_rgb = to_rgb
+
+
+    def __call__(self, results):
+        """Call function to normalize images.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Normalized results, 'img_norm_cfg' key is added into
+                result dict.
+        """
+
+        results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']]
+        results['img_norm_cfg'] = dict(
+            mean=self.mean, std=self.std, to_rgb=self.to_rgb)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
+        return repr_str
+
+
+@PIPELINES.register_module()
+class PhotoMetricDistortionMultiViewImage:
+    """Apply photometric distortion to image sequentially, every transformation
+    is applied with a probability of 0.5. The position of random contrast is in
+    second or second to last.
+    1. random brightness
+    2. random contrast (mode 0)
+    3. convert color from BGR to HSV
+    4. random saturation
+    5. random hue
+    6. convert color from HSV to BGR
+    7. random contrast (mode 1)
+    8. randomly swap channels
+    Args:
+        brightness_delta (int): delta of brightness.
+        contrast_range (tuple): range of contrast.
+        saturation_range (tuple): range of saturation.
+        hue_delta (int): delta of hue.
+    """
+
+    def __init__(self,
+                 brightness_delta=32,
+                 contrast_range=(0.5, 1.5),
+                 saturation_range=(0.5, 1.5),
+                 hue_delta=18):
+        self.brightness_delta = brightness_delta
+        self.contrast_lower, self.contrast_upper = contrast_range
+        self.saturation_lower, self.saturation_upper = saturation_range
+        self.hue_delta = hue_delta
+
+    def __call__(self, results):
+        """Call function to perform photometric distortion on images.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Result dict with images distorted.
+        """
+        imgs = results['img']
+        new_imgs = []
+        for img in imgs:
+            assert img.dtype == np.float32, \
+                'PhotoMetricDistortion needs the input image of dtype np.float32,'\
+                ' please set "to_float32=True" in "LoadImageFromFile" pipeline'
+            # random brightness
+            if random.randint(2):
+                delta = random.uniform(-self.brightness_delta,
+                                    self.brightness_delta)
+                img += delta
+
+            # mode == 0 --> do random contrast first
+            # mode == 1 --> do random contrast last
+            mode = random.randint(2)
+            if mode == 1:
+                if random.randint(2):
+                    alpha = random.uniform(self.contrast_lower,
+                                        self.contrast_upper)
+                    img *= alpha
+
+            # convert color from BGR to HSV
+            img = mmcv.bgr2hsv(img)
+
+            # random saturation
+            if random.randint(2):
+                img[..., 1] *= random.uniform(self.saturation_lower,
+                                            self.saturation_upper)
+
+            # random hue
+            if random.randint(2):
+                img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
+                img[..., 0][img[..., 0] > 360] -= 360
+                img[..., 0][img[..., 0] < 0] += 360
+
+            # convert color from HSV to BGR
+            img = mmcv.hsv2bgr(img)
+
+            # random contrast
+            if mode == 0:
+                if random.randint(2):
+                    alpha = random.uniform(self.contrast_lower,
+                                        self.contrast_upper)
+                    img *= alpha
+
+            # randomly swap channels
+            if random.randint(2):
+                img = img[..., random.permutation(3)]
+            new_imgs.append(img)
+        results['img'] = new_imgs
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
+        repr_str += 'contrast_range='
+        repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
+        repr_str += 'saturation_range='
+        repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
+        repr_str += f'hue_delta={self.hue_delta})'
+        return repr_str
+
+
+
+@PIPELINES.register_module()
+class CustomCollect3D(object):
+    """Collect data from the loader relevant to the specific task.
+    This is usually the last stage of the data loader pipeline. Typically keys
+    is set to some subset of "img", "proposals", "gt_bboxes",
+    "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
+    The "img_meta" item is always populated.  The contents of the "img_meta"
+    dictionary depends on "meta_keys". By default this includes:
+        - 'img_shape': shape of the image input to the network as a tuple \
+            (h, w, c).  Note that images may be zero padded on the \
+            bottom/right if the batch tensor is larger than this shape.
+        - 'scale_factor': a float indicating the preprocessing scale
+        - 'flip': a boolean indicating if image flip transform was used
+        - 'filename': path to the image file
+        - 'ori_shape': original shape of the image as a tuple (h, w, c)
+        - 'pad_shape': image shape after padding
+        - 'lidar2img': transform from lidar to image
+        - 'depth2img': transform from depth to image
+        - 'cam2img': transform from camera to image
+        - 'pcd_horizontal_flip': a boolean indicating if point cloud is \
+            flipped horizontally
+        - 'pcd_vertical_flip': a boolean indicating if point cloud is \
+            flipped vertically
+        - 'box_mode_3d': 3D box mode
+        - 'box_type_3d': 3D box type
+        - 'img_norm_cfg': a dict of normalization information:
+            - mean: per channel mean subtraction
+            - std: per channel std divisor
+            - to_rgb: bool indicating if bgr was converted to rgb
+        - 'pcd_trans': point cloud transformations
+        - 'sample_idx': sample index
+        - 'pcd_scale_factor': point cloud scale factor
+        - 'pcd_rotation': rotation applied to point cloud
+        - 'pts_filename': path to point cloud file.
+    Args:
+        keys (Sequence[str]): Keys of results to be collected in ``data``.
+        meta_keys (Sequence[str], optional): Meta keys to be converted to
+            ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+            Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
+            'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
+            'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
+            'box_type_3d', 'img_norm_cfg', 'pcd_trans',
+            'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
+    """
+
+    def __init__(self,
+                 keys,
+                 meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
+                            'depth2img', 'cam2img', 'pad_shape',
+                            'scale_factor', 'flip', 'pcd_horizontal_flip',
+                            'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+                            'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx',
+                            'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
+                            'transformation_3d_flow', 'scene_token',
+                            'can_bus','lidar2global',
+                            'camera2ego','camera_intrinsics','img_aug_matrix','lidar2ego'
+                            )):
+        self.keys = keys
+        self.meta_keys = meta_keys
+
+    def __call__(self, results):
+        """Call function to collect keys in results. The keys in ``meta_keys``
+        will be converted to :obj:`mmcv.DataContainer`.
+        Args:
+            results (dict): Result dict contains the data to collect.
+        Returns:
+            dict: The result dict contains the following keys
+                - keys in ``self.keys``
+                - ``img_metas``
+        """
+       
+        data = {}
+        img_metas = {}
+      
+        for key in self.meta_keys:
+            if key in results:
+                img_metas[key] = results[key]
+
+        data['img_metas'] = DC(img_metas, cpu_only=True)
+        for key in self.keys:
+            data[key] = results[key]
+        return data
+
+    def __repr__(self):
+        """str: Return a string that describes the module."""
+        return self.__class__.__name__ + \
+            f'(keys={self.keys}, meta_keys={self.meta_keys})'
+
+
+
+@PIPELINES.register_module()
+class RandomScaleImageMultiViewImage(object):
+    """Random scale the image
+    Args:
+        scales
+    """
+
+    def __init__(self, scales=[]):
+        self.scales = scales
+        assert len(self.scales)==1
+
+    def __call__(self, results):
+        """Call function to pad images, masks, semantic segmentation maps.
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Updated result dict.
+        """
+        rand_ind = np.random.permutation(range(len(self.scales)))[0]
+        rand_scale = self.scales[rand_ind]
+
+        y_size = [int(img.shape[0] * rand_scale) for img in results['img']]
+        x_size = [int(img.shape[1] * rand_scale) for img in results['img']]
+        scale_factor = np.eye(4)
+        scale_factor[0, 0] *= rand_scale
+        scale_factor[1, 1] *= rand_scale
+        results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in
+                          enumerate(results['img'])]
+        lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']]
+        img_aug_matrix = [scale_factor for _ in results['lidar2img']]
+        results['lidar2img'] = lidar2img
+        results['img_aug_matrix'] = img_aug_matrix
+        results['img_shape'] = [img.shape for img in results['img']]
+        results['ori_shape'] = [img.shape for img in results['img']]
+
+        return results
+
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(size={self.scales}, '
+        return repr_str
+
+
+@PIPELINES.register_module()
+class CustomPointsRangeFilter:
+    """Filter points by the range.
+    Args:
+        point_cloud_range (list[float]): Point cloud range.
+    """
+
+    def __init__(self, point_cloud_range):
+        self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
+
+    def __call__(self, data):
+        """Call function to filter points by the range.
+        Args:
+            data (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Results after filtering, 'points', 'pts_instance_mask' \
+                and 'pts_semantic_mask' keys are updated in the result dict.
+        """
+        points = data["points"]
+        points_mask = points.in_range_3d(self.pcd_range)
+        clean_points = points[points_mask]
+        data["points"] = clean_points
+        return data
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb2a0b17769a958042583dcb4c8c4a4f51636f4c
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/__init__.py
@@ -0,0 +1,4 @@
+from .group_sampler import DistributedGroupSampler
+from .distributed_sampler import DistributedSampler
+from .sampler import SAMPLER, build_sampler
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2913de99253be744a308bbc24c5bcaf3cd4a857c
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py
@@ -0,0 +1,41 @@
+import math
+
+import torch
+from torch.utils.data import DistributedSampler as _DistributedSampler
+from .sampler import SAMPLER
+
+
+@SAMPLER.register_module()
+class DistributedSampler(_DistributedSampler):
+
+    def __init__(self,
+                 dataset=None,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=True,
+                 seed=0):
+        super().__init__(
+            dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+        # for the compatibility from PyTorch 1.3+
+        self.seed = seed if seed is not None else 0
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        if self.shuffle:
+            assert False
+        else:
+            indices = torch.arange(len(self.dataset)).tolist()
+
+        # add extra samples to make it evenly divisible
+        # in case that indices is shorter than half of total_size
+        indices = (indices *
+                   math.ceil(self.total_size / len(indices)))[:self.total_size]
+        assert len(indices) == self.total_size
+
+        # subsample
+        per_replicas = self.total_size//self.num_replicas
+        # indices = indices[self.rank:self.total_size:self.num_replicas]
+        indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..16c59e5f3dd880ba185247acfba6eae354deb771
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py
@@ -0,0 +1,110 @@
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import numpy as np
+import torch
+from mmcv.runner import get_dist_info
+from torch.utils.data import Sampler
+from .sampler import SAMPLER
+import random
+from IPython import embed
+
+
+@SAMPLER.register_module()
+class DistributedGroupSampler(Sampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+    Arguments:
+        dataset: Dataset used for sampling.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+        seed (int, optional): random seed used to shuffle the sampler if
+            ``shuffle=True``. This number should be identical across all
+            processes in the distributed group. Default: 0.
+    """
+
+    def __init__(self,
+                 dataset,
+                 samples_per_gpu=1,
+                 num_replicas=None,
+                 rank=None,
+                 seed=0):
+        _rank, _num_replicas = get_dist_info()
+        if num_replicas is None:
+            num_replicas = _num_replicas
+        if rank is None:
+            rank = _rank
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.seed = seed if seed is not None else 0
+
+        assert hasattr(self.dataset, 'flag')
+        self.flag = self.dataset.flag
+        self.group_sizes = np.bincount(self.flag)
+
+        self.num_samples = 0
+        for i, j in enumerate(self.group_sizes):
+            self.num_samples += int(
+                math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
+                          self.num_replicas)) * self.samples_per_gpu
+        self.total_size = self.num_samples * self.num_replicas
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch + self.seed)
+
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size > 0:
+                indice = np.where(self.flag == i)[0]
+                assert len(indice) == size
+                # add .numpy() to avoid bug when selecting indice in parrots.
+                # TODO: check whether torch.randperm() can be replaced by
+                # numpy.random.permutation().
+                indice = indice[list(
+                    torch.randperm(int(size), generator=g).numpy())].tolist()
+                extra = int(
+                    math.ceil(
+                        size * 1.0 / self.samples_per_gpu / self.num_replicas)
+                ) * self.samples_per_gpu * self.num_replicas - len(indice)
+                # pad indice
+                tmp = indice.copy()
+                for _ in range(extra // size):
+                    indice.extend(tmp)
+                indice.extend(tmp[:extra % size])
+                indices.extend(indice)
+
+        assert len(indices) == self.total_size
+
+        indices = [
+            indices[j] for i in list(
+                torch.randperm(
+                    len(indices) // self.samples_per_gpu, generator=g))
+            for j in range(i * self.samples_per_gpu, (i + 1) *
+                           self.samples_per_gpu)
+        ]
+
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/sampler.py b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..30276cff64d0df162daa9094f6f46fdb07848132
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/datasets/samplers/sampler.py
@@ -0,0 +1,7 @@
+from mmcv.utils.registry import Registry, build_from_cfg
+
+SAMPLER = Registry('sampler')
+
+
+def build_sampler(cfg, default_args):
+    return build_from_cfg(cfg, SAMPLER, default_args)
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..df5f2d359f4cd5292cce95c32b859abbab97529c
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/__init__.py
@@ -0,0 +1,5 @@
+from .assigners import *
+from .dense_heads import *
+from .detectors import *
+from .modules import *
+from .losses import *
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/assigners/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/assigners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6c4b0ff320a2a38821dba476c16729d25246feb
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/assigners/__init__.py
@@ -0,0 +1 @@
+from .maptr_assigner import MapTRAssigner
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/assigners/maptr_assigner.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/assigners/maptr_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..80d3c9090680df4c7377dd2672aa0b21d60816cd
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/assigners/maptr_assigner.py
@@ -0,0 +1,196 @@
+import torch
+from mmdet.core.bbox.builder import BBOX_ASSIGNERS
+from mmdet.core.bbox.assigners import AssignResult
+from mmdet.core.bbox.assigners import BaseAssigner
+from mmdet.core.bbox.match_costs import build_match_cost
+import torch.nn.functional as F
+from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh, bbox_cxcywh_to_xyxy
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+def normalize_2d_bbox(bboxes, pc_range):
+
+    patch_h = pc_range[4]-pc_range[1]
+    patch_w = pc_range[3]-pc_range[0]
+    cxcywh_bboxes = bbox_xyxy_to_cxcywh(bboxes)
+    cxcywh_bboxes[...,0:1] = cxcywh_bboxes[..., 0:1] - pc_range[0]
+    cxcywh_bboxes[...,1:2] = cxcywh_bboxes[...,1:2] - pc_range[1]
+    factor = bboxes.new_tensor([patch_w, patch_h,patch_w,patch_h])
+
+    normalized_bboxes = cxcywh_bboxes / factor
+    return normalized_bboxes
+
+def normalize_2d_pts(pts, pc_range):
+    patch_h = pc_range[4]-pc_range[1]
+    patch_w = pc_range[3]-pc_range[0]
+    new_pts = pts.clone()
+    new_pts[...,0:1] = pts[..., 0:1] - pc_range[0]
+    new_pts[...,1:2] = pts[...,1:2] - pc_range[1]
+    factor = pts.new_tensor([patch_w, patch_h])
+    normalized_pts = new_pts / factor
+    return normalized_pts
+
+def denormalize_2d_bbox(bboxes, pc_range):
+
+    bboxes = bbox_cxcywh_to_xyxy(bboxes)
+    bboxes[..., 0::2] = (bboxes[..., 0::2]*(pc_range[3] -
+                            pc_range[0]) + pc_range[0])
+    bboxes[..., 1::2] = (bboxes[..., 1::2]*(pc_range[4] -
+                            pc_range[1]) + pc_range[1])
+
+    return bboxes
+def denormalize_2d_pts(pts, pc_range):
+    new_pts = pts.clone()
+    new_pts[...,0:1] = (pts[..., 0:1]*(pc_range[3] -
+                            pc_range[0]) + pc_range[0])
+    new_pts[...,1:2] = (pts[...,1:2]*(pc_range[4] -
+                            pc_range[1]) + pc_range[1])
+    return new_pts
+
+@BBOX_ASSIGNERS.register_module()
+class MapTRAssigner(BaseAssigner):
+    """Computes one-to-one matching between predictions and ground truth.
+    This class computes an assignment between the targets and the predictions
+    based on the costs. The costs are weighted sum of three components:
+    classification cost, regression L1 cost and regression iou cost. The
+    targets don't include the no_object, so generally there are more
+    predictions than targets. After the one-to-one matching, the un-matched
+    are treated as backgrounds. Thus each query prediction will be assigned
+    with `0` or a positive integer indicating the ground truth index:
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+    Args:
+        cls_weight (int | float, optional): The scale factor for classification
+            cost. Default 1.0.
+        bbox_weight (int | float, optional): The scale factor for regression
+            L1 cost. Default 1.0.
+        iou_weight (int | float, optional): The scale factor for regression
+            iou cost. Default 1.0.
+        iou_calculator (dict | optional): The config for the iou calculation.
+            Default type `BboxOverlaps2D`.
+        iou_mode (str | optional): "iou" (intersection over union), "iof"
+                (intersection over foreground), or "giou" (generalized
+                intersection over union). Default "giou".
+    """
+
+    def __init__(self,
+                 cls_cost=dict(type='ClassificationCost', weight=1.),
+                 reg_cost=dict(type='BBoxL1Cost', weight=1.0),
+                 iou_cost=dict(type='IoUCost', weight=0.0),
+                 pts_cost=dict(type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0),
+                 pc_range=None):
+        self.cls_cost = build_match_cost(cls_cost)
+        self.reg_cost = build_match_cost(reg_cost)
+        self.iou_cost = build_match_cost(iou_cost)
+        self.pts_cost = build_match_cost(pts_cost)
+        self.pc_range = pc_range
+
+    def assign(self,
+               bbox_pred,
+               cls_pred,
+               pts_pred,
+               gt_bboxes, 
+               gt_labels,
+               gt_pts,
+               gt_bboxes_ignore=None,
+               eps=1e-7):
+        """Computes one-to-one matching based on the weighted costs.
+        This method assign each query prediction to a ground truth or
+        background. The `assigned_gt_inds` with -1 means don't care,
+        0 means negative sample, and positive number is the index (1-based)
+        of assigned gt.
+        The assignment is done in the following steps, the order matters.
+        1. assign every prediction to -1
+        2. compute the weighted costs
+        3. do Hungarian matching on CPU based on the costs
+        4. assign all to 0 (background) first, then for each matched pair
+           between predictions and gts, treat this prediction as foreground
+           and assign the corresponding gt index (plus 1) to it.
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (cx, cy, w, h), which are all in range [0, 1]. Shape
+                [num_query, 4].
+            cls_pred (Tensor): Predicted classification logits, shape
+                [num_query, num_class].
+            gt_bboxes (Tensor): Ground truth boxes with unnormalized
+                coordinates (x1, y1, x2, y2). Shape [num_gt, 4].
+            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`. Default None.
+            eps (int | float, optional): A value added to the denominator for
+                numerical stability. Default 1e-7.
+        Returns:
+            :obj:`AssignResult`: The assigned result.
+        """
+        assert gt_bboxes_ignore is None, \
+            'Only case when gt_bboxes_ignore is None is supported.'
+        assert bbox_pred.shape[-1] == 4, \
+            'Only support bbox pred shape is 4 dims'
+        num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
+
+        # 1. assign -1 by default
+        assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
+                                              -1,
+                                              dtype=torch.long)
+        assigned_labels = bbox_pred.new_full((num_bboxes, ),
+                                             -1,
+                                             dtype=torch.long)
+        if num_gts == 0 or num_bboxes == 0:
+            # No ground truth or boxes, return empty assignment
+            if num_gts == 0:
+                # No ground truth, assign all to background
+                assigned_gt_inds[:] = 0
+            return AssignResult(
+                num_gts, assigned_gt_inds, None, labels=assigned_labels), None
+
+        # 2. compute the weighted costs
+        # classification and bboxcost.
+        cls_cost = self.cls_cost(cls_pred, gt_labels)
+        # regression L1 cost
+        
+        normalized_gt_bboxes = normalize_2d_bbox(gt_bboxes, self.pc_range)
+        # normalized_gt_bboxes = gt_bboxes
+        # import pdb;pdb.set_trace()
+        reg_cost = self.reg_cost(bbox_pred[:, :4], normalized_gt_bboxes[:, :4])
+
+        _, num_orders, num_pts_per_gtline, num_coords = gt_pts.shape
+        normalized_gt_pts = normalize_2d_pts(gt_pts, self.pc_range)
+        num_pts_per_predline = pts_pred.size(1)
+        if num_pts_per_predline != num_pts_per_gtline:
+            pts_pred_interpolated = F.interpolate(pts_pred.permute(0,2,1),size=(num_pts_per_gtline),
+                                            mode='linear', align_corners=True)
+            pts_pred_interpolated = pts_pred_interpolated.permute(0,2,1).contiguous()
+        else:
+            pts_pred_interpolated = pts_pred
+        # num_q, num_pts, 2 <-> num_gt, num_pts, 2
+        pts_cost_ordered = self.pts_cost(pts_pred_interpolated, normalized_gt_pts)
+        pts_cost_ordered = pts_cost_ordered.view(num_bboxes, num_gts, num_orders)
+        pts_cost, order_index = torch.min(pts_cost_ordered, 2)
+        
+        bboxes = denormalize_2d_bbox(bbox_pred, self.pc_range)
+        iou_cost = self.iou_cost(bboxes, gt_bboxes)
+        # weighted sum of above three costs
+        cost = cls_cost + reg_cost + iou_cost + pts_cost
+        
+        # 3. do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+        matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+        matched_row_inds = torch.from_numpy(matched_row_inds).to(
+            bbox_pred.device)
+        matched_col_inds = torch.from_numpy(matched_col_inds).to(
+            bbox_pred.device)
+
+        # 4. assign backgrounds and foregrounds
+        # assign all indices to backgrounds first
+        assigned_gt_inds[:] = 0
+        # assign foregrounds based on matching results
+        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+        assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
+        return AssignResult(
+            num_gts, assigned_gt_inds, None, labels=assigned_labels), order_index
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/dense_heads/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/dense_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..27598e1a959f1443b4f77f8b101a3ca7da49ef2a
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/dense_heads/__init__.py
@@ -0,0 +1 @@
+from .maptr_head import MapTRHead
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/dense_heads/maptr_head.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/dense_heads/maptr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..932ac7f686d39db9380f38300f616bde0685a560
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/dense_heads/maptr_head.py
@@ -0,0 +1,772 @@
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models import HEADS, build_loss
+from mmdet.models.dense_heads import DETRHead
+from mmdet3d.core.bbox.coders import build_bbox_coder
+from mmcv.runner import force_fp32, auto_fp16
+from mmcv.cnn import Linear, bias_init_with_prob, xavier_init, constant_init
+from mmdet.models.utils.transformer import inverse_sigmoid
+from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh, bbox_cxcywh_to_xyxy
+from mmdet.core import (multi_apply, multi_apply, reduce_mean)
+from mmcv.utils import TORCH_VERSION, digit_version
+
+def normalize_2d_bbox(bboxes, pc_range):
+
+    patch_h = pc_range[4]-pc_range[1]
+    patch_w = pc_range[3]-pc_range[0]
+    cxcywh_bboxes = bbox_xyxy_to_cxcywh(bboxes)
+    cxcywh_bboxes[...,0:1] = cxcywh_bboxes[..., 0:1] - pc_range[0]
+    cxcywh_bboxes[...,1:2] = cxcywh_bboxes[...,1:2] - pc_range[1]
+    factor = bboxes.new_tensor([patch_w, patch_h,patch_w,patch_h])
+
+    normalized_bboxes = cxcywh_bboxes / factor
+    return normalized_bboxes
+
+def normalize_2d_pts(pts, pc_range):
+    patch_h = pc_range[4]-pc_range[1]
+    patch_w = pc_range[3]-pc_range[0]
+    new_pts = pts.clone()
+    new_pts[...,0:1] = pts[..., 0:1] - pc_range[0]
+    new_pts[...,1:2] = pts[...,1:2] - pc_range[1]
+    factor = pts.new_tensor([patch_w, patch_h])
+    normalized_pts = new_pts / factor
+    return normalized_pts
+
+def denormalize_2d_bbox(bboxes, pc_range):
+
+    bboxes = bbox_cxcywh_to_xyxy(bboxes)
+    bboxes[..., 0::2] = (bboxes[..., 0::2]*(pc_range[3] -
+                            pc_range[0]) + pc_range[0])
+    bboxes[..., 1::2] = (bboxes[..., 1::2]*(pc_range[4] -
+                            pc_range[1]) + pc_range[1])
+
+    return bboxes
+def denormalize_2d_pts(pts, pc_range):
+    new_pts = pts.clone()
+    new_pts[...,0:1] = (pts[..., 0:1]*(pc_range[3] -
+                            pc_range[0]) + pc_range[0])
+    new_pts[...,1:2] = (pts[...,1:2]*(pc_range[4] -
+                            pc_range[1]) + pc_range[1])
+    return new_pts
+@HEADS.register_module()
+class MapTRHead(DETRHead):
+    """Head of Detr3D.
+    Args:
+        with_box_refine (bool): Whether to refine the reference points
+            in the decoder. Defaults to False.
+        as_two_stage (bool) : Whether to generate the proposal from
+            the outputs of encoder.
+        transformer (obj:`ConfigDict`): ConfigDict is used for building
+            the Encoder and Decoder.
+        bev_h, bev_w (int): spatial shape of BEV queries.
+    """
+
+    def __init__(self,
+                 *args,
+                 with_box_refine=False,
+                 as_two_stage=False,
+                 transformer=None,
+                 bbox_coder=None,
+                 num_cls_fcs=2,
+                 code_weights=None,
+                 bev_h=30,
+                 bev_w=30,
+                 num_vec=20,
+                 num_pts_per_vec=2,
+                 num_pts_per_gt_vec=2,
+                 query_embed_type='all_pts',
+                 transform_method='minmax',
+                 gt_shift_pts_pattern='v0',
+                 dir_interval=1,
+                 loss_pts=dict(type='ChamferDistance', 
+                             loss_src_weight=1.0, 
+                             loss_dst_weight=1.0),
+                 loss_dir=dict(type='PtsDirCosLoss', loss_weight=2.0),
+                 **kwargs):
+
+        self.bev_h = bev_h
+        self.bev_w = bev_w
+        self.fp16_enabled = False
+
+        self.with_box_refine = with_box_refine
+        self.as_two_stage = as_two_stage
+        self.bev_encoder_type = transformer.encoder.type
+        if self.as_two_stage:
+            transformer['as_two_stage'] = self.as_two_stage
+        if 'code_size' in kwargs:
+            self.code_size = kwargs['code_size']
+        else:
+            self.code_size = 10
+        if code_weights is not None:
+            self.code_weights = code_weights
+        else:
+            self.code_weights = [1.0, 1.0, 1.0,
+                                 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
+
+        self.bbox_coder = build_bbox_coder(bbox_coder)
+        self.pc_range = self.bbox_coder.pc_range
+        self.real_w = self.pc_range[3] - self.pc_range[0]
+        self.real_h = self.pc_range[4] - self.pc_range[1]
+        self.num_cls_fcs = num_cls_fcs - 1
+        
+
+        self.query_embed_type = query_embed_type
+        self.transform_method = transform_method
+        self.gt_shift_pts_pattern = gt_shift_pts_pattern
+        num_query = num_vec * num_pts_per_vec
+        self.num_query = num_query
+        self.num_vec = num_vec
+        self.num_pts_per_vec = num_pts_per_vec
+        self.num_pts_per_gt_vec = num_pts_per_gt_vec
+        self.dir_interval = dir_interval
+        
+        
+        super(MapTRHead, self).__init__(
+            *args, transformer=transformer, **kwargs)
+        self.code_weights = nn.Parameter(torch.tensor(
+            self.code_weights, requires_grad=False), requires_grad=False)
+        self.loss_pts = build_loss(loss_pts)
+        self.loss_dir = build_loss(loss_dir)
+        num_query = num_vec * num_pts_per_vec
+        self.num_query = num_query
+        self.num_vec = num_vec
+        self.num_pts_per_vec = num_pts_per_vec
+        self.num_pts_per_gt_vec = num_pts_per_gt_vec
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize classification branch and regression branch of head."""
+        cls_branch = []
+        # cls_branch.append(Linear(self.embed_dims * 2, self.embed_dims))
+        # cls_branch.append(nn.LayerNorm(self.embed_dims))
+        # cls_branch.append(nn.ReLU(inplace=True))
+        for _ in range(self.num_reg_fcs):
+            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
+            cls_branch.append(nn.LayerNorm(self.embed_dims))
+            cls_branch.append(nn.ReLU(inplace=True))
+        cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
+        fc_cls = nn.Sequential(*cls_branch)
+
+        reg_branch = []
+        for _ in range(self.num_reg_fcs):
+            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+            reg_branch.append(nn.ReLU())
+        reg_branch.append(Linear(self.embed_dims, self.code_size))
+        reg_branch = nn.Sequential(*reg_branch)
+
+        def _get_clones(module, N):
+            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+        # last reg_branch is used to generate proposal from
+        # encode feature map when as_two_stage is True.
+        num_pred = (self.transformer.decoder.num_layers + 1) if \
+            self.as_two_stage else self.transformer.decoder.num_layers
+
+        if self.with_box_refine:
+            self.cls_branches = _get_clones(fc_cls, num_pred)
+            self.reg_branches = _get_clones(reg_branch, num_pred)
+        else:
+            self.cls_branches = nn.ModuleList(
+                [fc_cls for _ in range(num_pred)])
+            self.reg_branches = nn.ModuleList(
+                [reg_branch for _ in range(num_pred)])
+
+        if not self.as_two_stage:
+            if self.bev_encoder_type == 'BEVFormerEncoder':
+                self.bev_embedding = nn.Embedding(
+                    self.bev_h * self.bev_w, self.embed_dims)
+            else:
+                self.bev_embedding = None
+            if self.query_embed_type == 'all_pts':
+                self.query_embedding = nn.Embedding(self.num_query,
+                                                    self.embed_dims * 2)
+            elif self.query_embed_type == 'instance_pts':
+                self.query_embedding = None
+                self.instance_embedding = nn.Embedding(self.num_vec, self.embed_dims * 2)
+                self.pts_embedding = nn.Embedding(self.num_pts_per_vec, self.embed_dims * 2)
+
+    def init_weights(self):
+        """Initialize weights of the DeformDETR head."""
+        self.transformer.init_weights()
+        if self.loss_cls.use_sigmoid:
+            bias_init = bias_init_with_prob(0.01)
+            for m in self.cls_branches:
+                nn.init.constant_(m[-1].bias, bias_init)
+        # for m in self.reg_branches:
+        #     constant_init(m[-1], 0, bias=0)
+        # nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], 0.)
+    
+    # @auto_fp16(apply_to=('mlvl_feats'))
+    @force_fp32(apply_to=('mlvl_feats', 'prev_bev'))
+    def forward(self, mlvl_feats, lidar_feat, img_metas, prev_bev=None,  only_bev=False):
+        """Forward function.
+        Args:
+            mlvl_feats (tuple[Tensor]): Features from the upstream
+                network, each is a 5D-tensor with shape
+                (B, N, C, H, W).
+            prev_bev: previous bev featues
+            only_bev: only compute BEV features with encoder. 
+        Returns:
+            all_cls_scores (Tensor): Outputs from the classification head, \
+                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should includes background.
+            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+                head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \
+                Shape [nb_dec, bs, num_query, 9].
+        """
+
+        bs, num_cam, _, _, _ = mlvl_feats[0].shape
+        dtype = mlvl_feats[0].dtype
+        # import pdb;pdb.set_trace()
+        if self.query_embed_type == 'all_pts':
+            object_query_embeds = self.query_embedding.weight.to(dtype)
+        elif self.query_embed_type == 'instance_pts':
+            pts_embeds = self.pts_embedding.weight.unsqueeze(0)
+            instance_embeds = self.instance_embedding.weight.unsqueeze(1)
+            object_query_embeds = (pts_embeds + instance_embeds).flatten(0, 1).to(dtype)
+        if self.bev_embedding is not None:
+            bev_queries = self.bev_embedding.weight.to(dtype)
+
+            bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
+                                device=bev_queries.device).to(dtype)
+            bev_pos = self.positional_encoding(bev_mask).to(dtype)
+        else:
+            bev_queries = None
+            bev_mask = None
+            bev_pos = None
+
+        if only_bev:  # only use encoder to obtain BEV features, TODO: refine the workaround
+            return self.transformer.get_bev_features(
+                mlvl_feats,
+                lidar_feat,
+                bev_queries,
+                self.bev_h,
+                self.bev_w,
+                grid_length=(self.real_h / self.bev_h,
+                             self.real_w / self.bev_w),
+                bev_pos=bev_pos,
+                img_metas=img_metas,
+                prev_bev=prev_bev,
+            )
+        else:
+            outputs = self.transformer(
+                mlvl_feats,
+                lidar_feat,
+                bev_queries,
+                object_query_embeds,
+                self.bev_h,
+                self.bev_w,
+                grid_length=(self.real_h / self.bev_h,
+                             self.real_w / self.bev_w),
+                bev_pos=bev_pos,
+                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
+                cls_branches=self.cls_branches if self.as_two_stage else None,
+                img_metas=img_metas,
+                prev_bev=prev_bev
+        )
+
+        bev_embed, hs, init_reference, inter_references = outputs
+        hs = hs.permute(0, 2, 1, 3)
+        outputs_classes = []
+        outputs_coords = []
+        outputs_pts_coords = []
+        for lvl in range(hs.shape[0]):
+            if lvl == 0:
+                # import pdb;pdb.set_trace()
+                reference = init_reference
+            else:
+                reference = inter_references[lvl - 1]
+            reference = inverse_sigmoid(reference)
+            # import pdb;pdb.set_trace()
+            # vec_embedding = hs[lvl].reshape(bs, self.num_vec, -1)
+            outputs_class = self.cls_branches[lvl](hs[lvl]
+                                            .view(bs,self.num_vec, self.num_pts_per_vec,-1)
+                                            .mean(2))
+            tmp = self.reg_branches[lvl](hs[lvl])
+
+            # TODO: check the shape of reference
+            assert reference.shape[-1] == 2
+            tmp[..., 0:2] += reference[..., 0:2]
+            # tmp[..., 0:2] = tmp[..., 0:2].sigmoid()
+            tmp = tmp.sigmoid() # cx,cy,w,h
+            # import pdb;pdb.set_trace()
+            # tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] -
+            #                  self.pc_range[0]) + self.pc_range[0])
+            # tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] -
+            #                  self.pc_range[1]) + self.pc_range[1])
+            # tmp = tmp.reshape(bs, self.num_vec,-1)
+            # TODO: check if using sigmoid
+            outputs_coord, outputs_pts_coord = self.transform_box(tmp)
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+            outputs_pts_coords.append(outputs_pts_coord)
+
+        outputs_classes = torch.stack(outputs_classes)
+        outputs_coords = torch.stack(outputs_coords)
+        outputs_pts_coords = torch.stack(outputs_pts_coords)
+        outs = {
+            'bev_embed': bev_embed,
+            'all_cls_scores': outputs_classes,
+            'all_bbox_preds': outputs_coords,
+            'all_pts_preds': outputs_pts_coords,
+            'enc_cls_scores': None,
+            'enc_bbox_preds': None,
+            'enc_pts_preds': None
+        }
+
+        return outs
+    def transform_box(self, pts, y_first=False):
+        """
+        Converting the points set into bounding box.
+
+        Args:
+            pts: the input points sets (fields), each points
+                set (fields) is represented as 2n scalar.
+            y_first: if y_fisrt=True, the point set is represented as
+                [y1, x1, y2, x2 ... yn, xn], otherwise the point set is
+                represented as [x1, y1, x2, y2 ... xn, yn].
+        Returns:
+            The bbox [cx, cy, w, h] transformed from points.
+        """
+        pts_reshape = pts.view(pts.shape[0], self.num_vec,
+                                self.num_pts_per_vec,2)
+        pts_y = pts_reshape[:, :, :, 0] if y_first else pts_reshape[:, :, :, 1]
+        pts_x = pts_reshape[:, :, :, 1] if y_first else pts_reshape[:, :, :, 0]
+        if self.transform_method == 'minmax':
+            # import pdb;pdb.set_trace()
+
+            xmin = pts_x.min(dim=2, keepdim=True)[0]
+            xmax = pts_x.max(dim=2, keepdim=True)[0]
+            ymin = pts_y.min(dim=2, keepdim=True)[0]
+            ymax = pts_y.max(dim=2, keepdim=True)[0]
+            bbox = torch.cat([xmin, ymin, xmax, ymax], dim=2)
+            bbox = bbox_xyxy_to_cxcywh(bbox)
+        else:
+            raise NotImplementedError
+        return bbox, pts_reshape
+    def _get_target_single(self,
+                           cls_score,
+                           bbox_pred,
+                           pts_pred,
+                           gt_labels,
+                           gt_bboxes,
+                           gt_shifts_pts,
+                           gt_bboxes_ignore=None):
+        """"Compute regression and classification targets for one image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_score (Tensor): Box score logits from a single decoder layer
+                for one image. Shape [num_query, cls_out_channels].
+            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+                for one image, with normalized coordinate (cx, cy, w, h) and
+                shape [num_query, 4].
+            gt_bboxes (Tensor): Ground truth bboxes for one image with
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (Tensor): Ground truth class indices for one image
+                with shape (num_gts, ).
+            gt_bboxes_ignore (Tensor, optional): Bounding boxes
+                which can be ignored. Default None.
+        Returns:
+            tuple[Tensor]: a tuple containing the following for one image.
+                - labels (Tensor): Labels of each image.
+                - label_weights (Tensor]): Label weights of each image.
+                - bbox_targets (Tensor): BBox targets of each image.
+                - bbox_weights (Tensor): BBox weights of each image.
+                - pos_inds (Tensor): Sampled positive indices for each image.
+                - neg_inds (Tensor): Sampled negative indices for each image.
+        """
+        # import pdb;pdb.set_trace()
+        num_bboxes = bbox_pred.size(0)
+        # assigner and sampler
+        gt_c = gt_bboxes.shape[-1]
+        # import pdb;pdb.set_trace()
+        assign_result, order_index = self.assigner.assign(bbox_pred, cls_score, pts_pred,
+                                             gt_bboxes, gt_labels, gt_shifts_pts,
+                                             gt_bboxes_ignore)
+
+        sampling_result = self.sampler.sample(assign_result, bbox_pred,
+                                              gt_bboxes)
+        # pts_sampling_result = self.sampler.sample(assign_result, pts_pred,
+        #                                       gt_pts)
+
+        
+        # import pdb;pdb.set_trace()
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+
+        # label targets
+        labels = gt_bboxes.new_full((num_bboxes,),
+                                    self.num_classes,
+                                    dtype=torch.long)
+        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+        label_weights = gt_bboxes.new_ones(num_bboxes)
+
+        # bbox targets
+        bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c]
+        bbox_weights = torch.zeros_like(bbox_pred)
+        bbox_weights[pos_inds] = 1.0
+
+        # pts targets
+        # import pdb;pdb.set_trace()
+        # pts_targets = torch.zeros_like(pts_pred)
+        # num_query, num_order, num_points, num_coords
+        if order_index is None:
+            # import pdb;pdb.set_trace()
+            assigned_shift = gt_labels[sampling_result.pos_assigned_gt_inds]
+        else:
+            assigned_shift = order_index[sampling_result.pos_inds, sampling_result.pos_assigned_gt_inds]
+        pts_targets = pts_pred.new_zeros((pts_pred.size(0),
+                        pts_pred.size(1), pts_pred.size(2)))
+        pts_weights = torch.zeros_like(pts_targets)
+        pts_weights[pos_inds] = 1.0
+
+        # DETR
+        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
+        pts_targets[pos_inds] = gt_shifts_pts[sampling_result.pos_assigned_gt_inds,assigned_shift,:,:]
+        return (labels, label_weights, bbox_targets, bbox_weights,
+                pts_targets, pts_weights,
+                pos_inds, neg_inds)
+
+    def get_targets(self,
+                    cls_scores_list,
+                    bbox_preds_list,
+                    pts_preds_list,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_shifts_pts_list,
+                    gt_bboxes_ignore_list=None):
+        """"Compute regression and classification targets for a batch image.
+        Outputs from a single decoder layer of a single feature level are used.
+        Args:
+            cls_scores_list (list[Tensor]): Box score logits from a single
+                decoder layer for each image with shape [num_query,
+                cls_out_channels].
+            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+                decoder layer for each image, with normalized coordinate
+                (cx, cy, w, h) and shape [num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            tuple: a tuple containing the following targets.
+                - labels_list (list[Tensor]): Labels for all images.
+                - label_weights_list (list[Tensor]): Label weights for all \
+                    images.
+                - bbox_targets_list (list[Tensor]): BBox targets for all \
+                    images.
+                - bbox_weights_list (list[Tensor]): BBox weights for all \
+                    images.
+                - num_total_pos (int): Number of positive samples in all \
+                    images.
+                - num_total_neg (int): Number of negative samples in all \
+                    images.
+        """
+        assert gt_bboxes_ignore_list is None, \
+            'Only supports for gt_bboxes_ignore setting to None.'
+        num_imgs = len(cls_scores_list)
+        gt_bboxes_ignore_list = [
+            gt_bboxes_ignore_list for _ in range(num_imgs)
+        ]
+
+        (labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, pts_targets_list, pts_weights_list,
+         pos_inds_list, neg_inds_list) = multi_apply(
+            self._get_target_single, cls_scores_list, bbox_preds_list,pts_preds_list,
+            gt_labels_list, gt_bboxes_list, gt_shifts_pts_list, gt_bboxes_ignore_list)
+        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+        return (labels_list, label_weights_list, bbox_targets_list,
+                bbox_weights_list, pts_targets_list, pts_weights_list,
+                num_total_pos, num_total_neg)
+
+    def loss_single(self,
+                    cls_scores,
+                    bbox_preds,
+                    pts_preds,
+                    gt_bboxes_list,
+                    gt_labels_list,
+                    gt_shifts_pts_list,
+                    gt_bboxes_ignore_list=None):
+        """"Loss function for outputs from a single decoder layer of a single
+        feature level.
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images. Shape [bs, num_query, cls_out_channels].
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape [bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_pts_list (list[Tensor]): Ground truth pts for each image
+                with shape (num_gts, fixed_num, 2) in [x,y] format.
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components for outputs from
+                a single decoder layer.
+        """
+        num_imgs = cls_scores.size(0)
+        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+        pts_preds_list = [pts_preds[i] for i in range(num_imgs)]
+        # import pdb;pdb.set_trace()
+        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,pts_preds_list,
+                                           gt_bboxes_list, gt_labels_list,gt_shifts_pts_list,
+                                           gt_bboxes_ignore_list)
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         pts_targets_list, pts_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        # import pdb;pdb.set_trace()
+        labels = torch.cat(labels_list, 0)
+        label_weights = torch.cat(label_weights_list, 0)
+        bbox_targets = torch.cat(bbox_targets_list, 0)
+        bbox_weights = torch.cat(bbox_weights_list, 0)
+        pts_targets = torch.cat(pts_targets_list, 0)
+        pts_weights = torch.cat(pts_weights_list, 0)
+
+        # classification loss
+        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+        # construct weighted avg_factor to match with the official DETR repo
+        cls_avg_factor = num_total_pos * 1.0 + \
+            num_total_neg * self.bg_cls_weight
+        if self.sync_cls_avg_factor:
+            cls_avg_factor = reduce_mean(
+                cls_scores.new_tensor([cls_avg_factor]))
+
+        cls_avg_factor = max(cls_avg_factor, 1)
+        loss_cls = self.loss_cls(
+            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
+
+        # Compute the average number of gt boxes accross all gpus, for
+        # normalization purposes
+        num_total_pos = loss_cls.new_tensor([num_total_pos])
+        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+        # import pdb;pdb.set_trace()
+        # regression L1 loss
+        bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1))
+        normalized_bbox_targets = normalize_2d_bbox(bbox_targets, self.pc_range)
+        # normalized_bbox_targets = bbox_targets
+        isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1)
+        bbox_weights = bbox_weights * self.code_weights
+
+        loss_bbox = self.loss_bbox(
+            bbox_preds[isnotnan, :4], normalized_bbox_targets[isnotnan,
+                                                               :4], bbox_weights[isnotnan, :4],
+            avg_factor=num_total_pos)
+
+        # regression pts CD loss
+        # pts_preds = pts_preds
+        # import pdb;pdb.set_trace()
+        
+        # num_samples, num_order, num_pts, num_coords
+        normalized_pts_targets = normalize_2d_pts(pts_targets, self.pc_range)
+
+        # num_samples, num_pts, num_coords
+        pts_preds = pts_preds.reshape(-1, pts_preds.size(-2),pts_preds.size(-1))
+        if self.num_pts_per_vec != self.num_pts_per_gt_vec:
+            pts_preds = pts_preds.permute(0,2,1)
+            pts_preds = F.interpolate(pts_preds, size=(self.num_pts_per_gt_vec), mode='linear',
+                                    align_corners=True)
+            pts_preds = pts_preds.permute(0,2,1).contiguous()
+
+        # import pdb;pdb.set_trace()
+        loss_pts = self.loss_pts(
+            pts_preds[isnotnan,:,:], normalized_pts_targets[isnotnan,
+                                                            :,:], 
+            pts_weights[isnotnan,:,:],
+            avg_factor=num_total_pos)
+        dir_weights = pts_weights[:, :-self.dir_interval,0]
+        denormed_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range)
+        denormed_pts_preds_dir = denormed_pts_preds[:,self.dir_interval:,:] - denormed_pts_preds[:,:-self.dir_interval,:]
+        pts_targets_dir = pts_targets[:, self.dir_interval:,:] - pts_targets[:,:-self.dir_interval,:]
+        # dir_weights = pts_weights[:, indice,:-1,0]
+        # import pdb;pdb.set_trace()
+        loss_dir = self.loss_dir(
+            denormed_pts_preds_dir[isnotnan,:,:], pts_targets_dir[isnotnan,
+                                                                          :,:],
+            dir_weights[isnotnan,:],
+            avg_factor=num_total_pos)
+
+        bboxes = denormalize_2d_bbox(bbox_preds, self.pc_range)
+        # regression IoU loss, defaultly GIoU loss
+        loss_iou = self.loss_iou(
+            bboxes[isnotnan, :4], bbox_targets[isnotnan, :4], bbox_weights[isnotnan, :4], 
+            avg_factor=num_total_pos)
+
+        if digit_version(TORCH_VERSION) >= digit_version('1.8'):
+            loss_cls = torch.nan_to_num(loss_cls)
+            loss_bbox = torch.nan_to_num(loss_bbox)
+            loss_iou = torch.nan_to_num(loss_iou)
+            loss_pts = torch.nan_to_num(loss_pts)
+            loss_dir = torch.nan_to_num(loss_dir)
+        return loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def loss(self,
+             gt_bboxes_list,
+             gt_labels_list,
+             preds_dicts,
+             gt_bboxes_ignore=None,
+             img_metas=None):
+        """"Loss function.
+        Args:
+
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            preds_dicts:
+                all_cls_scores (Tensor): Classification score of all
+                    decoder layers, has shape
+                    [nb_dec, bs, num_query, cls_out_channels].
+                all_bbox_preds (Tensor): Sigmoid regression
+                    outputs of all decode layers. Each is a 4D-tensor with
+                    normalized coordinate format (cx, cy, w, h) and shape
+                    [nb_dec, bs, num_query, 4].
+                enc_cls_scores (Tensor): Classification scores of
+                    points on encode feature map , has shape
+                    (N, h*w, num_classes). Only be passed when as_two_stage is
+                    True, otherwise is None.
+                enc_bbox_preds (Tensor): Regression results of each points
+                    on the encode feature map, has shape (N, h*w, 4). Only be
+                    passed when as_two_stage is True, otherwise is None.
+            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+                which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert gt_bboxes_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'for gt_bboxes_ignore setting to None.'
+        gt_vecs_list = copy.deepcopy(gt_bboxes_list)
+        # import pdb;pdb.set_trace()
+        all_cls_scores = preds_dicts['all_cls_scores']
+        all_bbox_preds = preds_dicts['all_bbox_preds']
+        all_pts_preds  = preds_dicts['all_pts_preds']
+        enc_cls_scores = preds_dicts['enc_cls_scores']
+        enc_bbox_preds = preds_dicts['enc_bbox_preds']
+        enc_pts_preds  = preds_dicts['enc_pts_preds']
+
+        num_dec_layers = len(all_cls_scores)
+        device = gt_labels_list[0].device
+
+        # gt_bboxes_list = [torch.cat(
+        #     (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]),
+        #     dim=1).to(device) for gt_bboxes in gt_bboxes_list]
+        # import pdb;pdb.set_trace()
+        # gt_bboxes_list = [
+        #     gt_bboxes.to(device) for gt_bboxes in gt_bboxes_list]
+        gt_bboxes_list = [
+            gt_bboxes.bbox.to(device) for gt_bboxes in gt_vecs_list]
+        # gt_pts_list = [
+        #     gt_bboxes.fixed_num_sampled_points.to(device) for gt_bboxes in gt_vecs_list] #3
+        if self.gt_shift_pts_pattern == 'v0':
+            gt_shifts_pts_list = [
+                gt_bboxes.shift_fixed_num_sampled_points.to(device) for gt_bboxes in gt_vecs_list]
+        elif self.gt_shift_pts_pattern == 'v1':
+            gt_shifts_pts_list = [
+                gt_bboxes.shift_fixed_num_sampled_points_v1.to(device) for gt_bboxes in gt_vecs_list]
+        elif self.gt_shift_pts_pattern == 'v2':
+            gt_shifts_pts_list = [
+                gt_bboxes.shift_fixed_num_sampled_points_v2.to(device) for gt_bboxes in gt_vecs_list]
+        elif self.gt_shift_pts_pattern == 'v3':
+            gt_shifts_pts_list = [
+                gt_bboxes.shift_fixed_num_sampled_points_v3.to(device) for gt_bboxes in gt_vecs_list]
+        elif self.gt_shift_pts_pattern == 'v4':
+            gt_shifts_pts_list = [
+                gt_bboxes.shift_fixed_num_sampled_points_v4.to(device) for gt_bboxes in gt_vecs_list]
+        elif self.gt_shift_pts_pattern == 'v5':
+            gt_shifts_pts_list = [
+                gt_bboxes.shift_fixed_num_sampled_points_v5.to(device) for gt_bboxes in gt_vecs_list] #3
+        else:
+            raise NotImplementedError
+        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+        # all_gt_pts_list = [gt_pts_list for _ in range(num_dec_layers)] #3
+        all_gt_shifts_pts_list = [gt_shifts_pts_list for _ in range(num_dec_layers)]
+        all_gt_bboxes_ignore_list = [
+            gt_bboxes_ignore for _ in range(num_dec_layers)
+        ]
+        # import pdb;pdb.set_trace()
+        losses_cls, losses_bbox, losses_iou, losses_pts, losses_dir = multi_apply(
+            self.loss_single, all_cls_scores, all_bbox_preds,all_pts_preds,
+            all_gt_bboxes_list, all_gt_labels_list,all_gt_shifts_pts_list,
+            all_gt_bboxes_ignore_list)
+
+        loss_dict = dict()
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            gt_pts_list = [
+                gt_bboxes.fixed_num_sampled_points.to(device) for gt_bboxes in gt_vecs_list] #3
+            binary_labels_list = [
+                torch.zeros_like(gt_labels_list[i])
+                for i in range(len(all_gt_labels_list))
+            ]
+            # TODO bug here
+            enc_loss_cls, enc_losses_bbox, enc_losses_iou, enc_losses_pts, enc_losses_dir = \
+                self.loss_single(enc_cls_scores, enc_bbox_preds, enc_pts_preds,
+                                 gt_bboxes_list, binary_labels_list, gt_pts_list,gt_bboxes_ignore)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+            loss_dict['enc_losses_iou'] = enc_losses_iou
+            loss_dict['enc_losses_pts'] = enc_losses_pts
+            loss_dict['enc_losses_dir'] = enc_losses_dir
+
+        # loss from the last decoder layer
+        loss_dict['loss_cls'] = losses_cls[-1]
+        loss_dict['loss_bbox'] = losses_bbox[-1]
+        loss_dict['loss_iou'] = losses_iou[-1]
+        loss_dict['loss_pts'] = losses_pts[-1]
+        loss_dict['loss_dir'] = losses_dir[-1]
+        # loss from other decoder layers
+        num_dec_layer = 0
+        for loss_cls_i, loss_bbox_i, loss_iou_i, loss_pts_i, loss_dir_i in zip(losses_cls[:-1],
+                                           losses_bbox[:-1],
+                                           losses_iou[:-1],
+                                           losses_pts[:-1],
+                                           losses_dir[:-1]):
+            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
+            loss_dict[f'd{num_dec_layer}.loss_pts'] = loss_pts_i
+            loss_dict[f'd{num_dec_layer}.loss_dir'] = loss_dir_i
+            num_dec_layer += 1
+        return loss_dict
+
+    @force_fp32(apply_to=('preds_dicts'))
+    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
+        """Generate bboxes from bbox head predictions.
+        Args:
+            preds_dicts (tuple[list[dict]]): Prediction results.
+            img_metas (list[dict]): Point cloud and image's meta info.
+        Returns:
+            list[dict]: Decoded bbox, scores and labels after nms.
+        """
+        # bboxes: xmin, ymin, xmax, ymax
+        preds_dicts = self.bbox_coder.decode(preds_dicts)
+
+        num_samples = len(preds_dicts)
+        ret_list = []
+        for i in range(num_samples):
+            preds = preds_dicts[i]
+            bboxes = preds['bboxes']
+            # bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+
+            # code_size = bboxes.shape[-1]
+            # bboxes = img_metas[i]['box_type_3d'](bboxes, code_size)
+            scores = preds['scores']
+            labels = preds['labels']
+            pts = preds['pts']
+
+            ret_list.append([bboxes, scores, labels, pts])
+
+        return ret_list
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eade3c210c6bcb2d3bc3da78a684ced2be285b0a
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/__init__.py
@@ -0,0 +1 @@
+from .maptr import MapTR
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/maptr.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/maptr.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c328bb8898f27ded186b044152be22f60271bac
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/detectors/maptr.py
@@ -0,0 +1,445 @@
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmdet.models import DETECTORS
+from mmdet3d.core import bbox3d2result
+from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
+from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask
+from mmcv.runner import force_fp32, auto_fp16
+from mmdet3d.ops import Voxelization, DynamicScatter
+from mmdet3d.models import builder
+@DETECTORS.register_module()
+class MapTR(MVXTwoStageDetector):
+    """MapTR.
+    Args:
+        video_test_mode (bool): Decide whether to use temporal information during inference.
+    """
+
+    def __init__(self,
+                 use_grid_mask=False,
+                 pts_voxel_layer=None,
+                 pts_voxel_encoder=None,
+                 pts_middle_encoder=None,
+                 pts_fusion_layer=None,
+                 img_backbone=None,
+                 pts_backbone=None,
+                 img_neck=None,
+                 pts_neck=None,
+                 pts_bbox_head=None,
+                 img_roi_head=None,
+                 img_rpn_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None,
+                 video_test_mode=False,
+                 modality='vision',
+                 lidar_encoder=None,
+                 ):
+
+        super(MapTR,
+              self).__init__(pts_voxel_layer, pts_voxel_encoder,
+                             pts_middle_encoder, pts_fusion_layer,
+                             img_backbone, pts_backbone, img_neck, pts_neck,
+                             pts_bbox_head, img_roi_head, img_rpn_head,
+                             train_cfg, test_cfg, pretrained)
+        self.grid_mask = GridMask(
+            True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7)
+        self.use_grid_mask = use_grid_mask
+        self.fp16_enabled = False
+
+        # temporal
+        self.video_test_mode = video_test_mode
+        self.prev_frame_info = {
+            'prev_bev': None,
+            'scene_token': None,
+            'prev_pos': 0,
+            'prev_angle': 0,
+        }
+        self.modality = modality
+        if self.modality == 'fusion' and lidar_encoder is not None :
+            if lidar_encoder["voxelize"].get("max_num_points", -1) > 0:
+                voxelize_module = Voxelization(**lidar_encoder["voxelize"])
+            else:
+                voxelize_module = DynamicScatter(**lidar_encoder["voxelize"])
+            self.lidar_modal_extractor = nn.ModuleDict(
+                {
+                    "voxelize": voxelize_module,
+                    "backbone": builder.build_middle_encoder(lidar_encoder["backbone"]),
+                }
+            )
+            self.voxelize_reduce = lidar_encoder.get("voxelize_reduce", True)
+
+
+    def extract_img_feat(self, img, img_metas, len_queue=None):
+        """Extract features of images."""
+        B = img.size(0)
+        if img is not None:
+            
+            # input_shape = img.shape[-2:]
+            # # update real input shape of each single img
+            # for img_meta in img_metas:
+            #     img_meta.update(input_shape=input_shape)
+
+            if img.dim() == 5 and img.size(0) == 1:
+                img.squeeze_()
+            elif img.dim() == 5 and img.size(0) > 1:
+                B, N, C, H, W = img.size()
+                img = img.reshape(B * N, C, H, W)
+            if self.use_grid_mask:
+                img = self.grid_mask(img)
+
+            img_feats = self.img_backbone(img)
+            if isinstance(img_feats, dict):
+                img_feats = list(img_feats.values())
+        else:
+            return None
+        if self.with_img_neck:
+            img_feats = self.img_neck(img_feats)
+
+        img_feats_reshaped = []
+        for img_feat in img_feats:
+            BN, C, H, W = img_feat.size()
+            if len_queue is not None:
+                img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W))
+            else:
+                img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
+        return img_feats_reshaped
+
+    @auto_fp16(apply_to=('img'), out_fp32=True)
+    def extract_feat(self, img, img_metas=None, len_queue=None):
+        """Extract features from images and points."""
+
+        img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue)
+        
+        return img_feats
+
+
+    def forward_pts_train(self,
+                          pts_feats,
+                          lidar_feat,
+                          gt_bboxes_3d,
+                          gt_labels_3d,
+                          img_metas,
+                          gt_bboxes_ignore=None,
+                          prev_bev=None):
+        """Forward function'
+        Args:
+            pts_feats (list[torch.Tensor]): Features of point cloud branch
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
+                boxes for each sample.
+            gt_labels_3d (list[torch.Tensor]): Ground truth labels for
+                boxes of each sampole
+            img_metas (list[dict]): Meta information of samples.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                boxes to be ignored. Defaults to None.
+            prev_bev (torch.Tensor, optional): BEV features of previous frame.
+        Returns:
+            dict: Losses of each branch.
+        """
+
+        outs = self.pts_bbox_head(
+            pts_feats, lidar_feat, img_metas, prev_bev)
+        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
+        losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas)
+        return losses
+
+    def forward_dummy(self, img):
+        dummy_metas = None
+        return self.forward_test(img=img, img_metas=[[dummy_metas]])
+
+    def forward(self, return_loss=True, **kwargs):
+        """Calls either forward_train or forward_test depending on whether
+        return_loss=True.
+        Note this setting will change the expected inputs. When
+        `return_loss=True`, img and img_metas are single-nested (i.e.
+        torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
+        img_metas should be double nested (i.e.  list[torch.Tensor],
+        list[list[dict]]), with the outer list indicating test time
+        augmentations.
+        """
+        if return_loss:
+            return self.forward_train(**kwargs)
+        else:
+            return self.forward_test(**kwargs)
+    
+    def obtain_history_bev(self, imgs_queue, img_metas_list):
+        """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated.
+        """
+        self.eval()
+
+        with torch.no_grad():
+            prev_bev = None
+            bs, len_queue, num_cams, C, H, W = imgs_queue.shape
+            imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W)
+            img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue)
+            for i in range(len_queue):
+                img_metas = [each[i] for each in img_metas_list]
+                if not img_metas[0]['prev_bev_exists']:
+                    prev_bev = None
+                img_feats = [each_scale[:, i] for each_scale in img_feats_list]
+                prev_bev = self.pts_bbox_head(
+                    img_feats, None, img_metas, prev_bev, only_bev=True)
+            self.train()
+            return prev_bev
+
+    @torch.no_grad()
+    @force_fp32()
+    def voxelize(self, points):
+        feats, coords, sizes = [], [], []
+        for k, res in enumerate(points):
+            ret = self.lidar_modal_extractor["voxelize"](res)
+            if len(ret) == 3:
+                # hard voxelize
+                f, c, n = ret
+            else:
+                assert len(ret) == 2
+                f, c = ret
+                n = None
+            feats.append(f)
+            coords.append(F.pad(c, (1, 0), mode="constant", value=k))
+            if n is not None:
+                sizes.append(n)
+
+        feats = torch.cat(feats, dim=0)
+        coords = torch.cat(coords, dim=0)
+        if len(sizes) > 0:
+            sizes = torch.cat(sizes, dim=0)
+            if self.voxelize_reduce:
+                feats = feats.sum(dim=1, keepdim=False) / sizes.type_as(feats).view(
+                    -1, 1
+                )
+                feats = feats.contiguous()
+
+        return feats, coords, sizes
+    @auto_fp16(apply_to=('points'), out_fp32=True)
+    def extract_lidar_feat(self,points):
+        feats, coords, sizes = self.voxelize(points)
+        # voxel_features = self.lidar_modal_extractor["voxel_encoder"](feats, sizes, coords)
+        batch_size = coords[-1, 0] + 1
+        lidar_feat = self.lidar_modal_extractor["backbone"](feats, coords, batch_size, sizes=sizes)
+        
+        return lidar_feat
+
+    # @auto_fp16(apply_to=('img', 'points'))
+    @force_fp32(apply_to=('img','points','prev_bev'))
+    def forward_train(self,
+                      points=None,
+                      img_metas=None,
+                      gt_bboxes_3d=None,
+                      gt_labels_3d=None,
+                      gt_labels=None,
+                      gt_bboxes=None,
+                      img=None,
+                      proposals=None,
+                      gt_bboxes_ignore=None,
+                      img_depth=None,
+                      img_mask=None,
+                      ):
+        """Forward training function.
+        Args:
+            points (list[torch.Tensor], optional): Points of each sample.
+                Defaults to None.
+            img_metas (list[dict], optional): Meta information of each sample.
+                Defaults to None.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
+                Ground truth 3D boxes. Defaults to None.
+            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
+                of 3D boxes. Defaults to None.
+            gt_labels (list[torch.Tensor], optional): Ground truth labels
+                of 2D boxes in images. Defaults to None.
+            gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
+                images. Defaults to None.
+            img (torch.Tensor optional): Images of each sample with shape
+                (N, C, H, W). Defaults to None.
+            proposals ([list[torch.Tensor], optional): Predicted proposals
+                used for training Fast RCNN. Defaults to None.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                2D boxes in images to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of different branches.
+        """
+        lidar_feat = None
+        if self.modality == 'fusion':
+            lidar_feat = self.extract_lidar_feat(points)
+        
+        len_queue = img.size(1)
+        prev_img = img[:, :-1, ...]
+        img = img[:, -1, ...]
+
+        prev_img_metas = copy.deepcopy(img_metas)
+        # prev_bev = self.obtain_history_bev(prev_img, prev_img_metas)
+        # import pdb;pdb.set_trace()
+        prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) if len_queue>1 else None
+        
+        img_metas = [each[len_queue-1] for each in img_metas]
+        if not img_metas[0]['prev_bev_exists']:
+            prev_bev = None
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+        losses = dict()
+        losses_pts = self.forward_pts_train(img_feats, lidar_feat, gt_bboxes_3d,
+                                            gt_labels_3d, img_metas,
+                                            gt_bboxes_ignore, prev_bev)
+
+        losses.update(losses_pts)
+        return losses
+
+    def forward_test(self, img_metas, img=None,points=None,  **kwargs):
+        for var, name in [(img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError('{} must be a list, but got {}'.format(
+                    name, type(var)))
+        img = [img] if img is None else img
+        points = [points] if points is None else points
+        if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']:
+            # the first sample of each scene is truncated
+            self.prev_frame_info['prev_bev'] = None
+        # update idx
+        self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token']
+
+        # do not use temporal information
+        if not self.video_test_mode:
+            self.prev_frame_info['prev_bev'] = None
+
+        # Get the delta of ego position and angle between two timestamps.
+        tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3])
+        tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1])
+        if self.prev_frame_info['prev_bev'] is not None:
+            img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos']
+            img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle']
+        else:
+            img_metas[0][0]['can_bus'][-1] = 0
+            img_metas[0][0]['can_bus'][:3] = 0
+
+        new_prev_bev, bbox_results = self.simple_test(
+            img_metas[0], img[0], points[0], prev_bev=self.prev_frame_info['prev_bev'], **kwargs)
+        # During inference, we save the BEV features and ego motion of each timestamp.
+        self.prev_frame_info['prev_pos'] = tmp_pos
+        self.prev_frame_info['prev_angle'] = tmp_angle
+        self.prev_frame_info['prev_bev'] = new_prev_bev
+        return bbox_results
+
+    def pred2result(self, bboxes, scores, labels, pts, attrs=None):
+        """Convert detection results to a list of numpy arrays.
+
+        Args:
+            bboxes (torch.Tensor): Bounding boxes with shape of (n, 5).
+            labels (torch.Tensor): Labels with shape of (n, ).
+            scores (torch.Tensor): Scores with shape of (n, ).
+            attrs (torch.Tensor, optional): Attributes with shape of (n, ). \
+                Defaults to None.
+
+        Returns:
+            dict[str, torch.Tensor]: Bounding box results in cpu mode.
+
+                - boxes_3d (torch.Tensor): 3D boxes.
+                - scores (torch.Tensor): Prediction scores.
+                - labels_3d (torch.Tensor): Box labels.
+                - attrs_3d (torch.Tensor, optional): Box attributes.
+        """
+        result_dict = dict(
+            boxes_3d=bboxes.to('cpu'),
+            scores_3d=scores.cpu(),
+            labels_3d=labels.cpu(),
+            pts_3d=pts.to('cpu'))
+
+        if attrs is not None:
+            result_dict['attrs_3d'] = attrs.cpu()
+
+        return result_dict
+    def simple_test_pts(self, x, lidar_feat, img_metas, prev_bev=None, rescale=False):
+        """Test function"""
+        outs = self.pts_bbox_head(x, lidar_feat, img_metas, prev_bev=prev_bev)
+
+        bbox_list = self.pts_bbox_head.get_bboxes(
+            outs, img_metas, rescale=rescale)
+        
+        bbox_results = [
+            self.pred2result(bboxes, scores, labels, pts)
+            for bboxes, scores, labels, pts in bbox_list
+        ]
+        # import pdb;pdb.set_trace()
+        return outs['bev_embed'], bbox_results
+    def simple_test(self, img_metas, img=None, points=None, prev_bev=None, rescale=False, **kwargs):
+        """Test function without augmentaiton."""
+        lidar_feat = None
+        if self.modality =='fusion':
+            lidar_feat = self.extract_lidar_feat(points)
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+
+        bbox_list = [dict() for i in range(len(img_metas))]
+        new_prev_bev, bbox_pts = self.simple_test_pts(
+            img_feats, lidar_feat, img_metas, prev_bev, rescale=rescale)
+        for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+            result_dict['pts_bbox'] = pts_bbox
+        return new_prev_bev, bbox_list
+
+
+@DETECTORS.register_module()
+class MapTR_fp16(MapTR):
+    """
+    The default version BEVFormer currently can not support FP16. 
+    We provide this version to resolve this issue.
+    """
+    # @auto_fp16(apply_to=('img', 'prev_bev', 'points'))
+    @force_fp32(apply_to=('img','points','prev_bev'))
+    def forward_train(self,
+                      points=None,
+                      img_metas=None,
+                      gt_bboxes_3d=None,
+                      gt_labels_3d=None,
+                      gt_labels=None,
+                      gt_bboxes=None,
+                      img=None,
+                      proposals=None,
+                      gt_bboxes_ignore=None,
+                      img_depth=None,
+                      img_mask=None,
+                      prev_bev=None,
+                      ):
+        """Forward training function.
+        Args:
+            points (list[torch.Tensor], optional): Points of each sample.
+                Defaults to None.
+            img_metas (list[dict], optional): Meta information of each sample.
+                Defaults to None.
+            gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
+                Ground truth 3D boxes. Defaults to None.
+            gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
+                of 3D boxes. Defaults to None.
+            gt_labels (list[torch.Tensor], optional): Ground truth labels
+                of 2D boxes in images. Defaults to None.
+            gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
+                images. Defaults to None.
+            img (torch.Tensor optional): Images of each sample with shape
+                (N, C, H, W). Defaults to None.
+            proposals ([list[torch.Tensor], optional): Predicted proposals
+                used for training Fast RCNN. Defaults to None.
+            gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
+                2D boxes in images to be ignored. Defaults to None.
+        Returns:
+            dict: Losses of different branches.
+        """
+        
+        img_feats = self.extract_feat(img=img, img_metas=img_metas)
+        # import pdb;pdb.set_trace()
+        losses = dict()
+        losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d,
+                                            gt_labels_3d, img_metas,
+                                            gt_bboxes_ignore, prev_bev=prev_bev)
+        losses.update(losses_pts)
+        return losses
+
+
+    def val_step(self, data, optimizer):
+        """
+        In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`.
+        This is not the standard function of `val_step`.
+        """
+
+        img = data['img']
+        img_metas = data['img_metas']
+        img_feats = self.extract_feat(img=img,  img_metas=img_metas)
+        prev_bev = data.get('prev_bev', None)
+        prev_bev = self.pts_bbox_head(img_feats, img_metas, prev_bev=prev_bev, only_bev=True)
+        return prev_bev
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/losses/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4b4dba2aaf4436ae886c65df2ee8743ad66628b
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/losses/__init__.py
@@ -0,0 +1,6 @@
+from .map_loss import MyChamferDistance
+from .map_loss import MyChamferDistanceCost
+from .map_loss import OrderedPtsL1Cost, PtsL1Cost
+from .map_loss import OrderedPtsL1Loss, PtsL1Loss
+from .map_loss import OrderedPtsSmoothL1Cost, OrderedPtsL1Loss
+from .map_loss import PtsDirCosLoss
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/losses/map_loss.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/losses/map_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed18fc58734de15957235443621d26c7d7785dcd
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/losses/map_loss.py
@@ -0,0 +1,718 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from torch import nn as nn
+from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss
+
+from mmdet.models.builder import LOSSES
+from mmdet.models import weighted_loss
+import mmcv
+import torch.nn.functional as F
+from mmdet.core.bbox.match_costs.builder import MATCH_COST
+import functools
+
+
+def reduce_loss(loss, reduction):
+    """Reduce loss as specified.
+
+    Args:
+        loss (Tensor): Elementwise loss tensor.
+        reduction (str): Options are "none", "mean" and "sum".
+
+    Return:
+        Tensor: Reduced loss tensor.
+    """
+    reduction_enum = F._Reduction.get_enum(reduction)
+    # none: 0, elementwise_mean:1, sum: 2
+    if reduction_enum == 0:
+        return loss
+    elif reduction_enum == 1:
+        return loss.mean()
+    elif reduction_enum == 2:
+        return loss.sum()
+
+@mmcv.jit(derivate=True, coderize=True)
+def custom_weight_dir_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
+    """Apply element-wise weight and reduce loss.
+
+    Args:
+        loss (Tensor): num_sample, num_dir
+        weight (Tensor): Element-wise weights.
+        reduction (str): Same as built-in losses of PyTorch.
+        avg_factor (float): Average factor when computing the mean of losses.
+
+    Returns:
+        Tensor: Processed loss values.
+    """
+    # if weight is specified, apply element-wise weight
+    if weight is not None:
+        loss = loss * weight
+
+    # if avg_factor is not specified, just reduce the loss
+    if avg_factor is None:
+        raise ValueError('avg_factor should not be none for OrderedPtsL1Loss')
+        # loss = reduce_loss(loss, reduction)
+    else:
+        # if reduction is mean, then average the loss by avg_factor
+        if reduction == 'mean':
+            # import pdb;pdb.set_trace()
+            # loss = loss.permute(1,0,2,3).contiguous()
+            loss = loss.sum()
+            loss = loss / avg_factor
+        # if reduction is 'none', then do nothing, otherwise raise an error
+        elif reduction != 'none':
+            raise ValueError('avg_factor can not be used with reduction="sum"')
+    return loss
+
+@mmcv.jit(derivate=True, coderize=True)
+def custom_weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
+    """Apply element-wise weight and reduce loss.
+
+    Args:
+        loss (Tensor): num_sample, num_order, num_pts, num_coords
+        weight (Tensor): Element-wise weights.
+        reduction (str): Same as built-in losses of PyTorch.
+        avg_factor (float): Average factor when computing the mean of losses.
+
+    Returns:
+        Tensor: Processed loss values.
+    """
+    # if weight is specified, apply element-wise weight
+    if weight is not None:
+        loss = loss * weight
+
+    # if avg_factor is not specified, just reduce the loss
+    if avg_factor is None:
+        raise ValueError('avg_factor should not be none for OrderedPtsL1Loss')
+        # loss = reduce_loss(loss, reduction)
+    else:
+        # if reduction is mean, then average the loss by avg_factor
+        if reduction == 'mean':
+            # import pdb;pdb.set_trace()
+            loss = loss.permute(1,0,2,3).contiguous()
+            loss = loss.sum((1,2,3))
+            loss = loss / avg_factor
+        # if reduction is 'none', then do nothing, otherwise raise an error
+        elif reduction != 'none':
+            raise ValueError('avg_factor can not be used with reduction="sum"')
+    return loss
+
+def custom_weighted_loss(loss_func):
+    """Create a weighted version of a given loss function.
+
+    To use this decorator, the loss function must have the signature like
+    `loss_func(pred, target, **kwargs)`. The function only needs to compute
+    element-wise loss without any reduction. This decorator will add weight
+    and reduction arguments to the function. The decorated function will have
+    the signature like `loss_func(pred, target, weight=None, reduction='mean',
+    avg_factor=None, **kwargs)`.
+
+    :Example:
+
+    >>> import torch
+    >>> @weighted_loss
+    >>> def l1_loss(pred, target):
+    >>>     return (pred - target).abs()
+
+    >>> pred = torch.Tensor([0, 2, 3])
+    >>> target = torch.Tensor([1, 1, 1])
+    >>> weight = torch.Tensor([1, 0, 1])
+
+    >>> l1_loss(pred, target)
+    tensor(1.3333)
+    >>> l1_loss(pred, target, weight)
+    tensor(1.)
+    >>> l1_loss(pred, target, reduction='none')
+    tensor([1., 1., 2.])
+    >>> l1_loss(pred, target, weight, avg_factor=2)
+    tensor(1.5000)
+    """
+
+    @functools.wraps(loss_func)
+    def wrapper(pred,
+                target,
+                weight=None,
+                reduction='mean',
+                avg_factor=None,
+                **kwargs):
+        # get element-wise loss
+        loss = loss_func(pred, target, **kwargs)
+        loss = custom_weight_reduce_loss(loss, weight, reduction, avg_factor)
+        return loss
+
+    return wrapper
+
+
+def custom_weighted_dir_loss(loss_func):
+    """Create a weighted version of a given loss function.
+
+    To use this decorator, the loss function must have the signature like
+    `loss_func(pred, target, **kwargs)`. The function only needs to compute
+    element-wise loss without any reduction. This decorator will add weight
+    and reduction arguments to the function. The decorated function will have
+    the signature like `loss_func(pred, target, weight=None, reduction='mean',
+    avg_factor=None, **kwargs)`.
+
+    :Example:
+
+    >>> import torch
+    >>> @weighted_loss
+    >>> def l1_loss(pred, target):
+    >>>     return (pred - target).abs()
+
+    >>> pred = torch.Tensor([0, 2, 3])
+    >>> target = torch.Tensor([1, 1, 1])
+    >>> weight = torch.Tensor([1, 0, 1])
+
+    >>> l1_loss(pred, target)
+    tensor(1.3333)
+    >>> l1_loss(pred, target, weight)
+    tensor(1.)
+    >>> l1_loss(pred, target, reduction='none')
+    tensor([1., 1., 2.])
+    >>> l1_loss(pred, target, weight, avg_factor=2)
+    tensor(1.5000)
+    """
+
+    @functools.wraps(loss_func)
+    def wrapper(pred,
+                target,
+                weight=None,
+                reduction='mean',
+                avg_factor=None,
+                **kwargs):
+        # get element-wise loss
+        loss = loss_func(pred, target, **kwargs)
+        loss = custom_weight_dir_reduce_loss(loss, weight, reduction, avg_factor)
+        return loss
+
+    return wrapper
+
+@mmcv.jit(derivate=True, coderize=True)
+@custom_weighted_loss
+def ordered_pts_smooth_l1_loss(pred, target):
+    """L1 loss.
+
+    Args:
+        pred (torch.Tensor): shape [num_samples, num_pts, num_coords]
+        target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords]
+
+    Returns:
+        torch.Tensor: Calculated loss
+    """
+    if target.numel() == 0:
+        return pred.sum() * 0
+    pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1)
+    assert pred.size() == target.size()
+    loss =smooth_l1_loss(pred,target, reduction='none')
+    # import pdb;pdb.set_trace()
+    return loss
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def pts_l1_loss(pred, target):
+    """L1 loss.
+
+    Args:
+        pred (torch.Tensor): shape [num_samples, num_pts, num_coords]
+        target (torch.Tensor): shape [num_samples, num_pts, num_coords]
+
+    Returns:
+        torch.Tensor: Calculated loss
+    """
+    if target.numel() == 0:
+        return pred.sum() * 0
+    assert pred.size() == target.size()
+    loss = torch.abs(pred - target)
+    return loss
+
+@mmcv.jit(derivate=True, coderize=True)
+@custom_weighted_loss
+def ordered_pts_l1_loss(pred, target):
+    """L1 loss.
+
+    Args:
+        pred (torch.Tensor): shape [num_samples, num_pts, num_coords]
+        target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords]
+
+    Returns:
+        torch.Tensor: Calculated loss
+    """
+    if target.numel() == 0:
+        return pred.sum() * 0
+    pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1)
+    assert pred.size() == target.size()
+    loss = torch.abs(pred - target)
+    return loss
+
+@mmcv.jit(derivate=True, coderize=True)
+@custom_weighted_dir_loss
+def pts_dir_cos_loss(pred, target):
+    """ Dir cosine similiarity loss
+    pred (torch.Tensor): shape [num_samples, num_dir, num_coords]
+    target (torch.Tensor): shape [num_samples, num_dir, num_coords]
+
+    """
+    if target.numel() == 0:
+        return pred.sum() * 0
+    # import pdb;pdb.set_trace()
+    num_samples, num_dir, num_coords = pred.shape
+    loss_func = torch.nn.CosineEmbeddingLoss(reduction='none')
+    tgt_param = target.new_ones((num_samples, num_dir))
+    tgt_param = tgt_param.flatten(0)
+    loss = loss_func(pred.flatten(0,1), target.flatten(0,1), tgt_param)
+    loss = loss.view(num_samples, num_dir)
+    return loss
+
+@LOSSES.register_module()
+class OrderedPtsSmoothL1Loss(nn.Module):
+    """L1 loss.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(OrderedPtsSmoothL1Loss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        # import pdb;pdb.set_trace()
+        loss_bbox = self.loss_weight * ordered_pts_smooth_l1_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+
+
+@LOSSES.register_module()
+class PtsDirCosLoss(nn.Module):
+    """L1 loss.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(PtsDirCosLoss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        # import pdb;pdb.set_trace()
+        loss_dir = self.loss_weight * pts_dir_cos_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_dir
+
+
+
+@LOSSES.register_module()
+class PtsL1Loss(nn.Module):
+    """L1 loss.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(PtsL1Loss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        # import pdb;pdb.set_trace()
+        loss_bbox = self.loss_weight * pts_l1_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+
+@LOSSES.register_module()
+class OrderedPtsL1Loss(nn.Module):
+    """L1 loss.
+
+    Args:
+        reduction (str, optional): The method to reduce the loss.
+            Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of loss.
+    """
+
+    def __init__(self, reduction='mean', loss_weight=1.0):
+        super(OrderedPtsL1Loss, self).__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction.
+            target (torch.Tensor): The learning target of the prediction.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction. Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Defaults to None.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        # import pdb;pdb.set_trace()
+        loss_bbox = self.loss_weight * ordered_pts_l1_loss(
+            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+        return loss_bbox
+
+
+
+
+@MATCH_COST.register_module()
+class OrderedPtsSmoothL1Cost(object):
+    """OrderedPtsL1Cost.
+     Args:
+         weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred, gt_bboxes):
+        """
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (x, y), which are all in range [0, 1]. Shape
+                [num_query, num_pts, 2].
+            gt_bboxes (Tensor): Ground truth boxes with normalized
+                coordinates (x,y). 
+                Shape [num_gt, num_ordered, num_pts, 2].
+        Returns:
+            torch.Tensor: bbox_cost value with weight
+        """
+        num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape
+        # import pdb;pdb.set_trace()
+        bbox_pred = bbox_pred.view(bbox_pred.size(0),-1).unsqueeze(1).repeat(1,num_gts*num_orders,1)
+        gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1).unsqueeze(0).repeat(bbox_pred.size(0),1,1)
+        # import pdb;pdb.set_trace()
+        bbox_cost = smooth_l1_loss(bbox_pred, gt_bboxes, reduction='none').sum(-1)
+        # bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+        return bbox_cost * self.weight
+
+@MATCH_COST.register_module()
+class PtsL1Cost(object):
+    """OrderedPtsL1Cost.
+     Args:
+         weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred, gt_bboxes):
+        """
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (x, y), which are all in range [0, 1]. Shape
+                [num_query, num_pts, 2].
+            gt_bboxes (Tensor): Ground truth boxes with normalized
+                coordinates (x,y). 
+                Shape [num_gt, num_ordered, num_pts, 2].
+        Returns:
+            torch.Tensor: bbox_cost value with weight
+        """
+        num_gts, num_pts, num_coords = gt_bboxes.shape
+        # import pdb;pdb.set_trace()
+        bbox_pred = bbox_pred.view(bbox_pred.size(0),-1)
+        gt_bboxes = gt_bboxes.view(num_gts,-1)
+        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+        return bbox_cost * self.weight
+
+@MATCH_COST.register_module()
+class OrderedPtsL1Cost(object):
+    """OrderedPtsL1Cost.
+     Args:
+         weight (int | float, optional): loss_weight
+    """
+
+    def __init__(self, weight=1.):
+        self.weight = weight
+
+    def __call__(self, bbox_pred, gt_bboxes):
+        """
+        Args:
+            bbox_pred (Tensor): Predicted boxes with normalized coordinates
+                (x, y), which are all in range [0, 1]. Shape
+                [num_query, num_pts, 2].
+            gt_bboxes (Tensor): Ground truth boxes with normalized
+                coordinates (x,y). 
+                Shape [num_gt, num_ordered, num_pts, 2].
+        Returns:
+            torch.Tensor: bbox_cost value with weight
+        """
+        num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape
+        # import pdb;pdb.set_trace()
+        bbox_pred = bbox_pred.view(bbox_pred.size(0),-1)
+        gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1)
+        bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+        return bbox_cost * self.weight
+
+@MATCH_COST.register_module()
+class MyChamferDistanceCost:
+    def __init__(self, loss_src_weight=1., loss_dst_weight=1.):
+        # assert mode in ['smooth_l1', 'l1', 'l2']
+        # self.mode = mode
+        self.loss_src_weight = loss_src_weight
+        self.loss_dst_weight = loss_dst_weight
+
+    def __call__(self, src, dst,src_weight=1.0,dst_weight=1.0,):
+        """
+        pred_pts (Tensor): normed coordinate(x,y), shape (num_q, num_pts_M, 2)
+        gt_pts (Tensor): normed coordinate(x,y), shape (num_gt, num_pts_N, 2)
+        """
+        # criterion_mode = self.mode
+        # if criterion_mode == 'smooth_l1':
+        #     criterion = smooth_l1_loss
+        # elif criterion_mode == 'l1':
+        #     criterion = l1_loss
+        # elif criterion_mode == 'l2':
+        #     criterion = mse_loss
+        # else:
+        #     raise NotImplementedError
+        # import pdb;pdb.set_trace()
+        src_expand = src.unsqueeze(1).repeat(1,dst.shape[0],1,1)
+        dst_expand = dst.unsqueeze(0).repeat(src.shape[0],1,1,1)
+        # src_expand = src.unsqueeze(2).unsqueeze(1).repeat(1,dst.shape[0], 1, dst.shape[1], 1)
+        # dst_expand = dst.unsqueeze(1).unsqueeze(0).repeat(src.shape[0],1, src.shape[1], 1, 1)
+        distance = torch.cdist(src_expand, dst_expand)
+        src2dst_distance = torch.min(distance, dim=3)[0]  # (num_q, num_gt, num_pts_N)
+        dst2src_distance = torch.min(distance, dim=2)[0]  # (num_q, num_gt, num_pts_M)
+        loss_src = (src2dst_distance * src_weight).mean(-1)
+        loss_dst = (dst2src_distance * dst_weight).mean(-1)
+        loss = loss_src*self.loss_src_weight + loss_dst * self.loss_dst_weight
+        return loss
+
+@mmcv.jit(derivate=True, coderize=True)
+def chamfer_distance(src,
+                     dst,
+                     src_weight=1.0,
+                     dst_weight=1.0,
+                    #  criterion_mode='l1',
+                     reduction='mean',
+                     avg_factor=None):
+    """Calculate Chamfer Distance of two sets.
+
+    Args:
+        src (torch.Tensor): Source set with shape [B, N, C] to
+            calculate Chamfer Distance.
+        dst (torch.Tensor): Destination set with shape [B, M, C] to
+            calculate Chamfer Distance.
+        src_weight (torch.Tensor or float): Weight of source loss.
+        dst_weight (torch.Tensor or float): Weight of destination loss.
+        criterion_mode (str): Criterion mode to calculate distance.
+            The valid modes are smooth_l1, l1 or l2.
+        reduction (str): Method to reduce losses.
+            The valid reduction method are 'none', 'sum' or 'mean'.
+
+    Returns:
+        tuple: Source and Destination loss with the corresponding indices.
+
+            - loss_src (torch.Tensor): The min distance \
+                from source to destination.
+            - loss_dst (torch.Tensor): The min distance \
+                from destination to source.
+            - indices1 (torch.Tensor): Index the min distance point \
+                for each point in source to destination.
+            - indices2 (torch.Tensor): Index the min distance point \
+                for each point in destination to source.
+    """
+
+    # if criterion_mode == 'smooth_l1':
+    #     criterion = smooth_l1_loss
+    # elif criterion_mode == 'l1':
+    #     criterion = l1_loss
+    # elif criterion_mode == 'l2':
+    #     criterion = mse_loss
+    # else:
+    #     raise NotImplementedError
+
+    # src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1)
+    # dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1)
+    # import pdb;pdb.set_trace()
+    distance = torch.cdist(src, dst)
+    src2dst_distance, indices1 = torch.min(distance, dim=2)  # (B,N)
+    dst2src_distance, indices2 = torch.min(distance, dim=1)  # (B,M)
+    # import pdb;pdb.set_trace()
+    #TODO this may be wrong for misaligned src_weight, now[N,fixed_num]
+    # should be [N], then view
+    loss_src = (src2dst_distance * src_weight)
+    loss_dst = (dst2src_distance * dst_weight)
+    if avg_factor is None:
+        reduction_enum = F._Reduction.get_enum(reduction)
+        if reduction_enum == 0:
+            raise ValueError('MyCDLoss can not be used with reduction=`none`')
+        elif reduction_enum == 1:
+            loss_src = loss_src.mean(-1).mean()
+            loss_dst = loss_dst.mean(-1).mean()
+        elif reduction_enum == 2:
+            loss_src = loss_src.mean(-1).sum()
+            loss_dst = loss_dst.mean(-1).sum()
+        else:
+            raise NotImplementedError
+    else:
+        if reduction == 'mean':
+            eps = torch.finfo(torch.float32).eps
+            loss_src = loss_src.mean(-1).sum() / (avg_factor + eps)
+            loss_dst = loss_dst.mean(-1).sum() / (avg_factor + eps)
+        elif reduction != 'none':
+            raise ValueError('avg_factor can not be used with reduction="sum"')
+
+    return loss_src, loss_dst, indices1, indices2
+
+
+@LOSSES.register_module()
+class MyChamferDistance(nn.Module):
+    """Calculate Chamfer Distance of two sets.
+
+    Args:
+        mode (str): Criterion mode to calculate distance.
+            The valid modes are smooth_l1, l1 or l2.
+        reduction (str): Method to reduce losses.
+            The valid reduction method are none, sum or mean.
+        loss_src_weight (float): Weight of loss_source.
+        loss_dst_weight (float): Weight of loss_target.
+    """
+
+    def __init__(self,
+                #  mode='l1',
+                 reduction='mean',
+                 loss_src_weight=1.0,
+                 loss_dst_weight=1.0):
+        super(MyChamferDistance, self).__init__()
+
+        # assert mode in ['smooth_l1', 'l1', 'l2']
+        assert reduction in ['none', 'sum', 'mean']
+        # self.mode = mode
+        self.reduction = reduction
+        self.loss_src_weight = loss_src_weight
+        self.loss_dst_weight = loss_dst_weight
+
+    def forward(self,
+                source,
+                target,
+                src_weight=1.0,
+                dst_weight=1.0,
+                avg_factor=None,
+                reduction_override=None,
+                return_indices=False,
+                **kwargs):
+        """Forward function of loss calculation.
+
+        Args:
+            source (torch.Tensor): Source set with shape [B, N, C] to
+                calculate Chamfer Distance.
+            target (torch.Tensor): Destination set with shape [B, M, C] to
+                calculate Chamfer Distance.
+            src_weight (torch.Tensor | float, optional):
+                Weight of source loss. Defaults to 1.0.
+            dst_weight (torch.Tensor | float, optional):
+                Weight of destination loss. Defaults to 1.0.
+            reduction_override (str, optional): Method to reduce losses.
+                The valid reduction method are 'none', 'sum' or 'mean'.
+                Defaults to None.
+            return_indices (bool, optional): Whether to return indices.
+                Defaults to False.
+
+        Returns:
+            tuple[torch.Tensor]: If ``return_indices=True``, return losses of \
+                source and target with their corresponding indices in the \
+                order of ``(loss_source, loss_target, indices1, indices2)``. \
+                If ``return_indices=False``, return \
+                ``(loss_source, loss_target)``.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+
+        loss_source, loss_target, indices1, indices2 = chamfer_distance(
+            source, target, src_weight, dst_weight, reduction,
+            avg_factor=avg_factor)
+
+        loss_source *= self.loss_src_weight
+        loss_target *= self.loss_dst_weight
+
+        loss_pts = loss_source + loss_target
+
+        if return_indices:
+            return loss_pts, indices1, indices2
+        else:
+            return loss_pts
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7a4c1d2f64673308e960a336c4fb128c09477aa
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/__init__.py
@@ -0,0 +1,5 @@
+from .transformer import MapTRPerceptionTransformer
+from .decoder import MapTRDecoder
+# from .geometry_kernel_attention import GeometrySptialCrossAttention, GeometryKernelAttention #3
+from .builder import build_fuser
+# from .encoder import LSSTransform #3
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/builder.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f9b41592053e1e1bde5be289bf0584d45f7db6
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/builder.py
@@ -0,0 +1,5 @@
+import torch.nn as nn
+from mmcv.utils import Registry, build_from_cfg
+FUSERS = Registry("fusers")
+def build_fuser(cfg):
+    return FUSERS.build(cfg)
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/decoder.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d23e470a7ec6dc5fdf6f2b951c7caaa4c9da650d
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/decoder.py
@@ -0,0 +1,84 @@
+import torch
+from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE
+from mmcv.cnn.bricks.transformer import TransformerLayerSequence
+from mmdet.models.utils.transformer import inverse_sigmoid
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class MapTRDecoder(TransformerLayerSequence):
+    """Implements the decoder in DETR3D transformer.
+    Args:
+        return_intermediate (bool): Whether to return intermediate outputs.
+        coder_norm_cfg (dict): Config of last normalization layer. Default：
+            `LN`.
+    """
+
+    def __init__(self, *args, return_intermediate=False, **kwargs):
+        super(MapTRDecoder, self).__init__(*args, **kwargs)
+        self.return_intermediate = return_intermediate
+        self.fp16_enabled = False
+
+    def forward(self,
+                query,
+                *args,
+                reference_points=None,
+                reg_branches=None,
+                key_padding_mask=None,
+                **kwargs):
+        """Forward function for `Detr3DTransformerDecoder`.
+        Args:
+            query (Tensor): Input query with shape
+                `(num_query, bs, embed_dims)`.
+            reference_points (Tensor): The reference
+                points of offset. has shape
+                (bs, num_query, 4) when as_two_stage,
+                otherwise has shape ((bs, num_query, 2).
+            reg_branch: (obj:`nn.ModuleList`): Used for
+                refining the regression results. Only would
+                be passed when with_box_refine is True,
+                otherwise would be passed a `None`.
+        Returns:
+            Tensor: Results with shape [1, num_query, bs, embed_dims] when
+                return_intermediate is `False`, otherwise it has shape
+                [num_layers, num_query, bs, embed_dims].
+        """
+        output = query
+        intermediate = []
+        intermediate_reference_points = []
+        for lid, layer in enumerate(self.layers):
+
+            reference_points_input = reference_points[..., :2].unsqueeze(
+                2)  # BS NUM_QUERY NUM_LEVEL 2
+            output = layer(
+                output,
+                *args,
+                reference_points=reference_points_input,
+                key_padding_mask=key_padding_mask,
+                **kwargs)
+            output = output.permute(1, 0, 2)
+
+            if reg_branches is not None:
+                tmp = reg_branches[lid](output)
+
+                assert reference_points.shape[-1] == 2
+
+                new_reference_points = torch.zeros_like(reference_points)
+                new_reference_points[..., :2] = tmp[
+                    ..., :2] + inverse_sigmoid(reference_points[..., :2])
+                # new_reference_points[..., 2:3] = tmp[
+                #     ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3])
+
+                new_reference_points = new_reference_points.sigmoid()
+
+                reference_points = new_reference_points.detach()
+
+            output = output.permute(1, 0, 2)
+            if self.return_intermediate:
+                intermediate.append(output)
+                intermediate_reference_points.append(reference_points)
+
+        if self.return_intermediate:
+            return torch.stack(intermediate), torch.stack(
+                intermediate_reference_points)
+
+        return output, reference_points
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/encoder.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..37dca1da112d72dc7a6c13eb00211d64e2c1e9da
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/encoder.py
@@ -0,0 +1,355 @@
+import torch
+import numpy as np
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+import torch.nn as nn
+from mmcv.cnn.bricks.registry import (ATTENTION,
+                                      TRANSFORMER_LAYER,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+from mmdet3d.ops import bev_pool
+from mmcv.runner import force_fp32, auto_fp16
+
+def gen_dx_bx(xbound, ybound, zbound):
+    dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]])
+    bx = torch.Tensor([row[0] + row[2] / 2.0 for row in [xbound, ybound, zbound]])
+    nx = torch.Tensor(
+        [int((row[1] - row[0]) / row[2]) for row in [xbound, ybound, zbound]]
+    )
+    return dx, bx, nx
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class BaseTransform(BaseModule):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        feat_down_sample,
+        pc_range,
+        voxel_size,
+        dbound,
+    ):
+        super(BaseTransform, self).__init__()
+        self.in_channels = in_channels
+        self.feat_down_sample = feat_down_sample
+        # self.image_size = image_size
+        # self.feature_size = feature_size
+        self.xbound = [pc_range[0],pc_range[3], voxel_size[0]]
+        self.ybound = [pc_range[1],pc_range[4], voxel_size[1]]
+        self.zbound = [pc_range[2],pc_range[5], voxel_size[2]]
+        self.dbound = dbound
+
+        dx, bx, nx = gen_dx_bx(self.xbound, self.ybound, self.zbound)
+        self.dx = nn.Parameter(dx, requires_grad=False)
+        self.bx = nn.Parameter(bx, requires_grad=False)
+        self.nx = nn.Parameter(nx, requires_grad=False)
+
+        self.C = out_channels
+        self.frustum = None
+        self.D = int((dbound[1] - dbound[0]) / dbound[2])
+        # self.frustum = self.create_frustum()
+        # self.D = self.frustum.shape[0]
+        self.fp16_enabled = False
+
+    @force_fp32()
+    def create_frustum(self,fH,fW,img_metas):
+        # iH, iW = self.image_size
+        # fH, fW = self.feature_size
+        iH = img_metas[0]['img_shape'][0][0]
+        iW = img_metas[0]['img_shape'][0][1]
+        assert iH // self.feat_down_sample == fH
+        # import pdb;pdb.set_trace()
+        ds = (
+            torch.arange(*self.dbound, dtype=torch.float)
+            .view(-1, 1, 1)
+            .expand(-1, fH, fW)
+        )
+        D, _, _ = ds.shape
+
+        xs = (
+            torch.linspace(0, iW - 1, fW, dtype=torch.float)
+            .view(1, 1, fW)
+            .expand(D, fH, fW)
+        )
+        ys = (
+            torch.linspace(0, iH - 1, fH, dtype=torch.float)
+            .view(1, fH, 1)
+            .expand(D, fH, fW)
+        )
+
+        frustum = torch.stack((xs, ys, ds), -1)
+        # return nn.Parameter(frustum, requires_grad=False)
+        return frustum
+    @force_fp32()
+    def get_geometry_v1(
+        self,
+        fH,
+        fW,
+        rots,
+        trans,
+        intrins,
+        post_rots,
+        post_trans,
+        lidar2ego_rots,
+        lidar2ego_trans,
+        img_metas,
+        **kwargs,
+    ):
+        B, N, _ = trans.shape
+        device = trans.device
+        if self.frustum == None:
+            self.frustum = self.create_frustum(fH,fW,img_metas)
+            self.frustum = self.frustum.to(device)
+            # self.D = self.frustum.shape[0]
+        
+        # undo post-transformation
+        # B x N x D x H x W x 3
+        points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
+        points = (
+            torch.inverse(post_rots)
+            .view(B, N, 1, 1, 1, 3, 3)
+            .matmul(points.unsqueeze(-1))
+        )
+        # cam_to_ego
+        points = torch.cat(
+            (
+                points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3],
+                points[:, :, :, :, :, 2:3],
+            ),
+            5,
+        )
+        combine = rots.matmul(torch.inverse(intrins))
+        points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
+        points += trans.view(B, N, 1, 1, 1, 3)
+        # ego_to_lidar
+        points -= lidar2ego_trans.view(B, 1, 1, 1, 1, 3)
+        points = (
+            torch.inverse(lidar2ego_rots)
+            .view(B, 1, 1, 1, 1, 3, 3)
+            .matmul(points.unsqueeze(-1))
+            .squeeze(-1)
+        )
+
+        if "extra_rots" in kwargs:
+            extra_rots = kwargs["extra_rots"]
+            points = (
+                extra_rots.view(B, 1, 1, 1, 1, 3, 3)
+                .repeat(1, N, 1, 1, 1, 1, 1)
+                .matmul(points.unsqueeze(-1))
+                .squeeze(-1)
+            )
+        if "extra_trans" in kwargs:
+            extra_trans = kwargs["extra_trans"]
+            points += extra_trans.view(B, 1, 1, 1, 1, 3).repeat(1, N, 1, 1, 1, 1)
+
+        return points
+
+    @force_fp32()
+    def get_geometry(
+        self,
+        fH,
+        fW,
+        lidar2img,
+        img_metas,
+    ):
+        B, N, _, _ = lidar2img.shape
+        device = lidar2img.device
+        # import pdb;pdb.set_trace()
+        if self.frustum == None:
+            self.frustum = self.create_frustum(fH,fW,img_metas)
+            self.frustum = self.frustum.to(device)
+            # self.D = self.frustum.shape[0]
+        
+        points = self.frustum.view(1,1,self.D, fH, fW, 3) \
+                 .repeat(B,N,1,1,1,1)
+        lidar2img = lidar2img.view(B,N,1,1,1,4,4)
+        # img2lidar = torch.inverse(lidar2img)
+        points = torch.cat(
+            (points, torch.ones_like(points[..., :1])), -1)
+        points = torch.linalg.solve(lidar2img.to(torch.float32), 
+                                    points.unsqueeze(-1).to(torch.float32)).squeeze(-1)
+        # points = torch.matmul(img2lidar.to(torch.float32),
+        #                       points.unsqueeze(-1).to(torch.float32)).squeeze(-1)
+        # import pdb;pdb.set_trace()
+        eps = 1e-5
+        points = points[..., 0:3] / torch.maximum(
+            points[..., 3:4], torch.ones_like(points[..., 3:4]) * eps)
+
+        return points
+
+    def get_cam_feats(self, x):
+        raise NotImplementedError
+
+    @force_fp32()
+    def bev_pool(self, geom_feats, x):
+        B, N, D, H, W, C = x.shape
+        Nprime = B * N * D * H * W
+
+        # flatten x
+        x = x.reshape(Nprime, C)
+
+        # flatten indices
+        geom_feats = ((geom_feats - (self.bx - self.dx / 2.0)) / self.dx).long()
+        geom_feats = geom_feats.view(Nprime, 3)
+        batch_ix = torch.cat(
+            [
+                torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long)
+                for ix in range(B)
+            ]
+        )
+        geom_feats = torch.cat((geom_feats, batch_ix), 1)
+
+        # filter out points that are outside box
+        kept = (
+            (geom_feats[:, 0] >= 0)
+            & (geom_feats[:, 0] < self.nx[0])
+            & (geom_feats[:, 1] >= 0)
+            & (geom_feats[:, 1] < self.nx[1])
+            & (geom_feats[:, 2] >= 0)
+            & (geom_feats[:, 2] < self.nx[2])
+        )
+        x = x[kept]
+        geom_feats = geom_feats[kept]
+
+        x = bev_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1])
+
+        # collapse Z
+        final = torch.cat(x.unbind(dim=2), 1)
+
+        return final
+
+    @force_fp32()
+    def forward(
+        self,
+        images,
+        img_metas
+    ):
+        B, N, C, fH, fW = images.shape
+        lidar2img = []
+        camera2ego = []
+        camera_intrinsics = []
+        img_aug_matrix = []
+        lidar2ego = []
+
+        for img_meta in img_metas:
+            lidar2img.append(img_meta['lidar2img'])
+            camera2ego.append(img_meta['camera2ego'])
+            camera_intrinsics.append(img_meta['camera_intrinsics'])
+            img_aug_matrix.append(img_meta['img_aug_matrix'])
+            lidar2ego.append(img_meta['lidar2ego'])
+        lidar2img = np.asarray(lidar2img)
+        lidar2img = images.new_tensor(lidar2img)  # (B, N, 4, 4)
+        camera2ego = np.asarray(camera2ego)
+        camera2ego = images.new_tensor(camera2ego)  # (B, N, 4, 4)
+        camera_intrinsics = np.asarray(camera_intrinsics)
+        camera_intrinsics = images.new_tensor(camera_intrinsics)  # (B, N, 4, 4)
+        img_aug_matrix = np.asarray(img_aug_matrix)
+        img_aug_matrix = images.new_tensor(img_aug_matrix)  # (B, N, 4, 4)
+        lidar2ego = np.asarray(lidar2ego)
+        lidar2ego = images.new_tensor(lidar2ego)  # (B, N, 4, 4)
+
+        # import pdb;pdb.set_trace()
+        # lidar2cam = torch.linalg.solve(camera2ego, lidar2ego.view(B,1,4,4).repeat(1,N,1,1))
+        # lidar2oriimg = torch.matmul(camera_intrinsics,lidar2cam)
+        # mylidar2img = torch.matmul(img_aug_matrix,lidar2oriimg)
+
+
+
+        rots = camera2ego[..., :3, :3]
+        trans = camera2ego[..., :3, 3]
+        intrins = camera_intrinsics[..., :3, :3]
+        post_rots = img_aug_matrix[..., :3, :3]
+        post_trans = img_aug_matrix[..., :3, 3]
+        lidar2ego_rots = lidar2ego[..., :3, :3]
+        lidar2ego_trans = lidar2ego[..., :3, 3]
+
+        # tmpgeom = self.get_geometry(
+        #     fH,
+        #     fW,
+        #     mylidar2img,
+        #     img_metas,
+        # )
+
+        geom = self.get_geometry_v1(
+            fH,
+            fW,
+            rots,
+            trans,
+            intrins,
+            post_rots,
+            post_trans,
+            lidar2ego_rots,
+            lidar2ego_trans,
+            img_metas
+        )
+        x = self.get_cam_feats(images)
+        x = self.bev_pool(geom, x)
+        # import pdb;pdb.set_trace()
+        x = x.permute(0,1,3,2).contiguous()
+        
+        return x
+
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class LSSTransform(BaseTransform):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        feat_down_sample,
+        pc_range,
+        voxel_size,
+        dbound,
+        downsample=1,
+    ):
+        super(LSSTransform, self).__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            feat_down_sample=feat_down_sample,
+            pc_range=pc_range,
+            voxel_size=voxel_size,
+            dbound=dbound,
+        )
+        # import pdb;pdb.set_trace()
+        self.depthnet = nn.Conv2d(in_channels, int(self.D + self.C), 1)
+        if downsample > 1:
+            assert downsample == 2, downsample
+            self.downsample = nn.Sequential(
+                nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+                nn.Conv2d(
+                    out_channels,
+                    out_channels,
+                    3,
+                    stride=downsample,
+                    padding=1,
+                    bias=False,
+                ),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+                nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
+                nn.BatchNorm2d(out_channels),
+                nn.ReLU(True),
+            )
+        else:
+            self.downsample = nn.Identity()
+
+    @force_fp32()
+    def get_cam_feats(self, x):
+        B, N, C, fH, fW = x.shape
+
+        x = x.view(B * N, C, fH, fW)
+
+        x = self.depthnet(x)
+        depth = x[:, : self.D].softmax(dim=1)
+        x = depth.unsqueeze(1) * x[:, self.D : (self.D + self.C)].unsqueeze(2)
+
+        x = x.view(B, N, self.C, self.D, fH, fW)
+        x = x.permute(0, 1, 3, 4, 5, 2)
+        return x
+
+    def forward(self, images, img_metas):
+        x = super().forward(images, img_metas)
+        x = self.downsample(x)
+        return x
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/geometry_kernel_attention.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/geometry_kernel_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..03ba0e7f8e6cbacc7d5cc6abbdde1d542b8e030c
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/geometry_kernel_attention.py
@@ -0,0 +1,506 @@
+import warnings
+import time
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import xavier_init, constant_init
+from mmcv.cnn.bricks.registry import (ATTENTION,
+                                      TRANSFORMER_LAYER,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+from mmcv.cnn.bricks.transformer import build_attention
+import math
+from mmcv.runner import force_fp32, auto_fp16
+
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+from projects.mmdet3d_plugin.models.utils.bricks import run_time
+
+from .ops.geometric_kernel_attn import GeometricKernelAttentionFunc
+
+@ATTENTION.register_module()
+class GeometrySptialCrossAttention(BaseModule):
+    """An attention module used in BEVFormer.
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_cams (int): The number of cameras
+        dropout (float): A Dropout layer on `inp_residual`.
+            Default: 0..
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+        deformable_attention: (dict): The config for the deformable attention used in SCA.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_cams=6,
+                 pc_range=None,
+                 dropout=0.1,
+                 init_cfg=None,
+                 batch_first=False,
+                 attention=dict(
+                     type='MSDeformableAttention3D',
+                     embed_dims=256,
+                     num_levels=4),
+                 **kwargs
+                 ):
+        super(GeometrySptialCrossAttention, self).__init__(init_cfg)
+
+        self.init_cfg = init_cfg
+        self.dropout = nn.Dropout(dropout)
+        self.pc_range = pc_range
+        self.fp16_enabled = False
+        self.attention = build_attention(attention)
+        self.embed_dims = embed_dims
+        self.num_cams = num_cams
+        self.output_proj = nn.Linear(embed_dims, embed_dims)
+        self.batch_first = batch_first
+        self.init_weight()
+
+    def init_weight(self):
+        """Default initialization for Parameters of Module."""
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+
+    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam'))
+    def forward(self,
+                query,
+                key,
+                value,
+                residual=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                reference_points_cam=None,
+                bev_mask=None,
+                level_start_index=None,
+                flag='encoder',
+                **kwargs):
+        """Forward Function of Detr3DCrossAtten.
+        Args:
+            query (Tensor): Query of Transformer with shape
+                (num_query, bs, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(num_key, bs, embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(num_key, bs, embed_dims)`. (B, N, C, H, W)
+            residual (Tensor): The tensor used for addition, with the
+                same shape as `x`. Default None. If None, `x` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for  `key`. Default
+                None.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, 4),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_key].
+            spatial_shapes (Tensor): Spatial shape of features in
+                different level. With shape  (num_levels, 2),
+                last dimension represent (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape (num_levels) and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if key is None:
+            key = query
+        if value is None:
+            value = key
+
+        if residual is None:
+            inp_residual = query
+            slots = torch.zeros_like(query)
+        if query_pos is not None:
+            query = query + query_pos
+
+        bs, num_query, _ = query.size()
+
+        D = reference_points_cam.size(3)
+        indexes = []
+        for i, mask_per_img in enumerate(bev_mask):
+            index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1)
+            indexes.append(index_query_per_img)
+        max_len = max([len(each) for each in indexes])
+
+        # each camera only interacts with its corresponding BEV queries. This step can  greatly save GPU memory.
+        queries_rebatch = query.new_zeros(
+            [bs, self.num_cams, max_len, self.embed_dims])
+        reference_points_rebatch = reference_points_cam.new_zeros(
+            [bs, self.num_cams, max_len, D, 2])
+
+        for j in range(bs):
+            for i, reference_points_per_img in enumerate(reference_points_cam):
+                index_query_per_img = indexes[i]
+                queries_rebatch[j, i, :len(
+                    index_query_per_img)] = query[j, index_query_per_img]
+                reference_points_rebatch[j, i, :len(
+                    index_query_per_img)] = reference_points_per_img[j, index_query_per_img]
+
+        num_cams, l, bs, embed_dims = key.shape
+
+        key = key.permute(2, 0, 1, 3).reshape(
+            bs * self.num_cams, l, self.embed_dims)
+        value = value.permute(2, 0, 1, 3).reshape(
+            bs * self.num_cams, l, self.embed_dims)
+
+        queries = self.attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value,
+                                 reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes,
+                                 level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims)
+        for j in range(bs):
+            for i, index_query_per_img in enumerate(indexes):
+                slots[j, index_query_per_img] += queries[j,
+                                                         i, :len(index_query_per_img)]
+
+        count = bev_mask.sum(-1) > 0
+        count = count.permute(1, 2, 0).sum(-1)
+        count = torch.clamp(count, min=1.0)
+        slots = slots / count[..., None]
+        slots = self.output_proj(slots)
+
+        return self.dropout(slots) + inp_residual
+
+
+@ATTENTION.register_module()
+class GeometryKernelAttention(BaseModule):
+    """An attention module used in BEVFormer based on Deformable-Detr.
+    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
+    <https://arxiv.org/pdf/2010.04159.pdf>`_.
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 64.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): A Dropout layer on `inp_identity`.
+            Default: 0.1.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=4,
+                 kernel_size=(3, 3),
+                 dilation=1,
+                 im2col_step=64,
+                 dropout=0.1,
+                 batch_first=True,
+                 norm_cfg=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.batch_first = batch_first
+        self.output_proj = None
+        self.fp16_enabled = False
+
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        # 4
+        self.num_levels = num_levels
+        # 4 num_heads -> num_z_anchors
+        self.num_heads = num_heads
+        self.kernel_size = kernel_size
+        self.num_points = kernel_size[0] * kernel_size[1]
+        # self.sampling_offsets = nn.Linear(
+        #     embed_dims, num_heads * num_levels * self.num_points * 2)
+
+        self.attention_weights = nn.Linear(
+            embed_dims, num_levels * self.num_points * self.num_heads)
+        self.value_proj = nn.Linear(embed_dims, embed_dims)
+
+        grid_h, grid_w = kernel_size
+        y = (torch.arange(grid_h) - grid_h // 2) * dilation
+        x = (torch.arange(grid_w) - grid_w // 2) * dilation
+        offsets = torch.stack(
+            torch.meshgrid(x, y)).permute(1, 2, 0).reshape(grid_h * grid_w, 2)
+        self.register_buffer("grid_offsets", offsets, persistent=False)
+        self.init_weights()
+
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        # constant_init(self.sampling_offsets, 0.)
+        # thetas = torch.arange(
+        #     self.num_heads,
+        #     dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        # grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        # grid_init = (grid_init /
+        #              grid_init.abs().max(-1, keepdim=True)[0]).view(
+        #     self.num_heads, 1, 1,
+        #     2).repeat(1, self.num_levels, self.num_points, 1)
+        # for i in range(self.num_points):
+        #     grid_init[:, :, i, :] *= i + 1
+
+        # self.sampling_offsets.bias.data = grid_init.view(-1)
+        constant_init(self.attention_weights, val=0., bias=0.)
+        xavier_init(self.value_proj, distribution='uniform', bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+        self._is_init = True
+
+    def forward_kernel_multihead_attention(self, value, spatial_shapes, sampling_locations, attention_weights):
+        # value: (bs, n, d)
+        """CPU version of multi-scale deformable attention.
+
+        Args:
+            value (Tensor): The value has shape
+                (bs, num_keys, dim)
+            spatial_shapes (Tensor): Spatial shape of
+                each feature map, has shape (num_levels, 2),
+                last dimension 2 represent (h, w)
+            sampling_locations (Tensor): The location of sampling points,
+                has shape
+                (bs ,num_queries, num_levels, num_points, 2),
+                the last dimension 2 represent (x, y).
+            attention_weights (Tensor): The weight of sampling points used
+                when calculate the attention, has shape
+                (bs ,num_queries, num_levels, num_points),
+
+        Returns:
+            Tensor: has shape (bs, num_queries, embed_dims)
+        """
+        # print(value.shape, sampling_locations.shape, attention_weights.shape)
+        # print(value.shape)
+        bs, num_keys, num_heads, dim = value.shape
+        # (bs * num_heads * num_keys, d)
+        # torch.cuda.synchronize()
+        # start2 = time.perf_counter()
+        value = value.transpose(1, 2).contiguous().view(
+            bs * num_heads * num_keys, dim)
+        _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+        with torch.no_grad():
+            sampling_index = sampling_locations.new_zeros(
+                (bs, num_queries, num_heads, num_levels, num_points)).to(value.device)
+            start_index = 0
+            for level, (H_, W_) in enumerate(spatial_shapes):
+                # xy or yx?
+                sampling_locations[:, :, :, level,
+                                   :, 0].clamp_(min=0, max=W_-1)
+                sampling_locations[:, :, :, level,
+                                   :, 1].clamp_(min=0, max=H_-1)
+                sampling_index[:, :, :, level] = start_index + sampling_locations[:, :, :, level, :, 0] \
+                    + sampling_locations[:, :, :, level, :, 1] * W_
+                start_index += H_ * W_
+            # print(start_index)
+            # head index, (bs, head, num_quries,)
+            sampling_index = sampling_index.transpose(
+                1, 2).reshape(bs, num_heads, -1)
+            sampling_index = sampling_index + \
+                (torch.arange(num_heads).to(sampling_index)
+                 * num_keys).view(1, num_heads, 1)
+            # batch index
+            sampling_index = sampling_index.reshape(
+                bs, -1) + (torch.arange(bs).to(sampling_index) * num_keys * num_heads).view(bs, 1)
+        # torch.cuda.synchronize()
+        # end = time.perf_counter()
+        # print("geometric kernel attention (index): {:.3f} ms".format(
+        #     (end-start)*1000))
+        # torch.cuda.synchronize()
+        # start = time.perf_counter()
+        sampling_value = value[sampling_index].view(
+            bs, num_heads, num_queries, num_levels * num_points, dim)
+        # print(sampling_value.shape)
+        attention_weights = attention_weights.transpose(1, 2).contiguous().view(
+            bs, num_heads, num_queries, num_levels * num_points, 1)
+        # torch.cuda.synchronize()
+        # end = time.perf_counter()
+        # print("geometric kernel attention (sample): {:.3f} ms".format(
+        #     (end-start)*1000))
+        # # (bs*head, num_queries, num_levels * num_points, d) -> (bs, head, num_queries, d)
+        # torch.cuda.synchronize()
+        # start = time.perf_counter()
+        output = (sampling_value *
+                  attention_weights).sum(-2).transpose(1, 2).contiguous()
+        # torch.cuda.synchronize()
+        # end = time.perf_counter()
+        # print("geometric kernel attention (matmul): {:.3f} ms".format(
+        #     (end-start)*1000))
+        # print('x;', output.shape)
+        return output.view(bs, num_queries, -1)
+
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                **kwargs):
+        """Forward Function of MultiScaleDeformAttention.
+        Args:
+            query (Tensor): Query of Transformer with shape
+                ( bs, num_query, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            value (Tensor): The value tensor with shape
+                `(bs, num_key,  embed_dims)`.
+            identity (Tensor): The tensor used for addition, with the
+                same shape as `query`. Default None. If None,
+                `query` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_pos (Tensor): The positional encoding for `key`. Default
+                None.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            key_padding_mask (Tensor): ByteTensor for `query`, with
+                shape [bs, num_key].
+            spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        """
+
+        if value is None:
+            value = query
+        if identity is None:
+            identity = query
+        if query_pos is not None:
+            query = query + query_pos
+
+        if not self.batch_first:
+            # change to (bs, num_query ,embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+
+        bs, num_query, _ = query.shape
+        bs, num_value, _ = value.shape
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+
+        value = self.value_proj(value)
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], 0.0)
+        value = value.view(bs, num_value, self.num_heads, -1)
+        # sampling_offsets = self.sampling_offsets(query).view(
+        #     bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
+
+        # bs, num_query, num_heads, num_levels, num_points
+        # bs, q, 4, 4, K^2
+        attention_weights = self.attention_weights(query).view(
+            bs, num_query, self.num_heads, self.num_levels * self.num_points)
+
+        attention_weights = attention_weights.softmax(-1)
+
+        attention_weights = attention_weights.view(bs, num_query,
+                                                   self.num_heads,
+                                                   self.num_levels,
+                                                   self.num_points)
+
+        if reference_points.shape[-1] == 2:
+            """
+            For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights.
+            After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image.
+            For each referent point, we sample `num_points` sampling points.
+            For `num_Z_anchors` reference points,  it has overall `num_points * num_Z_anchors` sampling points.
+            """
+            with torch.no_grad():
+                offset_normalizer = torch.stack(
+                    [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+
+                bs, num_query, num_Z_anchors, xy = reference_points.shape
+                # from IPython import embed; embed()
+                # (K,2) -> (1, 1, 1, 1, k, 2) -> (bs, q, nz, l, k, 2)
+                offsets = self.grid_offsets[None, None, None, None]
+                # (bs, q, nz, 1, xy) -> (bs, q, z, l, 2)
+                reference_points = reference_points[:,
+                                                    :, :, None, :] * offset_normalizer
+
+                # from IPython import embed;embed()
+                # (bs, q, nz, l, k, xy)
+                sampling_locations = (
+                    reference_points[:, :, :, :, None, :] + offsets).round().long()
+
+            # sampling_offsets = sampling_offsets / \
+            #     offset_normalizer[None, None, None, :, None, :]
+            # (bs, q, 4(z), 4, K^2, 2)
+            bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_locations.shape
+            # sampling_offsets = sampling_offsets.view(
+            #     bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy)
+            # sampling_locations = reference_points + sampling_offsets
+            # bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape
+            # assert num_all_points == num_points * num_Z_anchors
+
+            # sampling_locations = sampling_locations.view(
+            #     bs, num_query, num_heads, num_levels, num_all_points, xy)
+
+        elif reference_points.shape[-1] == 4:
+            assert False
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
+
+        #  sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2
+        #  attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points
+        # import pdb;pdb.set_trace()
+        # output = self.forward_kernel_multihead_attention(
+        #     value, spatial_shapes, sampling_locations, attention_weights)
+        # torch.cuda.synchronize()
+        # start = time.perf_counter()
+        output = GeometricKernelAttentionFunc.apply(
+            value, spatial_shapes, level_start_index, sampling_locations.contiguous(), attention_weights, self.im2col_step
+        )
+        # if torch.cuda.is_available() and value.is_cuda:
+        #     if value.dtype == torch.float16:
+        #         MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
+        #     else:
+        #         MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
+        #     output = MultiScaleDeformableAttnFunction.apply(
+        #         value, spatial_shapes, level_start_index, sampling_locations,
+        #         attention_weights, self.im2col_step)
+        # else:
+        #     output = multi_scale_deformable_attn_pytorch(
+        #         value, spatial_shapes, sampling_locations, attention_weights)
+        if not self.batch_first:
+            output = output.permute(1, 0, 2)
+        # torch.cuda.synchronize()
+        # end = time.perf_counter()
+        # print("geometric kernel attention: {:.3f} ms".format((end-start)*1000))
+        return output
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7578beb75c41d3a145992e9ef3663c88fd556d6d
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/__init__.py
@@ -0,0 +1 @@
+from .function import GeometricKernelAttentionFunc
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/function/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/function/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ee42aa17f6d5aa51304f91557d2c6a39c28a5ea
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/function/__init__.py
@@ -0,0 +1 @@
+from .geometric_kernel_attn_func import GeometricKernelAttentionFunc
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/function/geometric_kernel_attn_func.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/function/geometric_kernel_attn_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c44a7e3108a18fa997095c03b000a770e5ba770
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/function/geometric_kernel_attn_func.py
@@ -0,0 +1,31 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+import GeometricKernelAttention as GKA
+
+
+class GeometricKernelAttentionFunc(Function):
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        ctx.im2col_step = im2col_step
+        output = GKA.geometric_kernel_attn_cuda_forward(
+            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes,
+                              value_level_start_index, sampling_locations, attention_weights)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value, grad_attn_weight = \
+            GKA.geometric_kernel_attn_cuda_backward(
+                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
+
+        return grad_value, None, None, None, grad_attn_weight, None
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/setup.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5891640b18f8c5ea50f266ca6e63f1b0d1f99c
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/setup.py
@@ -0,0 +1,65 @@
+import os
+import glob
+
+import torch
+
+from torch.utils.cpp_extension import CUDA_HOME
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+
+from setuptools import find_packages
+from setuptools import setup
+
+requirements = ["torch", "torchvision"]
+
+
+def get_extensions():
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "src")
+
+    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
+    # source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu"))
+
+    sources = main_file
+    extension = CppExtension
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+
+    if torch.cuda.is_available() and CUDA_HOME is not None:
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+    else:
+        raise NotImplementedError('Cuda is not availabel')
+
+    sources = [os.path.join(extensions_dir, s) for s in sources]
+    include_dirs = [extensions_dir]
+    ext_modules = [
+        extension(
+            "GeometricKernelAttention",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+    return ext_modules
+
+
+setup(
+    name="GeometricKernelAttention",
+    version="1.0",
+    author="Tianheng Cheng",
+    url="https://github.com/hustvl",
+    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Geometric Kernel Attention",
+    packages=find_packages(exclude=("configs", "tests",)),
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/src/version.cpp b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/src/version.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c53ff56a9654d23aecd1761546ce4f823bfea21d
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/src/version.cpp
@@ -0,0 +1,7 @@
+#include "geometric_kernel_attn.h"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+  m.def("geometric_kernel_attn_cuda_forward", &geometric_kernel_attn_cuda_forward, "geometric_kernel_attn_cuda_forward");
+  m.def("geometric_kernel_attn_cuda_backward", &geometric_kernel_attn_cuda_backward, "geometric_kernel_attn_cuda_backward");
+}
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/test.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/ops/geometric_kernel_attn/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/transformer.py b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dc31c7fb4ed65113483b20a2662d2a2382bc8c8
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/maptr/modules/transformer.py
@@ -0,0 +1,355 @@
+import copy
+import torch
+import torch.nn as nn
+import numpy as np
+from torch.nn.init import normal_
+import torch.nn.functional as F
+from mmdet.models.utils.builder import TRANSFORMER
+from mmcv.cnn import Linear, bias_init_with_prob, xavier_init, constant_init
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence
+from torchvision.transforms.functional import rotate
+from projects.mmdet3d_plugin.bevformer.modules.temporal_self_attention import TemporalSelfAttention
+from projects.mmdet3d_plugin.bevformer.modules.spatial_cross_attention import MSDeformableAttention3D
+from projects.mmdet3d_plugin.bevformer.modules.decoder import CustomMSDeformableAttention
+from .builder import build_fuser, FUSERS
+from typing import List
+
+@FUSERS.register_module()
+class ConvFuser(nn.Sequential):
+    def __init__(self, in_channels: int, out_channels: int) -> None:
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        super().__init__(
+            nn.Conv2d(sum(in_channels), out_channels, 3, padding=1, bias=False),
+            nn.BatchNorm2d(out_channels),
+            nn.ReLU(True),
+        )
+
+    def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor:
+        return super().forward(torch.cat(inputs, dim=1))
+
+
+
+@TRANSFORMER.register_module()
+class MapTRPerceptionTransformer(BaseModule):
+    """Implements the Detr3D transformer.
+    Args:
+        as_two_stage (bool): Generate query from encoder features.
+            Default: False.
+        num_feature_levels (int): Number of feature maps from FPN:
+            Default: 4.
+        two_stage_num_proposals (int): Number of proposals when set
+            `as_two_stage` as True. Default: 300.
+    """
+
+    def __init__(self,
+                 num_feature_levels=4,
+                 num_cams=6,
+                 two_stage_num_proposals=300,
+                 fuser=None,
+                 encoder=None,
+                 decoder=None,
+                 embed_dims=256,
+                 rotate_prev_bev=True,
+                 use_shift=True,
+                 use_can_bus=True,
+                 len_can_bus=18,
+                 can_bus_norm=True,
+                 use_cams_embeds=True,
+                 rotate_center=[100, 100],
+                 modality='vision',
+                 **kwargs):
+        super(MapTRPerceptionTransformer, self).__init__(**kwargs)
+        if modality == 'fusion':
+            self.fuser = build_fuser(fuser) #TODO
+        self.use_attn_bev = encoder['type'] == 'BEVFormerEncoder'
+        self.encoder = build_transformer_layer_sequence(encoder)
+        self.decoder = build_transformer_layer_sequence(decoder)
+        self.embed_dims = embed_dims
+        self.num_feature_levels = num_feature_levels
+        self.num_cams = num_cams
+        self.fp16_enabled = False
+
+        self.rotate_prev_bev = rotate_prev_bev
+        self.use_shift = use_shift
+        self.use_can_bus = use_can_bus
+        self.len_can_bus = len_can_bus
+        self.can_bus_norm = can_bus_norm
+        self.use_cams_embeds = use_cams_embeds
+
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.init_layers()
+        self.rotate_center = rotate_center
+
+    def init_layers(self):
+        """Initialize layers of the Detr3DTransformer."""
+        self.level_embeds = nn.Parameter(torch.Tensor(
+            self.num_feature_levels, self.embed_dims))
+        self.cams_embeds = nn.Parameter(
+            torch.Tensor(self.num_cams, self.embed_dims))
+        self.reference_points = nn.Linear(self.embed_dims, 2) # TODO, this is a hack
+        self.can_bus_mlp = nn.Sequential(
+            nn.Linear(self.len_can_bus, self.embed_dims // 2),
+            nn.ReLU(inplace=True),
+            nn.Linear(self.embed_dims // 2, self.embed_dims),
+            nn.ReLU(inplace=True),
+        )
+        if self.can_bus_norm:
+            self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims))
+
+    def init_weights(self):
+        """Initialize the transformer weights."""
+        for p in self.parameters():
+            if p.dim() > 1:
+                nn.init.xavier_uniform_(p)
+        for m in self.modules():
+            if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \
+                    or isinstance(m, CustomMSDeformableAttention):
+                try:
+                    m.init_weight()
+                except AttributeError:
+                    m.init_weights()
+        normal_(self.level_embeds)
+        normal_(self.cams_embeds)
+        xavier_init(self.reference_points, distribution='uniform', bias=0.)
+        xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.)
+    # TODO apply fp16 to this module cause grad_norm NAN
+    # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos'), out_fp32=True)
+    def attn_bev_encode(
+            self,
+            mlvl_feats,
+            bev_queries,
+            bev_h,
+            bev_w,
+            grid_length=(0.512, 0.512),
+            bev_pos=None,
+            prev_bev=None,
+            **kwargs):
+        bs = mlvl_feats[0].size(0)
+        bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1)
+        bev_pos = bev_pos.flatten(2).permute(2, 0, 1)
+
+        # obtain rotation angle and shift with ego motion
+        delta_x = np.array([each['can_bus'][0]
+                           for each in kwargs['img_metas']])
+        delta_y = np.array([each['can_bus'][1]
+                           for each in kwargs['img_metas']])
+        ego_angle = np.array(
+            [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']])
+        grid_length_y = grid_length[0]
+        grid_length_x = grid_length[1]
+        translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2)
+        translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180
+        bev_angle = ego_angle - translation_angle
+        shift_y = translation_length * \
+            np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h
+        shift_x = translation_length * \
+            np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w
+        shift_y = shift_y * self.use_shift
+        shift_x = shift_x * self.use_shift
+        shift = bev_queries.new_tensor(
+            [shift_x, shift_y]).permute(1, 0)  # xy, bs -> bs, xy
+
+        if prev_bev is not None:
+            if prev_bev.shape[1] == bev_h * bev_w:
+                prev_bev = prev_bev.permute(1, 0, 2)
+            if self.rotate_prev_bev:
+                for i in range(bs):
+                    # num_prev_bev = prev_bev.size(1)
+                    rotation_angle = kwargs['img_metas'][i]['can_bus'][-1]
+                    tmp_prev_bev = prev_bev[:, i].reshape(
+                        bev_h, bev_w, -1).permute(2, 0, 1)
+                    tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle,
+                                          center=self.rotate_center)
+                    tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape(
+                        bev_h * bev_w, 1, -1)
+                    prev_bev[:, i] = tmp_prev_bev[:, 0]
+
+        # add can bus signals
+        can_bus = bev_queries.new_tensor(
+            [each['can_bus'] for each in kwargs['img_metas']])  # [:, :]
+        can_bus = self.can_bus_mlp(can_bus[:, :self.len_can_bus])[None, :, :]
+        bev_queries = bev_queries + can_bus * self.use_can_bus
+
+        feat_flatten = []
+        spatial_shapes = []
+        for lvl, feat in enumerate(mlvl_feats):
+            bs, num_cam, c, h, w = feat.shape
+            spatial_shape = (h, w)
+            feat = feat.flatten(3).permute(1, 0, 3, 2)
+            if self.use_cams_embeds:
+                feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype)
+            feat = feat + self.level_embeds[None,
+                                            None, lvl:lvl + 1, :].to(feat.dtype)
+            spatial_shapes.append(spatial_shape)
+            feat_flatten.append(feat)
+
+        feat_flatten = torch.cat(feat_flatten, 2)
+        spatial_shapes = torch.as_tensor(
+            spatial_shapes, dtype=torch.long, device=bev_pos.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros(
+            (1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+
+        feat_flatten = feat_flatten.permute(
+            0, 2, 1, 3)  # (num_cam, H*W, bs, embed_dims)
+
+        bev_embed = self.encoder(
+            bev_queries,
+            feat_flatten,
+            feat_flatten,
+            bev_h=bev_h,
+            bev_w=bev_w,
+            bev_pos=bev_pos,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            prev_bev=prev_bev,
+            shift=shift,
+            **kwargs
+        )
+        return bev_embed
+
+    def lss_bev_encode(
+            self,
+            mlvl_feats,
+            prev_bev=None,
+            **kwargs):
+        assert len(mlvl_feats) == 1, 'Currently we only support single level feat in LSS'
+        images = mlvl_feats[0]
+        img_metas = kwargs['img_metas']
+        bev_embed = self.encoder(images,img_metas)
+        bs, c, _,_ = bev_embed.shape
+        bev_embed = bev_embed.view(bs,c,-1).permute(0,2,1).contiguous()
+        
+        return bev_embed
+
+    def get_bev_features(
+            self,
+            mlvl_feats,
+            lidar_feat,
+            bev_queries,
+            bev_h,
+            bev_w,
+            grid_length=[0.512, 0.512],
+            bev_pos=None,
+            prev_bev=None,
+            **kwargs):
+        """
+        obtain bev features.
+        """
+        if self.use_attn_bev:
+            bev_embed = self.attn_bev_encode(
+                mlvl_feats,
+                bev_queries,
+                bev_h,
+                bev_w,
+                grid_length=grid_length,
+                bev_pos=bev_pos,
+                prev_bev=prev_bev,
+                **kwargs)
+        else:
+            bev_embed = self.lss_bev_encode(
+                mlvl_feats,
+                prev_bev=prev_bev,
+                **kwargs)
+        if lidar_feat is not None:
+            bs = mlvl_feats[0].size(0)
+            bev_embed = bev_embed.view(bs, bev_h, bev_w, -1).permute(0,3,1,2).contiguous()
+            lidar_feat = lidar_feat.permute(0,1,3,2).contiguous() # B C H W
+            lidar_feat = nn.functional.interpolate(lidar_feat, size=(bev_h,bev_w), mode='bicubic', align_corners=False)
+            fused_bev = self.fuser([bev_embed, lidar_feat])
+            fused_bev = fused_bev.flatten(2).permute(0,2,1).contiguous()
+            bev_embed = fused_bev
+
+        return bev_embed
+    # TODO apply fp16 to this module cause grad_norm NAN
+    # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos'))
+    def forward(self,
+                mlvl_feats,
+                lidar_feat,
+                bev_queries,
+                object_query_embed,
+                bev_h,
+                bev_w,
+                grid_length=[0.512, 0.512],
+                bev_pos=None,
+                reg_branches=None,
+                cls_branches=None,
+                prev_bev=None,
+                **kwargs):
+        """Forward function for `Detr3DTransformer`.
+        Args:
+            mlvl_feats (list(Tensor)): Input queries from
+                different level. Each element has shape
+                [bs, num_cams, embed_dims, h, w].
+            bev_queries (Tensor): (bev_h*bev_w, c)
+            bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w)
+            object_query_embed (Tensor): The query embedding for decoder,
+                with shape [num_query, c].
+            reg_branches (obj:`nn.ModuleList`): Regression heads for
+                feature maps from each decoder layer. Only would
+                be passed when `with_box_refine` is True. Default to None.
+        Returns:
+            tuple[Tensor]: results of decoder containing the following tensor.
+                - bev_embed: BEV features
+                - inter_states: Outputs from decoder. If
+                    return_intermediate_dec is True output has shape \
+                      (num_dec_layers, bs, num_query, embed_dims), else has \
+                      shape (1, bs, num_query, embed_dims).
+                - init_reference_out: The initial value of reference \
+                    points, has shape (bs, num_queries, 4).
+                - inter_references_out: The internal value of reference \
+                    points in decoder, has shape \
+                    (num_dec_layers, bs,num_query, embed_dims)
+                - enc_outputs_class: The classification score of \
+                    proposals generated from \
+                    encoder's feature maps, has shape \
+                    (batch, h*w, num_classes). \
+                    Only would be returned when `as_two_stage` is True, \
+                    otherwise None.
+                - enc_outputs_coord_unact: The regression results \
+                    generated from encoder's feature maps., has shape \
+                    (batch, h*w, 4). Only would \
+                    be returned when `as_two_stage` is True, \
+                    otherwise None.
+        """
+
+        bev_embed = self.get_bev_features(
+            mlvl_feats,
+            lidar_feat,
+            bev_queries,
+            bev_h,
+            bev_w,
+            grid_length=grid_length,
+            bev_pos=bev_pos,
+            prev_bev=prev_bev,
+            **kwargs)  # bev_embed shape: bs, bev_h*bev_w, embed_dims
+
+        bs = mlvl_feats[0].size(0)
+        query_pos, query = torch.split(
+            object_query_embed, self.embed_dims, dim=1)
+        query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
+        query = query.unsqueeze(0).expand(bs, -1, -1)
+        reference_points = self.reference_points(query_pos)
+        reference_points = reference_points.sigmoid()
+        init_reference_out = reference_points
+
+        query = query.permute(1, 0, 2)
+        query_pos = query_pos.permute(1, 0, 2)
+        bev_embed = bev_embed.permute(1, 0, 2)
+
+        inter_states, inter_references = self.decoder(
+            query=query,
+            key=None,
+            value=bev_embed,
+            query_pos=query_pos,
+            reference_points=reference_points,
+            reg_branches=reg_branches,
+            cls_branches=cls_branches,
+            spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device),
+            level_start_index=torch.tensor([0], device=query.device),
+            **kwargs)
+
+        inter_references_out = inter_references
+
+        return bev_embed, inter_states, init_reference_out, inter_references_out
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a9169d02a0cfb21e91c61431109b4d9ae304f1d
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/__init__.py
@@ -0,0 +1,4 @@
+from .vovnet import VoVNet
+from .efficientnet import EfficientNet
+from .swin import SwinTransformer
+__all__ = ['VoVNet']
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/efficientnet.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/efficientnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..82556ece1cd064f8bb65ffa2cb8ca4902eb6a108
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/efficientnet.py
@@ -0,0 +1,415 @@
+import copy
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn.bricks import ConvModule, DropPath
+from mmcv.runner import BaseModule, Sequential
+
+from mmdet.models.builder import BACKBONES
+from ..utils import InvertedResidual, SELayer, make_divisible
+
+
+class EdgeResidual(BaseModule):
+    """Edge Residual Block.
+    Args:
+        in_channels (int): The input channels of this module.
+        out_channels (int): The output channels of this module.
+        mid_channels (int): The input channels of the second convolution.
+        kernel_size (int): The kernel size of the first convolution.
+            Defaults to 3.
+        stride (int): The stride of the first convolution. Defaults to 1.
+        se_cfg (dict, optional): Config dict for se layer. Defaults to None,
+            which means no se layer.
+        with_residual (bool): Use residual connection. Defaults to True.
+        conv_cfg (dict, optional): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to ``dict(type='BN')``.
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to ``dict(type='ReLU')``.
+        drop_path_rate (float): stochastic depth rate. Defaults to 0.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+        init_cfg (dict | list[dict], optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 mid_channels,
+                 kernel_size=3,
+                 stride=1,
+                 se_cfg=None,
+                 with_residual=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 drop_path_rate=0.,
+                 with_cp=False,
+                 init_cfg=None,
+                 **kwargs):
+        super(EdgeResidual, self).__init__(init_cfg=init_cfg)
+        assert stride in [1, 2]
+        self.with_cp = with_cp
+        self.drop_path = DropPath(
+            drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.with_se = se_cfg is not None
+        self.with_residual = (
+            stride == 1 and in_channels == out_channels and with_residual)
+
+        if self.with_se:
+            assert isinstance(se_cfg, dict)
+
+        self.conv1 = ConvModule(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=kernel_size // 2,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        if self.with_se:
+            self.se = SELayer(**se_cfg)
+
+        self.conv2 = ConvModule(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=stride,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            out = x
+            out = self.conv1(out)
+
+            if self.with_se:
+                out = self.se(out)
+
+            out = self.conv2(out)
+
+            if self.with_residual:
+                return x + self.drop_path(out)
+            else:
+                return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
+
+
+def model_scaling(layer_setting, arch_setting):
+    """Scaling operation to the layer's parameters according to the
+    arch_setting."""
+    # scale width
+    new_layer_setting = copy.deepcopy(layer_setting)
+    for layer_cfg in new_layer_setting:
+        for block_cfg in layer_cfg:
+            block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8)
+
+    # scale depth
+    split_layer_setting = [new_layer_setting[0]]
+    for layer_cfg in new_layer_setting[1:-1]:
+        tmp_index = [0]
+        for i in range(len(layer_cfg) - 1):
+            if layer_cfg[i + 1][1] != layer_cfg[i][1]:
+                tmp_index.append(i + 1)
+        tmp_index.append(len(layer_cfg))
+        for i in range(len(tmp_index) - 1):
+            split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i +
+                                                                        1]])
+    split_layer_setting.append(new_layer_setting[-1])
+
+    num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]]
+    new_layers = [
+        int(math.ceil(arch_setting[1] * num)) for num in num_of_layers
+    ]
+
+    merge_layer_setting = [split_layer_setting[0]]
+    for i, layer_cfg in enumerate(split_layer_setting[1:-1]):
+        if new_layers[i] <= num_of_layers[i]:
+            tmp_layer_cfg = layer_cfg[:new_layers[i]]
+        else:
+            tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * (
+                new_layers[i] - num_of_layers[i])
+        if tmp_layer_cfg[0][3] == 1 and i != 0:
+            merge_layer_setting[-1] += tmp_layer_cfg.copy()
+        else:
+            merge_layer_setting.append(tmp_layer_cfg.copy())
+    merge_layer_setting.append(split_layer_setting[-1])
+
+    return merge_layer_setting
+
+
+@BACKBONES.register_module()
+class EfficientNet(BaseModule):
+    """EfficientNet backbone.
+    Args:
+        arch (str): Architecture of efficientnet. Defaults to b0.
+        out_indices (Sequence[int]): Output from which stages.
+            Defaults to (6, ).
+        frozen_stages (int): Stages to be frozen (all param fixed).
+            Defaults to 0, which means not freezing any parameters.
+        conv_cfg (dict): Config dict for convolution layer.
+            Defaults to None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults to dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Defaults to dict(type='Swish').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Defaults to False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Defaults to False.
+    """
+
+    # Parameters to build layers.
+    # 'b' represents the architecture of normal EfficientNet family includes
+    # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'.
+    # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es',
+    # 'em', 'el'.
+    # 6 parameters are needed to construct a layer, From left to right:
+    # - kernel_size: The kernel size of the block
+    # - out_channel: The number of out_channels of the block
+    # - se_ratio: The sequeeze ratio of SELayer.
+    # - stride: The stride of the block
+    # - expand_ratio: The expand_ratio of the mid_channels
+    # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual
+    layer_settings = {
+        'b': [[[3, 32, 0, 2, 0, -1]],
+              [[3, 16, 4, 1, 1, 0]],
+              [[3, 24, 4, 2, 6, 0],
+               [3, 24, 4, 1, 6, 0]],
+              [[5, 40, 4, 2, 6, 0],
+               [5, 40, 4, 1, 6, 0]],
+              [[3, 80, 4, 2, 6, 0],
+               [3, 80, 4, 1, 6, 0],
+               [3, 80, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0],
+               [5, 112, 4, 1, 6, 0]],
+              [[5, 192, 4, 2, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [5, 192, 4, 1, 6, 0],
+               [3, 320, 4, 1, 6, 0]],
+              [[1, 1280, 0, 1, 0, -1]]
+              ],
+        'e': [[[3, 32, 0, 2, 0, -1]],
+              [[3, 24, 0, 1, 3, 1]],
+              [[3, 32, 0, 2, 8, 1],
+               [3, 32, 0, 1, 8, 1]],
+              [[3, 48, 0, 2, 8, 1],
+               [3, 48, 0, 1, 8, 1],
+               [3, 48, 0, 1, 8, 1],
+               [3, 48, 0, 1, 8, 1]],
+              [[5, 96, 0, 2, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 96, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0],
+               [5, 144, 0, 1, 8, 0]],
+              [[5, 192, 0, 2, 8, 0],
+               [5, 192, 0, 1, 8, 0]],
+              [[1, 1280, 0, 1, 0, -1]]
+              ]
+    }  # yapf: disable
+
+    # Parameters to build different kinds of architecture.
+    # From left to right: scaling factor for width, scaling factor for depth,
+    # resolution.
+    arch_settings = {
+        'b0': (1.0, 1.0, 224),
+        'b1': (1.0, 1.1, 240),
+        'b2': (1.1, 1.2, 260),
+        'b3': (1.2, 1.4, 300),
+        'b4': (1.4, 1.8, 380),
+        'b5': (1.6, 2.2, 456),
+        'b6': (1.8, 2.6, 528),
+        'b7': (2.0, 3.1, 600),
+        'b8': (2.2, 3.6, 672),
+        'es': (1.0, 1.0, 224),
+        'em': (1.0, 1.1, 240),
+        'el': (1.2, 1.4, 300)
+    }
+
+    def __init__(self,
+                 arch='b0',
+                 drop_path_rate=0.,
+                 out_indices=(6, ),
+                 frozen_stages=0,
+                 conv_cfg=dict(type='Conv2dAdaptivePadding'),
+                 norm_cfg=dict(type='BN', eps=1e-3),
+                 act_cfg=dict(type='Swish'),
+                 norm_eval=False,
+                 with_cp=False,
+                 init_cfg=[
+                     dict(type='Kaiming', layer='Conv2d'),
+                     dict(
+                         type='Constant',
+                         layer=['_BatchNorm', 'GroupNorm'],
+                         val=1)
+                 ]):
+        super(EfficientNet, self).__init__(init_cfg)
+        assert arch in self.arch_settings, \
+            f'"{arch}" is not one of the arch_settings ' \
+            f'({", ".join(self.arch_settings.keys())})'
+        self.arch_setting = self.arch_settings[arch]
+        self.layer_setting = self.layer_settings[arch[:1]]
+        for index in out_indices:
+            if index not in range(0, len(self.layer_setting)):
+                raise ValueError('the item in out_indices must in '
+                                 f'range(0, {len(self.layer_setting)}). '
+                                 f'But received {index}')
+
+        if frozen_stages not in range(len(self.layer_setting) + 1):
+            raise ValueError('frozen_stages must be in range(0, '
+                             f'{len(self.layer_setting) + 1}). '
+                             f'But received {frozen_stages}')
+        self.drop_path_rate = drop_path_rate
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        self.layer_setting = model_scaling(self.layer_setting,
+                                           self.arch_setting)
+        block_cfg_0 = self.layer_setting[0][0]
+        block_cfg_last = self.layer_setting[-1][0]
+        self.in_channels = make_divisible(block_cfg_0[1], 8)
+        self.out_channels = block_cfg_last[1]
+        self.layers = nn.ModuleList()
+        self.layers.append(
+            ConvModule(
+                in_channels=3,
+                out_channels=self.in_channels,
+                kernel_size=block_cfg_0[0],
+                stride=block_cfg_0[3],
+                padding=block_cfg_0[0] // 2,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                act_cfg=self.act_cfg))
+        self.make_layer()
+        # Avoid building unused layers in mmdetection.
+        if len(self.layers) < max(self.out_indices) + 1:
+            self.layers.append(
+                ConvModule(
+                    in_channels=self.in_channels,
+                    out_channels=self.out_channels,
+                    kernel_size=block_cfg_last[0],
+                    stride=block_cfg_last[3],
+                    padding=block_cfg_last[0] // 2,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg))
+
+    def make_layer(self):
+        # Without the first and the final conv block.
+        layer_setting = self.layer_setting[1:-1]
+
+        total_num_blocks = sum([len(x) for x in layer_setting])
+        block_idx = 0
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, self.drop_path_rate, total_num_blocks)
+        ]  # stochastic depth decay rule
+
+        for i, layer_cfg in enumerate(layer_setting):
+            # Avoid building unused layers in mmdetection.
+            if i > max(self.out_indices) - 1:
+                break
+            layer = []
+            for i, block_cfg in enumerate(layer_cfg):
+                (kernel_size, out_channels, se_ratio, stride, expand_ratio,
+                 block_type) = block_cfg
+
+                mid_channels = int(self.in_channels * expand_ratio)
+                out_channels = make_divisible(out_channels, 8)
+                if se_ratio <= 0:
+                    se_cfg = None
+                else:
+                    # In mmdetection, the `divisor` is deleted to align
+                    # the logic of SELayer with mmcls.
+                    se_cfg = dict(
+                        channels=mid_channels,
+                        ratio=expand_ratio * se_ratio,
+                        act_cfg=(self.act_cfg, dict(type='Sigmoid')))
+                if block_type == 1:  # edge tpu
+                    if i > 0 and expand_ratio == 3:
+                        with_residual = False
+                        expand_ratio = 4
+                    else:
+                        with_residual = True
+                    mid_channels = int(self.in_channels * expand_ratio)
+                    if se_cfg is not None:
+                        # In mmdetection, the `divisor` is deleted to align
+                        # the logic of SELayer with mmcls.
+                        se_cfg = dict(
+                            channels=mid_channels,
+                            ratio=se_ratio * expand_ratio,
+                            act_cfg=(self.act_cfg, dict(type='Sigmoid')))
+                    block = partial(EdgeResidual, with_residual=with_residual)
+                else:
+                    block = InvertedResidual
+                layer.append(
+                    block(
+                        in_channels=self.in_channels,
+                        out_channels=out_channels,
+                        mid_channels=mid_channels,
+                        kernel_size=kernel_size,
+                        stride=stride,
+                        se_cfg=se_cfg,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg,
+                        act_cfg=self.act_cfg,
+                        drop_path_rate=dpr[block_idx],
+                        with_cp=self.with_cp,
+                        # In mmdetection, `with_expand_conv` is set to align
+                        # the logic of InvertedResidual with mmcls.
+                        with_expand_conv=(mid_channels != self.in_channels)))
+                self.in_channels = out_channels
+                block_idx += 1
+            self.layers.append(Sequential(*layer))
+
+    def forward(self, x):
+        outs = []
+        # import pdb;pdb.set_trace()
+        for i, layer in enumerate(self.layers):
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        return tuple(outs)
+
+    def _freeze_stages(self):
+        for i in range(self.frozen_stages):
+            m = self.layers[i]
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        super(EfficientNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eval()
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/swin.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/swin.py
new file mode 100644
index 0000000000000000000000000000000000000000..166950599a35a5ea92cfb1f3aeaacea527929e0f
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/swin.py
@@ -0,0 +1,825 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_norm_layer, trunc_normal_init
+from mmcv.cnn.bricks.transformer import FFN, build_dropout
+from mmcv.cnn.utils.weight_init import constant_init
+from mmcv.runner import _load_checkpoint
+from mmcv.runner.base_module import BaseModule, ModuleList
+from torch.nn.modules.linear import Linear
+from torch.nn.modules.normalization import LayerNorm
+from torch.nn.modules.utils import _pair as to_2tuple
+import torch.utils.checkpoint as checkpoint
+
+from mmseg.ops import resize
+from mmdet3d.utils import get_root_logger
+from mmdet.models.builder import BACKBONES
+from mmcv.cnn.bricks.registry import ATTENTION
+from ..utils import PatchEmbed, swin_convert
+
+
+class PatchMerging(BaseModule):
+    """Merge patch feature map.
+
+    This layer use nn.Unfold to group feature map by kernel_size, and use norm
+    and linear layer to embed grouped feature map.
+    Args:
+        in_channels (int): The num of input channels.
+        out_channels (int): The num of output channels.
+        stride (int | tuple): the stride of the sliding length in the
+            unfold layer. Defaults: 2. (Default to be equal with kernel_size).
+        bias (bool, optional): Whether to add bias in linear layer or not.
+            Defaults: False.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Defaults: dict(type='LN').
+        init_cfg (dict, optional): The extra config for initialization.
+            Defaults: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride=2,
+                 bias=False,
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = stride
+
+        self.sampler = nn.Unfold(
+            kernel_size=stride, dilation=1, padding=0, stride=stride)
+
+        sample_dim = stride**2 * in_channels
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+        else:
+            self.norm = None
+
+        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+    def forward(self, x, hw_shape):
+        """
+        x: x.shape -> [B, H*W, C]
+        hw_shape: (H, W)
+        """
+        B, L, C = x.shape
+        H, W = hw_shape
+        assert L == H * W, 'input feature has wrong size'
+
+        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
+
+        # stride is fixed to be equal to kernel_size.
+        if (H % self.stride != 0) or (W % self.stride != 0):
+            x = F.pad(x, (0, W % self.stride, 0, H % self.stride))
+
+        # Use nn.Unfold to merge patch. About 25% faster than original method,
+        # but need to modify pretrained model for compatibility
+        x = self.sampler(x)  # B, 4*C, H/2*W/2
+        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
+
+        x = self.norm(x) if self.norm else x
+        x = self.reduction(x)
+
+        down_hw_shape = (H + 1) // 2, (W + 1) // 2
+        return x, down_hw_shape
+
+
+@ATTENTION.register_module()
+class WindowMSA(BaseModule):
+    """Window based multi-head self-attention (W-MSA) module with relative
+    position bias.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to q, k, v.
+            Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight.
+            Default: 0.0
+        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.0
+        init_cfg (dict | None, optional): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 window_size,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop_rate=0.,
+                 proj_drop_rate=0.,
+                 init_cfg=None):
+
+        super().__init__()
+        self.embed_dims = embed_dims
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_embed_dims = embed_dims // num_heads
+        self.scale = qk_scale or head_embed_dims**-0.5
+        self.init_cfg = init_cfg
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                        num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # About 2x faster than original impl
+        Wh, Ww = self.window_size
+        rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
+        rel_position_index = rel_index_coords + rel_index_coords.T
+        rel_position_index = rel_position_index.flip(1).contiguous()
+        self.register_buffer('relative_position_index', rel_position_index)
+
+        self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop_rate)
+        self.proj = nn.Linear(embed_dims, embed_dims)
+        self.proj_drop = nn.Dropout(proj_drop_rate)
+
+        self.softmax = nn.Softmax(dim=-1)
+
+    def init_weights(self):
+        trunc_normal_init(self.relative_position_bias_table, std=0.02)
+
+    def forward(self, x, mask=None):
+        """
+        Args:
+
+            x (tensor): input features with shape of (num_windows*B, N, C)
+            mask (tensor | None, Optional): mask with shape of (num_windows,
+                Wh*Ww, Wh*Ww), value should be between (-inf, 0].
+        """
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[
+            2]  # make torchscript happy (cannot use tensor as tuple)
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1],
+                self.window_size[0] * self.window_size[1],
+                -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(
+            2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            attn = attn.view(B // nW, nW, self.num_heads, N,
+                             N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    @staticmethod
+    def double_step_seq(step1, len1, step2, len2):
+        seq1 = torch.arange(0, step1 * len1, step1)
+        seq2 = torch.arange(0, step2 * len2, step2)
+        return (seq1[:, None] + seq2[None, :]).reshape(1, -1)
+
+
+@ATTENTION.register_module()
+class ShiftWindowMSA(BaseModule):
+    """Shift Window Multihead Self-Attention Module.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): The height and width of the window.
+        shift_size (int, optional): The shift step of each window towards
+            right-bottom. If zero, act as regular window-msa. Defaults to 0.
+        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
+            Default: True
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Defaults: None.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight.
+            Defaults: 0.
+        proj_drop_rate (float, optional): Dropout ratio of output.
+            Defaults: 0.
+        dropout_layer (dict, optional): The dropout_layer used before output.
+            Defaults: dict(type='DropPath', drop_prob=0.).
+        init_cfg (dict, optional): The extra config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 window_size,
+                 shift_size=0,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop_rate=0,
+                 proj_drop_rate=0,
+                 dropout_layer=dict(type='DropPath', drop_prob=0.),
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        self.window_size = window_size
+        self.shift_size = shift_size
+        assert 0 <= self.shift_size < self.window_size
+
+        self.w_msa = WindowMSA(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            window_size=to_2tuple(window_size),
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop_rate=attn_drop_rate,
+            proj_drop_rate=proj_drop_rate,
+            init_cfg=None)
+
+        self.drop = build_dropout(dropout_layer)
+
+    def forward(self, query, hw_shape):
+        B, L, C = query.shape
+        H, W = hw_shape
+        assert L == H * W, 'input feature has wrong size'
+        query = query.view(B, H, W, C)
+
+        # pad feature maps to multiples of window size
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b))
+        H_pad, W_pad = query.shape[1], query.shape[2]
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_query = torch.roll(
+                query,
+                shifts=(-self.shift_size, -self.shift_size),
+                dims=(1, 2))
+
+            # calculate attention mask for SW-MSA
+            img_mask = torch.zeros((1, H_pad, W_pad, 1),
+                                   device=query.device)  # 1 H W 1
+            h_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size,
+                              -self.shift_size), slice(-self.shift_size, None))
+            w_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size,
+                              -self.shift_size), slice(-self.shift_size, None))
+            # w_slices = (slice(0, -self.window_size),
+            #             slice(-self.window_size, None))
+            cnt = 0
+            for h in h_slices:
+                for w in w_slices:
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+
+            # nW, window_size, window_size, 1
+            mask_windows = self.window_partition(img_mask)
+            mask_windows = mask_windows.view(
+                -1, self.window_size * self.window_size)
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = attn_mask.masked_fill(attn_mask != 0,
+                                              float(-100.0)).masked_fill(
+                                                  attn_mask == 0, float(0.0))
+        else:
+            shifted_query = query
+            attn_mask = None
+
+        # nW*B, window_size, window_size, C
+        query_windows = self.window_partition(shifted_query)
+        # nW*B, window_size*window_size, C
+        query_windows = query_windows.view(-1, self.window_size**2, C)
+
+        # W-MSA/SW-MSA (nW*B, window_size*window_size, C)
+        attn_windows = self.w_msa(query_windows, mask=attn_mask)
+
+        # merge windows
+        attn_windows = attn_windows.view(-1, self.window_size,
+                                         self.window_size, C)
+
+        # B H' W' C
+        shifted_x = self.window_reverse(attn_windows, H_pad, W_pad)
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = torch.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                dims=(1, 2))
+        else:
+            x = shifted_x
+
+        if pad_r > 0 or pad_b:
+            x = x[:, :H, :W, :].contiguous()
+
+        x = x.view(B, H * W, C)
+
+        x = self.drop(x)
+        return x
+
+    def window_reverse(self, windows, H, W):
+        """
+        Args:
+            windows: (num_windows*B, window_size, window_size, C)
+            window_size (int): Window size
+            H (int): Height of image
+            W (int): Width of image
+        Returns:
+            x: (B, H, W, C)
+        """
+        window_size = self.window_size
+        B = int(windows.shape[0] / (H * W / window_size / window_size))
+        x = windows.view(B, H // window_size, W // window_size, window_size,
+                         window_size, -1)
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+        return x
+
+    def window_partition(self, x):
+        """
+        Args:
+            x: (B, H, W, C)
+            window_size (int): window size
+        Returns:
+            windows: (num_windows*B, window_size, window_size, C)
+        """
+        B, H, W, C = x.shape
+        window_size = self.window_size
+        x = x.view(B, H // window_size, window_size, W // window_size,
+                   window_size, C)
+        windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
+        windows = windows.view(-1, window_size, window_size, C)
+        return windows
+
+
+class SwinBlock(BaseModule):
+    """"
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        window size (int, optional): The local window scale. Default: 7.
+        shift (bool): whether to shift window or not. Default False.
+        qkv_bias (int, optional): enable bias for qkv if True. Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        drop_rate (float, optional): Dropout rate. Default: 0.
+        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
+        drop_path_rate (float, optional): Stochastic depth rate. Default: 0.2.
+        act_cfg (dict, optional): The config dict of activation function.
+            Default: dict(type='GELU').
+        norm_cfg (dict, optional): The config dict of nomalization.
+            Default: dict(type='LN').
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 window_size=7,
+                 shift=False,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None):
+
+        super(SwinBlock, self).__init__()
+
+        self.init_cfg = init_cfg
+
+        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
+        self.attn = ShiftWindowMSA(
+            embed_dims=embed_dims,
+            num_heads=num_heads,
+            window_size=window_size,
+            shift_size=window_size // 2 if shift else 0,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop_rate=attn_drop_rate,
+            proj_drop_rate=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            init_cfg=None)
+
+        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
+        self.ffn = FFN(
+            embed_dims=embed_dims,
+            feedforward_channels=feedforward_channels,
+            num_fcs=2,
+            ffn_drop=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            act_cfg=act_cfg,
+            add_identity=True,
+            init_cfg=None)
+        self.hw_shape = None
+
+    def forward(self, x):
+        hw_shape = self.hw_shape
+        identity = x
+        x = self.norm1(x)
+        x = self.attn(x, hw_shape)
+
+        x = x + identity
+
+        identity = x
+        x = self.norm2(x)
+        x = self.ffn(x, identity=identity)
+
+        return x
+
+
+class SwinBlockSequence(BaseModule):
+    """Implements one stage in Swin Transformer.
+
+    Args:
+        embed_dims (int): The feature dimension.
+        num_heads (int): Parallel attention heads.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        depth (int): The number of blocks in this stage.
+        window size (int): The local window scale. Default: 7.
+        qkv_bias (int): enable bias for qkv if True. Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        drop_rate (float, optional): Dropout rate. Default: 0.
+        attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
+        drop_path_rate (float, optional): Stochastic depth rate. Default: 0.2.
+        downsample (BaseModule | None, optional): The downsample operation
+            module. Default: None.
+        act_cfg (dict, optional): The config dict of activation function.
+            Default: dict(type='GELU').
+        norm_cfg (dict, optional): The config dict of nomalization.
+            Default: dict(type='LN').
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 depth,
+                 window_size=7,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 downsample=None,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 init_cfg=None,
+                 with_cp=True):
+        super().__init__()
+
+        self.init_cfg = init_cfg
+
+        drop_path_rate = drop_path_rate if isinstance(
+            drop_path_rate,
+            list) else [deepcopy(drop_path_rate) for _ in range(depth)]
+
+        self.blocks = ModuleList()
+        for i in range(depth):
+            block = SwinBlock(
+                embed_dims=embed_dims,
+                num_heads=num_heads,
+                feedforward_channels=feedforward_channels,
+                window_size=window_size,
+                shift=False if i % 2 == 0 else True,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path_rate=drop_path_rate[i],
+                act_cfg=act_cfg,
+                norm_cfg=norm_cfg,
+                init_cfg=None)
+            self.blocks.append(block)
+
+        self.downsample = downsample
+        self.with_cp = with_cp
+
+    def forward(self, x, hw_shape):
+        for block in self.blocks:
+            block.hw_shape=hw_shape
+            if self.with_cp:
+                x = checkpoint.checkpoint(block, x)
+            else:
+                x = block(x)
+
+        if self.downsample:
+            x_down, down_hw_shape = self.downsample(x, hw_shape)
+            return x_down, down_hw_shape, x, hw_shape
+        else:
+            return x, hw_shape, x, hw_shape
+
+
+@BACKBONES.register_module(force=True)
+class SwinTransformer(BaseModule):
+    """ Swin Transformer
+    A PyTorch implement of : `Swin Transformer:
+    Hierarchical Vision Transformer using Shifted Windows`  -
+        https://arxiv.org/abs/2103.14030
+
+    Inspiration from
+    https://github.com/microsoft/Swin-Transformer
+
+    Args:
+        pretrain_img_size (int | tuple[int]): The size of input image when
+            pretrain. Defaults: 224.
+        in_channels (int): The num of input channels.
+            Defaults: 3.
+        embed_dims (int): The feature dimension. Default: 96.
+        patch_size (int | tuple[int]): Patch size. Default: 4.
+        window_size (int): Window size. Default: 7.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            Default: 4.
+        depths (tuple[int]): Depths of each Swin Transformer stage.
+            Default: (2, 2, 6, 2).
+        num_heads (tuple[int]): Parallel attention heads of each Swin
+            Transformer stage. Default: (3, 6, 12, 24).
+        strides (tuple[int]): The patch merging or patch embedding stride of
+            each Swin Transformer stage. (In swin, we set kernel size equal to
+            stride.) Default: (4, 2, 2, 2).
+        out_indices (tuple[int]): Output from which stages.
+            Default: (0, 1, 2, 3).
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key,
+            value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        patch_norm (bool): If add a norm layer for patch embed and patch
+            merging. Default: True.
+        drop_rate (float): Dropout rate. Defaults: 0.
+        attn_drop_rate (float): Attention dropout rate. Default: 0.
+        drop_path_rate (float): Stochastic depth rate. Defaults: 0.1.
+        use_abs_pos_embed (bool): If True, add absolute position embedding to
+            the patch embedding. Defaults: False.
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='LN').
+        norm_cfg (dict): Config dict for normalization layer at
+            output of backone. Defaults: dict(type='LN').
+        pretrain_style (str): Choose to use official or mmcls pretrain weights.
+            Default: official.
+        pretrained (str, optional): model pretrained path. Default: None.
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 in_channels=3,
+                 embed_dims=96,
+                 patch_size=4,
+                 window_size=7,
+                 mlp_ratio=4,
+                 depths=(2, 2, 6, 2),
+                 num_heads=(3, 6, 12, 24),
+                 strides=(4, 2, 2, 2),
+                 out_indices=(0, 1, 2, 3),
+                 qkv_bias=True,
+                 qk_scale=None,
+                 patch_norm=True,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.1,
+                 use_abs_pos_embed=False,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 pretrain_style='official',
+                 pretrained=None,
+                 init_cfg=None,
+                 with_cp=True,
+                 output_missing_index_as_none=False,
+                 frozen_stages=-1):
+        super(SwinTransformer, self).__init__()
+
+        if isinstance(pretrain_img_size, int):
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+        elif isinstance(pretrain_img_size, tuple):
+            if len(pretrain_img_size) == 1:
+                pretrain_img_size = to_2tuple(pretrain_img_size[0])
+            assert len(pretrain_img_size) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pretrain_img_size)}'
+
+        assert pretrain_style in ['official', 'mmcls'], 'We only support load '
+        'official ckpt and mmcls ckpt.'
+
+        if isinstance(pretrained, str) or pretrained is None:
+            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
+                          'please use "init_cfg" instead')
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        num_layers = len(depths)
+        self.out_indices = out_indices
+        self.use_abs_pos_embed = use_abs_pos_embed
+        self.pretrain_style = pretrain_style
+        self.pretrained = pretrained
+        self.init_cfg = init_cfg
+
+        self.frozen_stages = frozen_stages
+
+        assert strides[0] == patch_size, 'Use non-overlapping patch embed.'
+
+        self.patch_embed = PatchEmbed(
+            in_channels=in_channels,
+            embed_dims=embed_dims,
+            conv_type='Conv2d',
+            kernel_size=patch_size,
+            stride=strides[0],
+            pad_to_patch_size=True,
+            norm_cfg=norm_cfg if patch_norm else None,
+            init_cfg=None)
+
+        if self.use_abs_pos_embed:
+            patch_row = pretrain_img_size[0] // patch_size
+            patch_col = pretrain_img_size[1] // patch_size
+            num_patches = patch_row * patch_col
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros((1, num_patches, embed_dims)))
+
+        self.drop_after_pos = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        total_depth = sum(depths)
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
+        ]  # stochastic depth decay rule
+
+        self.stages = ModuleList()
+        in_channels = embed_dims
+        for i in range(num_layers):
+            if i < num_layers - 1:
+                downsample = PatchMerging(
+                    in_channels=in_channels,
+                    out_channels=2 * in_channels,
+                    stride=strides[i + 1],
+                    norm_cfg=norm_cfg if patch_norm else None,
+                    init_cfg=None)
+            else:
+                downsample = None
+
+            stage = SwinBlockSequence(
+                embed_dims=in_channels,
+                num_heads=num_heads[i],
+                feedforward_channels=mlp_ratio * in_channels,
+                depth=depths[i],
+                window_size=window_size,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path_rate=dpr[:depths[i]],
+                downsample=downsample,
+                act_cfg=act_cfg,
+                norm_cfg=norm_cfg,
+                init_cfg=None,
+                with_cp=with_cp)
+            self.stages.append(stage)
+
+            dpr = dpr[depths[i]:]
+            if downsample:
+                in_channels = downsample.out_channels
+
+        self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)]
+        # Add a norm layer for each output
+        for i in out_indices:
+            layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
+            layer_name = f'norm{i}'
+            self.add_module(layer_name, layer)
+        self.output_missing_index_as_none = output_missing_index_as_none
+
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+
+        if self.frozen_stages >= 1 and self.use_abs_pos_embed:
+            self.absolute_pos_embed.requires_grad = False
+
+        if self.frozen_stages >= 2:
+            self.drop_after_pos.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.stages[i]
+                m.eval()
+                for param in m.parameters():
+                    param.requires_grad = False
+
+    def init_weights(self):
+        if self.pretrained is None:
+            super().init_weights()
+            if self.use_abs_pos_embed:
+                trunc_normal_init(self.absolute_pos_embed, std=0.02)
+            for m in self.modules():
+                if isinstance(m, Linear):
+                    trunc_normal_init(m.weight, std=.02)
+                    if m.bias is not None:
+                        constant_init(m.bias, 0)
+                elif isinstance(m, LayerNorm):
+                    constant_init(m.bias, 0)
+                    constant_init(m.weight, 1.0)
+        elif isinstance(self.pretrained, str):
+            logger = get_root_logger()
+            ckpt = _load_checkpoint(
+                self.pretrained, logger=logger, map_location='cpu')
+            if 'state_dict' in ckpt:
+                state_dict = ckpt['state_dict']
+            elif 'model' in ckpt:
+                state_dict = ckpt['model']
+            else:
+                state_dict = ckpt
+
+            if self.pretrain_style == 'official':
+                state_dict = swin_convert(state_dict)
+
+            # strip prefix of state_dict
+            if list(state_dict.keys())[0].startswith('module.'):
+                state_dict = {k[7:]: v for k, v in state_dict.items()}
+
+            # reshape absolute position embedding
+            if state_dict.get('absolute_pos_embed') is not None:
+                absolute_pos_embed = state_dict['absolute_pos_embed']
+                N1, L, C1 = absolute_pos_embed.size()
+                N2, C2, H, W = self.absolute_pos_embed.size()
+                if N1 != N2 or C1 != C2 or L != H * W:
+                    logger.warning('Error in loading absolute_pos_embed, pass')
+                else:
+                    state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
+                        N2, H, W, C2).permute(0, 3, 1, 2).contiguous()
+
+            # interpolate position bias table if needed
+            relative_position_bias_table_keys = [
+                k for k in state_dict.keys()
+                if 'relative_position_bias_table' in k
+            ]
+            for table_key in relative_position_bias_table_keys:
+                table_pretrained = state_dict[table_key]
+                table_current = self.state_dict()[table_key]
+                L1, nH1 = table_pretrained.size()
+                L2, nH2 = table_current.size()
+                if nH1 != nH2:
+                    logger.warning(f'Error in loading {table_key}, pass')
+                else:
+                    if L1 != L2:
+                        S1 = int(L1**0.5)
+                        S2 = int(L2**0.5)
+                        table_pretrained_resized = resize(
+                            table_pretrained.permute(1, 0).reshape(
+                                1, nH1, S1, S1),
+                            size=(S2, S2),
+                            mode='bicubic')
+                        state_dict[table_key] = table_pretrained_resized.view(
+                            nH2, L2).permute(1, 0).contiguous()
+
+            # load state_dict
+            self.load_state_dict(state_dict, False)
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+
+        hw_shape = (self.patch_embed.DH, self.patch_embed.DW)
+        if self.use_abs_pos_embed:
+            x = x + self.absolute_pos_embed
+        x = self.drop_after_pos(x)
+
+        outs = []
+        for i, stage in enumerate(self.stages):
+            x, hw_shape, out, out_hw_shape = stage(x, hw_shape)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                out = norm_layer(out)
+                out = out.view(-1, *out_hw_shape,
+                               self.num_features[i]).permute(0, 3, 1,
+                                                             2).contiguous()
+                outs.append(out)
+            elif self.output_missing_index_as_none:
+                outs.append(None)
+        return outs
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        freezed."""
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/vovnet.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/vovnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..879d186a37b49addaf27362cc6ae1e5465b2168e
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/backbones/vovnet.py
@@ -0,0 +1,375 @@
+
+from collections import OrderedDict
+from mmcv.runner import BaseModule
+from mmdet.models.builder import BACKBONES
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.batchnorm import _BatchNorm
+
+
+VoVNet19_slim_dw_eSE = {
+    'stem': [64, 64, 64],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": True
+}
+
+VoVNet19_dw_eSE = {
+    'stem': [64, 64, 64],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": True
+}
+
+VoVNet19_slim_eSE = {
+    'stem': [64, 64, 128],
+    'stage_conv_ch': [64, 80, 96, 112],
+    'stage_out_ch': [112, 256, 384, 512],
+    'layer_per_block': 3,
+    'block_per_stage': [1, 1, 1, 1],
+    'eSE': True,
+    "dw": False
+}
+
+VoVNet19_eSE = {
+    'stem': [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": False
+}
+
+VoVNet39_eSE = {
+    'stem': [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 1, 2, 2],
+    "eSE": True,
+    "dw": False
+}
+
+VoVNet57_eSE = {
+    'stem': [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 1, 4, 3],
+    "eSE": True,
+    "dw": False
+}
+
+VoVNet99_eSE = {
+    'stem': [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 3, 9, 3],
+    "eSE": True,
+    "dw": False
+}
+
+_STAGE_SPECS = {
+    "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE,
+    "V-19-dw-eSE": VoVNet19_dw_eSE,
+    "V-19-slim-eSE": VoVNet19_slim_eSE,
+    "V-19-eSE": VoVNet19_eSE,
+    "V-39-eSE": VoVNet39_eSE,
+    "V-57-eSE": VoVNet57_eSE,
+    "V-99-eSE": VoVNet99_eSE,
+}
+
+
+def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1):
+    """3x3 convolution with padding"""
+    return [
+        (
+            '{}_{}/dw_conv3x3'.format(module_name, postfix),
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=out_channels,
+                bias=False
+            )
+        ),
+        (
+            '{}_{}/pw_conv1x1'.format(module_name, postfix),
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False)
+        ),
+        ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)),
+        ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)),
+    ]
+
+
+def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1):
+    """3x3 convolution with padding"""
+    return [
+        (
+            f"{module_name}_{postfix}/conv",
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+        (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+    ]
+
+
+def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0):
+    """1x1 convolution with padding"""
+    return [
+        (
+            f"{module_name}_{postfix}/conv",
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+        (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+    ]
+
+
+class Hsigmoid(nn.Module):
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+
+
+class eSEModule(nn.Module):
+    def __init__(self, channel, reduction=4):
+        super(eSEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+        self.hsigmoid = Hsigmoid()
+
+    def forward(self, x):
+        input = x
+        x = self.avg_pool(x)
+        x = self.fc(x)
+        x = self.hsigmoid(x)
+        return input * x
+
+
+class _OSA_module(nn.Module):
+    def __init__(
+        self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False
+    ):
+
+        super(_OSA_module, self).__init__()
+
+        self.identity = identity
+        self.depthwise = depthwise
+        self.isReduced = False
+        self.layers = nn.ModuleList()
+        in_channel = in_ch
+        if self.depthwise and in_channel != stage_ch:
+            self.isReduced = True
+            self.conv_reduction = nn.Sequential(
+                OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0"))
+            )
+        for i in range(layer_per_block):
+            if self.depthwise:
+                self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i))))
+            else:
+                self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i))))
+            in_channel = stage_ch
+
+        # feature aggregation
+        in_channel = in_ch + layer_per_block * stage_ch
+        self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat")))
+
+        self.ese = eSEModule(concat_ch)
+
+    def forward(self, x):
+
+        identity_feat = x
+
+        output = []
+        output.append(x)
+        if self.depthwise and self.isReduced:
+            x = self.conv_reduction(x)
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+
+        x = torch.cat(output, dim=1)
+        xt = self.concat(x)
+
+        xt = self.ese(xt)
+
+        if self.identity:
+            xt = xt + identity_feat
+
+        return xt
+
+
+class _OSA_stage(nn.Sequential):
+    def __init__(
+        self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False
+    ):
+
+        super(_OSA_stage, self).__init__()
+
+        if not stage_num == 2:
+            self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))
+
+        if block_per_stage != 1:
+            SE = False
+        module_name = f"OSA{stage_num}_1"
+        self.add_module(
+            module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise)
+        )
+        for i in range(block_per_stage - 1):
+            if i != block_per_stage - 2:  # last block
+                SE = False
+            module_name = f"OSA{stage_num}_{i + 2}"
+            self.add_module(
+                module_name,
+                _OSA_module(
+                    concat_ch,
+                    stage_ch,
+                    concat_ch,
+                    layer_per_block,
+                    module_name,
+                    SE,
+                    identity=True,
+                    depthwise=depthwise
+                ),
+            )
+
+
+@BACKBONES.register_module()
+class VoVNet(BaseModule):
+    def __init__(self, spec_name, input_ch=3, out_features=None, 
+                 frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None):
+        """
+        Args:
+            input_ch(int) : the number of input channel
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "stage2" ...
+        """
+        super(VoVNet, self).__init__(init_cfg)
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        stage_specs = _STAGE_SPECS[spec_name]
+
+        stem_ch = stage_specs["stem"]
+        config_stage_ch = stage_specs["stage_conv_ch"]
+        config_concat_ch = stage_specs["stage_out_ch"]
+        block_per_stage = stage_specs["block_per_stage"]
+        layer_per_block = stage_specs["layer_per_block"]
+        SE = stage_specs["eSE"]
+        depthwise = stage_specs["dw"]
+
+        self._out_features = out_features
+
+        # Stem module
+        conv_type = dw_conv3x3 if depthwise else conv3x3
+        stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2)
+        stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1)
+        stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2)
+        self.add_module("stem", nn.Sequential((OrderedDict(stem))))
+        current_stirde = 4
+        self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde}
+        self._out_feature_channels = {"stem": stem_ch[2]}
+
+        stem_out_ch = [stem_ch[2]]
+        in_ch_list = stem_out_ch + config_concat_ch[:-1]
+        # OSA stages
+        self.stage_names = []
+        for i in range(4):  # num_stages
+            name = "stage%d" % (i + 2)  # stage 2 ... stage 5
+            self.stage_names.append(name)
+            self.add_module(
+                name,
+                _OSA_stage(
+                    in_ch_list[i],
+                    config_stage_ch[i],
+                    config_concat_ch[i],
+                    block_per_stage[i],
+                    layer_per_block,
+                    i + 2,
+                    SE,
+                    depthwise,
+                ),
+            )
+
+            self._out_feature_channels[name] = config_concat_ch[i]
+            if not i == 0:
+                self._out_feature_strides[name] = current_stirde = int(current_stirde * 2)
+
+        # initialize weights
+        # self._initialize_weights()
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+
+    def forward(self, x):
+        outputs = {}
+        x = self.stem(x)
+        if "stem" in self._out_features:
+            outputs["stem"] = x
+        for name in self.stage_names:
+            x = getattr(self, name)(x)
+            if name in self._out_features:
+                outputs[name] = x
+
+        return outputs
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            m = getattr(self, 'stem')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'stage{i+1}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keep normalization layer
+        freezed."""
+        super(VoVNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval have effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/hooks/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/hooks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..93b13c9c853d6f7eece8ae2dc7aa67d4e87db68b
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/hooks/__init__.py
@@ -0,0 +1 @@
+from .hooks import GradChecker
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/hooks/hooks.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/hooks/hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..56ff7fd575c890e60ce49eb618df157b2cc2ca37
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/hooks/hooks.py
@@ -0,0 +1,13 @@
+from mmcv.runner.hooks.hook import HOOKS, Hook
+from projects.mmdet3d_plugin.models.utils import run_time
+
+
+@HOOKS.register_module()
+class GradChecker(Hook):
+
+    def after_train_iter(self, runner):
+        for key, val in runner.model.named_parameters():
+            if val.grad == None and val.requires_grad:
+                print('WARNNING: {key}\'s parameters are not be used!!!!'.format(key=key))
+
+
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/opt/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/opt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7dd426868a61772bbe0926e435ce89f15009805
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/opt/__init__.py
@@ -0,0 +1 @@
+from .adamw import AdamW2
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/opt/adamw.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/opt/adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..c890aeaf04721580c11ca329f2be09a6a280f773
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/opt/adamw.py
@@ -0,0 +1,131 @@
+try:
+    from torch.optim import _functional as F
+except:
+    print('WARNING!!!, I recommend using torch>=1.8')
+
+import torch
+from torch.optim.optimizer import Optimizer
+from mmcv.runner.optimizer.builder import OPTIMIZERS
+
+@OPTIMIZERS.register_module()
+class AdamW2(Optimizer):
+    r"""Implements AdamW algorithm. Solve the bug of torch 1.8
+
+    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
+    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
+
+    Args:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        weight_decay (float, optional): weight decay coefficient (default: 1e-2)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+            (default: False)
+
+    .. _Adam\: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _Decoupled Weight Decay Regularization:
+        https://arxiv.org/abs/1711.05101
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
+                 weight_decay=1e-2, amsgrad=False):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay, amsgrad=amsgrad)
+        super(AdamW2, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(AdamW2, self).__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('amsgrad', False)
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            params_with_grad = []
+            grads = []
+            exp_avgs = []
+            exp_avg_sqs = []
+            state_sums = []
+            max_exp_avg_sqs = []
+            state_steps = []
+            amsgrad = group['amsgrad']
+
+            # put this line here for solving bug
+            beta1, beta2 = group['betas']
+
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError('AdamW does not support sparse gradients')
+                grads.append(p.grad)
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    if amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+
+                exp_avgs.append(state['exp_avg'])
+                exp_avg_sqs.append(state['exp_avg_sq'])
+
+                if amsgrad:
+                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+
+
+                # update the steps for each param group update
+                state['step'] += 1
+                # record the step after step update
+                state_steps.append(state['step'])
+
+            F.adamw(params_with_grad,
+                    grads,
+                    exp_avgs,
+                    exp_avg_sqs,
+                    max_exp_avg_sqs,
+                    state_steps,
+                    amsgrad,
+                    beta1,
+                    beta2,
+                    group['lr'],
+                    group['weight_decay'],
+                    group['eps'])
+
+        return loss
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/__init__.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..921aee7c215a2e2546e943589a85949c2eee7b05
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/__init__.py
@@ -0,0 +1,10 @@
+
+from .bricks import run_time
+from .grid_mask import GridMask
+from .position_embedding import RelPositionEmbedding
+from .visual import save_tensor
+from .inverted_residual import InvertedResidual
+from .se_layer import DyReLU, SELayer
+from .make_divisible import make_divisible
+from .ckpt_convert import swin_convert, vit_convert
+from .embed import PatchEmbed
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/bricks.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/bricks.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd458813d9ffced23b79799daa84150ba887774e
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/bricks.py
@@ -0,0 +1,20 @@
+import functools
+import time
+from collections import defaultdict
+import torch
+time_maps = defaultdict(lambda :0.)
+count_maps = defaultdict(lambda :0.)
+def run_time(name):
+    def middle(fn):
+        def wrapper(*args, **kwargs):
+            torch.cuda.synchronize()
+            start = time.time()
+            res = fn(*args, **kwargs)
+            torch.cuda.synchronize()
+            time_maps['%s : %s'%(name, fn.__name__) ] += time.time()-start
+            count_maps['%s : %s'%(name, fn.__name__) ] +=1
+            print("%s : %s takes up %f "% (name, fn.__name__,time_maps['%s : %s'%(name, fn.__name__) ] /count_maps['%s : %s'%(name, fn.__name__) ] ))
+            return res
+        return wrapper
+    return middle
+    
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/ckpt_convert.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/ckpt_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd4632065ce2109376bebef36e0532c5115125f8
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/ckpt_convert.py
@@ -0,0 +1,91 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+
+
+def swin_convert(ckpt):
+    new_ckpt = OrderedDict()
+
+    def correct_unfold_reduction_order(x):
+        out_channel, in_channel = x.shape
+        x = x.reshape(out_channel, 4, in_channel // 4)
+        x = x[:, [0, 2, 1, 3], :].transpose(1,
+                                            2).reshape(out_channel, in_channel)
+        return x
+
+    def correct_unfold_norm_order(x):
+        in_channel = x.shape[0]
+        x = x.reshape(4, in_channel // 4)
+        x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
+        return x
+
+    for k, v in ckpt.items():
+        if k.startswith('head'):
+            continue
+        elif k.startswith('layers'):
+            new_v = v
+            if 'attn.' in k:
+                new_k = k.replace('attn.', 'attn.w_msa.')
+            elif 'mlp.' in k:
+                if 'mlp.fc1.' in k:
+                    new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.')
+                elif 'mlp.fc2.' in k:
+                    new_k = k.replace('mlp.fc2.', 'ffn.layers.1.')
+                else:
+                    new_k = k.replace('mlp.', 'ffn.')
+            elif 'downsample' in k:
+                new_k = k
+                if 'reduction.' in k:
+                    new_v = correct_unfold_reduction_order(v)
+                elif 'norm.' in k:
+                    new_v = correct_unfold_norm_order(v)
+            else:
+                new_k = k
+            new_k = new_k.replace('layers', 'stages', 1)
+        elif k.startswith('patch_embed'):
+            new_v = v
+            if 'proj' in k:
+                new_k = k.replace('proj', 'projection')
+            else:
+                new_k = k
+        else:
+            new_v = v
+            new_k = k
+
+        new_ckpt[new_k] = new_v
+
+    return new_ckpt
+
+
+def vit_convert(ckpt):
+
+    new_ckpt = OrderedDict()
+
+    for k, v in ckpt.items():
+        if k.startswith('head'):
+            continue
+        if k.startswith('norm'):
+            new_k = k.replace('norm.', 'ln1.')
+        elif k.startswith('patch_embed'):
+            if 'proj' in k:
+                new_k = k.replace('proj', 'projection')
+            else:
+                new_k = k
+        elif k.startswith('blocks'):
+            if 'norm' in k:
+                new_k = k.replace('norm', 'ln')
+            elif 'mlp.fc1' in k:
+                new_k = k.replace('mlp.fc1', 'ffn.layers.0.0')
+            elif 'mlp.fc2' in k:
+                new_k = k.replace('mlp.fc2', 'ffn.layers.1')
+            elif 'attn.qkv' in k:
+                new_k = k.replace('attn.qkv.', 'attn.attn.in_proj_')
+            elif 'attn.proj' in k:
+                new_k = k.replace('attn.proj', 'attn.attn.out_proj')
+            else:
+                new_k = k
+            new_k = new_k.replace('blocks.', 'layers.')
+        else:
+            new_k = k
+        new_ckpt[new_k] = v
+
+    return new_ckpt
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/embed.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0cf143488eafcd1dcd9e80f824807aa47de34fd
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/embed.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn.functional as F
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner.base_module import BaseModule
+from torch.nn.modules.utils import _pair as to_2tuple
+
+
+# Modified from Pytorch-Image-Models
+class PatchEmbed(BaseModule):
+    """Image to Patch Embedding V2.
+
+    We use a conv layer to implement PatchEmbed.
+    Args:
+        in_channels (int): The num of input channels. Default: 3
+        embed_dims (int): The dimensions of embedding. Default: 768
+        conv_type (dict, optional): The config dict for conv layers type
+            selection. Default: None.
+        kernel_size (int): The kernel_size of embedding conv. Default: 16.
+        stride (int): The slide stride of embedding conv.
+            Default: None (Default to be equal with kernel_size).
+        padding (int): The padding length of embedding conv. Default: 0.
+        dilation (int): The dilation rate of embedding conv. Default: 1.
+        pad_to_patch_size (bool, optional): Whether to pad feature map shape
+            to multiple patch size. Default: True.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+        init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels=3,
+                 embed_dims=768,
+                 conv_type=None,
+                 kernel_size=16,
+                 stride=16,
+                 padding=0,
+                 dilation=1,
+                 pad_to_patch_size=True,
+                 norm_cfg=None,
+                 init_cfg=None):
+        super(PatchEmbed, self).__init__()
+
+        self.embed_dims = embed_dims
+        self.init_cfg = init_cfg
+
+        if stride is None:
+            stride = kernel_size
+
+        self.pad_to_patch_size = pad_to_patch_size
+
+        # The default setting of patch size is equal to kernel size.
+        patch_size = kernel_size
+        if isinstance(patch_size, int):
+            patch_size = to_2tuple(patch_size)
+        elif isinstance(patch_size, tuple):
+            if len(patch_size) == 1:
+                patch_size = to_2tuple(patch_size[0])
+            assert len(patch_size) == 2, \
+                f'The size of patch should have length 1 or 2, ' \
+                f'but got {len(patch_size)}'
+
+        self.patch_size = patch_size
+
+        # Use conv layer to embed
+        conv_type = conv_type or 'Conv2d'
+        self.projection = build_conv_layer(
+            dict(type=conv_type),
+            in_channels=in_channels,
+            out_channels=embed_dims,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation)
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        H, W = x.shape[2], x.shape[3]
+
+        # TODO: Process overlapping op
+        if self.pad_to_patch_size:
+            # Modify H, W to multiple of patch size.
+            if H % self.patch_size[0] != 0:
+                x = F.pad(
+                    x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
+            if W % self.patch_size[1] != 0:
+                x = F.pad(
+                    x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0))
+
+        x = self.projection(x)
+        self.DH, self.DW = x.shape[2], x.shape[3]
+        x = x.flatten(2).transpose(1, 2)
+
+        if self.norm is not None:
+            x = self.norm(x)
+
+        return x
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/grid_mask.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/grid_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d04b2c470a24b55fd5a60ca6c679fa9710bc1a7
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/grid_mask.py
@@ -0,0 +1,124 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from PIL import Image
+from mmcv.runner import force_fp32, auto_fp16
+
+class Grid(object):
+    def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.):
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode=mode
+        self.st_prob = prob
+        self.prob = prob
+
+    def set_prob(self, epoch, max_epoch):
+        self.prob = self.st_prob * epoch / max_epoch
+
+    def __call__(self, img, label):
+        if np.random.rand() > self.prob:
+            return img, label
+        h = img.size(1)
+        w = img.size(2)
+        self.d1 = 2
+        self.d2 = min(h, w)
+        hh = int(1.5*h)
+        ww = int(1.5*w)
+        d = np.random.randint(self.d1, self.d2)
+        if self.ratio == 1:
+            self.l = np.random.randint(1, d)
+        else:
+            self.l = min(max(int(d*self.ratio+0.5),1),d-1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh//d):
+                s = d*i + st_h
+                t = min(s+self.l, hh)
+                mask[s:t,:] *= 0
+        if self.use_w:
+            for i in range(ww//d):
+                s = d*i + st_w
+                t = min(s+self.l, ww)
+                mask[:,s:t] *= 0
+       
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w]
+
+        mask = torch.from_numpy(mask).float()
+        if self.mode == 1:
+            mask = 1-mask
+
+        mask = mask.expand_as(img)
+        if self.offset:
+            offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float()
+            offset = (1 - mask) * offset
+            img = img * mask + offset
+        else:
+            img = img * mask 
+
+        return img, label
+
+
+class GridMask(nn.Module):
+    def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.):
+        super(GridMask, self).__init__()
+        self.use_h = use_h
+        self.use_w = use_w
+        self.rotate = rotate
+        self.offset = offset
+        self.ratio = ratio
+        self.mode = mode
+        self.st_prob = prob
+        self.prob = prob
+        self.fp16_enable = False
+    def set_prob(self, epoch, max_epoch):
+        self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5
+    @auto_fp16()
+    def forward(self, x):
+        if np.random.rand() > self.prob or not self.training:
+            return x
+        n,c,h,w = x.size()
+        x = x.view(-1,h,w)
+        hh = int(1.5*h)
+        ww = int(1.5*w)
+        d = np.random.randint(2, h)
+        self.l = min(max(int(d*self.ratio+0.5),1),d-1)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        if self.use_h:
+            for i in range(hh//d):
+                s = d*i + st_h
+                t = min(s+self.l, hh)
+                mask[s:t,:] *= 0
+        if self.use_w:
+            for i in range(ww//d):
+                s = d*i + st_w
+                t = min(s+self.l, ww)
+                mask[:,s:t] *= 0
+       
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w]
+
+        mask = torch.from_numpy(mask).to(x.dtype).cuda()
+        if self.mode == 1:
+            mask = 1-mask
+        mask = mask.expand_as(x)
+        if self.offset:
+            offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda()
+            x = x * mask + offset * (1 - mask)
+        else:
+            x = x * mask 
+        
+        return x.view(n,c,h,w)
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/inverted_residual.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/inverted_residual.py
new file mode 100644
index 0000000000000000000000000000000000000000..093c8efe85a24ea26c9fd18d6b9cd14c7ce68779
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/inverted_residual.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import ConvModule
+from mmcv.cnn.bricks import DropPath
+from mmcv.runner import BaseModule
+
+from .se_layer import SELayer
+
+
+class InvertedResidual(BaseModule):
+    """Inverted Residual Block.
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        mid_channels (int): The input channels of the depthwise convolution.
+        kernel_size (int): The kernel size of the depthwise convolution.
+            Default: 3.
+        stride (int): The stride of the depthwise convolution. Default: 1.
+        se_cfg (dict): Config dict for se layer. Default: None, which means no
+            se layer.
+        with_expand_conv (bool): Use expand conv or not. If set False,
+            mid_channels must be the same with in_channels.
+            Default: True.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU').
+        drop_path_rate (float): stochastic depth rate. Defaults to 0.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    Returns:
+        Tensor: The output tensor.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 mid_channels,
+                 kernel_size=3,
+                 stride=1,
+                 se_cfg=None,
+                 with_expand_conv=True,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 drop_path_rate=0.,
+                 with_cp=False,
+                 init_cfg=None):
+        super(InvertedResidual, self).__init__(init_cfg)
+        self.with_res_shortcut = (stride == 1 and in_channels == out_channels)
+        assert stride in [1, 2], f'stride must in [1, 2]. ' \
+            f'But received {stride}.'
+        self.with_cp = with_cp
+        self.drop_path = DropPath(
+            drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.with_se = se_cfg is not None
+        self.with_expand_conv = with_expand_conv
+
+        if self.with_se:
+            assert isinstance(se_cfg, dict)
+        if not self.with_expand_conv:
+            assert mid_channels == in_channels
+
+        if self.with_expand_conv:
+            self.expand_conv = ConvModule(
+                in_channels=in_channels,
+                out_channels=mid_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+        self.depthwise_conv = ConvModule(
+            in_channels=mid_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=kernel_size // 2,
+            groups=mid_channels,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+        if self.with_se:
+            self.se = SELayer(**se_cfg)
+
+        self.linear_conv = ConvModule(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=None)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            out = x
+
+            if self.with_expand_conv:
+                out = self.expand_conv(out)
+
+            out = self.depthwise_conv(out)
+
+            if self.with_se:
+                out = self.se(out)
+
+            out = self.linear_conv(out)
+
+            if self.with_res_shortcut:
+                return x + self.drop_path(out)
+            else:
+                return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/make_divisible.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/make_divisible.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bbc0a6c5b7cef48a9171ef4870201f2aa6cad85
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/make_divisible.py
@@ -0,0 +1,25 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
+    """Make divisible function.
+    This function rounds the channel number to the nearest value that can be
+    divisible by the divisor. It is taken from the original tf repo. It ensures
+    that all layers have a channel number that is divisible by divisor. It can
+    be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py  # noqa
+    Args:
+        value (int): The original channel number.
+        divisor (int): The divisor to fully divide the channel number.
+        min_value (int): The minimum value of the output channel.
+            Default: None, means that the minimum value equal to the divisor.
+        min_ratio (float): The minimum ratio of the rounded channel number to
+            the original channel number. Default: 0.9.
+    Returns:
+        int: The modified output channel number.
+    """
+
+    if min_value is None:
+        min_value = divisor
+    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than (1-min_ratio).
+    if new_value < min_ratio * value:
+        new_value += divisor
+    return new_value
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/position_embedding.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/position_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..290110fef7cb86c5edafb0b33da3bed794e6f7a9
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/position_embedding.py
@@ -0,0 +1,34 @@
+import torch
+import torch.nn as nn
+import math
+
+class RelPositionEmbedding(nn.Module):
+    def __init__(self, num_pos_feats=64, pos_norm=True):
+        super().__init__()
+        self.num_pos_feats = num_pos_feats
+        self.fc = nn.Linear(4, self.num_pos_feats,bias=False)
+        #nn.init.orthogonal_(self.fc.weight)
+        #self.fc.weight.requires_grad = False
+        self.pos_norm = pos_norm
+        if self.pos_norm:
+            self.norm = nn.LayerNorm(self.num_pos_feats)
+    def forward(self, tensor):
+        #mask = nesttensor.mask
+        B,C,H,W = tensor.shape
+        #print('tensor.shape',  tensor.shape)
+        y_range = (torch.arange(H) / float(H - 1)).to(tensor.device)
+        #y_axis = torch.stack((y_range, 1-y_range),dim=1)
+        y_axis = torch.stack((torch.cos(y_range * math.pi), torch.sin(y_range * math.pi)), dim=1)
+        y_axis = y_axis.reshape(H, 1, 2).repeat(1, W, 1).reshape(H * W, 2)
+
+        x_range = (torch.arange(W) / float(W - 1)).to(tensor.device)
+        #x_axis =torch.stack((x_range,1-x_range),dim=1)
+        x_axis = torch.stack((torch.cos(x_range * math.pi), torch.sin(x_range * math.pi)), dim=1)
+        x_axis = x_axis.reshape(1, W, 2).repeat(H, 1, 1).reshape(H * W, 2)
+        x_pos = torch.cat((y_axis, x_axis), dim=1)
+        x_pos = self.fc(x_pos)
+
+        if self.pos_norm:
+            x_pos = self.norm(x_pos)
+        #print('xpos,', x_pos.max(),x_pos.min())
+        return x_pos
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/se_layer.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/se_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d10615f5a8606909af4e6af4163cf97ad73cc1
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/se_layer.py
@@ -0,0 +1,124 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+
+class SELayer(BaseModule):
+    """Squeeze-and-Excitation Module.
+    Args:
+        channels (int): The input (and output) channels of the SE layer.
+        ratio (int): Squeeze ratio in SELayer, the intermediate channel will be
+            ``int(channels/ratio)``. Default: 16.
+        conv_cfg (None or dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        act_cfg (dict or Sequence[dict]): Config dict for activation layer.
+            If act_cfg is a dict, two activation layers will be configurated
+            by this dict. If act_cfg is a sequence of dicts, the first
+            activation layer will be configurated by the first dict and the
+            second activation layer will be configurated by the second dict.
+            Default: (dict(type='ReLU'), dict(type='Sigmoid'))
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 channels,
+                 ratio=16,
+                 conv_cfg=None,
+                 act_cfg=(dict(type='ReLU'), dict(type='Sigmoid')),
+                 init_cfg=None):
+        super(SELayer, self).__init__(init_cfg)
+        if isinstance(act_cfg, dict):
+            act_cfg = (act_cfg, act_cfg)
+        assert len(act_cfg) == 2
+        assert mmcv.is_tuple_of(act_cfg, dict)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = ConvModule(
+            in_channels=channels,
+            out_channels=int(channels / ratio),
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[0])
+        self.conv2 = ConvModule(
+            in_channels=int(channels / ratio),
+            out_channels=channels,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[1])
+
+    def forward(self, x):
+        out = self.global_avgpool(x)
+        out = self.conv1(out)
+        out = self.conv2(out)
+        return x * out
+
+
+class DyReLU(BaseModule):
+    """Dynamic ReLU (DyReLU) module.
+    See `Dynamic ReLU <https://arxiv.org/abs/2003.10027>`_ for details.
+    Current implementation is specialized for task-aware attention in DyHead.
+    HSigmoid arguments in default act_cfg follow DyHead official code.
+    https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py
+    Args:
+        channels (int): The input (and output) channels of DyReLU module.
+        ratio (int): Squeeze ratio in Squeeze-and-Excitation-like module,
+            the intermediate channel will be ``int(channels/ratio)``.
+            Default: 4.
+        conv_cfg (None or dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        act_cfg (dict or Sequence[dict]): Config dict for activation layer.
+            If act_cfg is a dict, two activation layers will be configurated
+            by this dict. If act_cfg is a sequence of dicts, the first
+            activation layer will be configurated by the first dict and the
+            second activation layer will be configurated by the second dict.
+            Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0,
+            divisor=6.0))
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+
+    def __init__(self,
+                 channels,
+                 ratio=4,
+                 conv_cfg=None,
+                 act_cfg=(dict(type='ReLU'),
+                          dict(type='HSigmoid', bias=3.0, divisor=6.0)),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        if isinstance(act_cfg, dict):
+            act_cfg = (act_cfg, act_cfg)
+        assert len(act_cfg) == 2
+        assert mmcv.is_tuple_of(act_cfg, dict)
+        self.channels = channels
+        self.expansion = 4  # for a1, b1, a2, b2
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = ConvModule(
+            in_channels=channels,
+            out_channels=int(channels / ratio),
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[0])
+        self.conv2 = ConvModule(
+            in_channels=int(channels / ratio),
+            out_channels=channels * self.expansion,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[1])
+
+    def forward(self, x):
+        """Forward function."""
+        coeffs = self.global_avgpool(x)
+        coeffs = self.conv1(coeffs)
+        coeffs = self.conv2(coeffs) - 0.5  # value range: [-0.5, 0.5]
+        a1, b1, a2, b2 = torch.split(coeffs, self.channels, dim=1)
+        a1 = a1 * 2.0 + 1.0  # [-1.0, 1.0] + 1.0
+        a2 = a2 * 2.0  # [-1.0, 1.0]
+        out = torch.max(x * a1 + b1, x * a2 + b2)
+        return 
\ No newline at end of file
diff --git a/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/visual.py b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9718afea9e67199c77da8ecf33249a28197082a
--- /dev/null
+++ b/model_examples/MapTR/projects/mmdet3d_plugin/models/utils/visual.py
@@ -0,0 +1,24 @@
+import torch
+from torchvision.utils import make_grid
+import torchvision
+import matplotlib.pyplot as plt
+import cv2
+
+
+def convert_color(img_path):
+    plt.figure()
+    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
+    plt.imsave(img_path, img, cmap=plt.get_cmap('viridis'))
+    plt.close()
+
+
+def save_tensor(tensor, path, pad_value=254.0,):
+    print('save_tensor', path)
+    tensor = tensor.to(torch.float).detach().cpu()
+    if tensor.type() == 'torch.BoolTensor':
+        tensor = tensor*255
+    if len(tensor.shape) == 3:
+        tensor = tensor.unsqueeze(1)
+    tensor = make_grid(tensor, pad_value=pad_value, normalize=False).permute(1, 2, 0).numpy().copy()
+    torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path)
+    convert_color(path)
diff --git a/model_examples/MapTR/replace_patch.sh b/model_examples/MapTR/replace_patch.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b55c28482d67d5a4f3fe6546932ad14e7be0ed0c
--- /dev/null
+++ b/model_examples/MapTR/replace_patch.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+for para in $*
+do
+    if [[ $para == --packages_path* ]];then
+        packages_path=`echo ${para#*=}`
+    fi
+done
+
+cp -f patch/mmcv/distributed.py mmcv/mmcv/parallel/distributed.py
+cp -f patch/mmcv/_functions.py mmcv/mmcv/parallel/_functions.py
+cp -f patch/mmcv/deform_conv.py mmcv/mmcv/ops/deform_conv.py
+cp -f patch/mmdet3d/__init__.py mmdetection3d/mmdet3d/__init__.py
+cp -f patch/mmdet/__init__.py ${packages_path}/mmdet/__init__.py
+cp -f patch/mmdet/resnet.py ${packages_path}/mmdet/models/backbones/resnet.py
+cp -f patch/mmseg/__init__.py ${packages_path}/mmseg/__init__.py
+cp -f patch/nuscenes/data_classes.py ${packages_path}/nuscenes/eval/detection/data_classes.py
\ No newline at end of file
diff --git a/model_examples/MapTR/requirement.txt b/model_examples/MapTR/requirement.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dfe7951695fd01b89fdd4d94ab6d5aa886a5f6de
--- /dev/null
+++ b/model_examples/MapTR/requirement.txt
@@ -0,0 +1,10 @@
+torchaudio==0.10.0
+torchvision==0.10.0
+mmdet==2.14.0
+mmsegmentation==0.14.1
+timm
+shapely==1.8.5.post1
+av2
+numba==0.58.1
+numpy==1.23.0
+ipython==8.12.3
\ No newline at end of file
diff --git a/model_examples/MapTR/test/env_npu.sh b/model_examples/MapTR/test/env_npu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d4195f46ca9e8de1dd2387a8fe618b48eaaf8498
--- /dev/null
+++ b/model_examples/MapTR/test/env_npu.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+
+if [ -f CANN_INSTALL_PATH_CONF ]; then
+    CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2)
+else
+    CANN_INSTALL_PATH="/usr/local/Ascend"
+fi
+
+if [ -d {CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then
+    source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh
+else
+    source ${CANN_INSTALL_PATH}/nnae/set_env.sh
+fi
+
+# 绑核
+export CPU_AFFINITY_CONF=1
+# 是否开启taskque
+export TASK_QUEUE_ENABLE=2
+# 设置shape数据缓存
+export HOST_CACHE_CAPACITY=20
+# 开启combined标志
+export COMBINED_ENABLE=1
+# 启用可扩展内存段分配策略
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+
+#设置device侧日志登记为error
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+#关闭Device侧Event日志
+msnpureport -e disable
+
+path_lib=$(python3 -c """
+import sys
+import re
+result=''
+for index in range(len(sys.path)):
+    match_sit = re.search('-packages', sys.path[index])
+    if match_sit is not None:
+        match_lib = re.search('lib', sys.path[index])
+
+        if match_lib is not None:
+            end=match_lib.span()[1]
+            result += sys.path[index][0:end] + ':'
+
+        result+=sys.path[index] + '/torch/lib:'
+print(result)"""
+)
diff --git a/model_examples/MapTR/test/train_8p.sh b/model_examples/MapTR/test/train_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..defe40e8a9d99010e673694a95a88f34ed875455
--- /dev/null
+++ b/model_examples/MapTR/test/train_8p.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+################基础配置参数，需要模型审视修改##################
+# 网络名称，同目录名称
+Network="MapTR"
+WORLD_SIZE=8
+WORK_DIR=""
+LOAD_FROM=""
+
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+###############指定训练脚本执行路径###############
+# cd到与test文件夹同层级目录下执行脚本，提高兼容性；test_path_dir为包含test文件夹的路径
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+  test_path_dir=${cur_path}
+  cd ..
+  cur_path=$(pwd)
+else
+  test_path_dir=${cur_path}/test
+fi
+
+ASCEND_DEVICE_ID=0
+
+if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ]; then
+  rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+  mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+else
+  mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+fi
+
+start_time=$(date +%s)
+# 非平台场景时source 环境变量
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=$(echo ${acheck_etp_flag#*=})
+if [ x"${etp_flag}" != x"true" ]; then
+  source ${test_path_dir}/env_npu.sh
+fi
+
+bash ./tools/dist_train.sh ./projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py 8 \
+    >$cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+wait
+
+
+# 训练结束时间，不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# 训练用例信息，不需要修改
+BatchSize=8
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
+
+# 结果打印，不需要修改
+echo "------------------ Final result ------------------"
+# 输出性能FPS，需要模型审视修改
+avg_time=`grep -a 'mmdet - INFO - Epoch '  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'`
+Iteration_time=$avg_time
+# 打印，不需要修改
+echo "Iteration time : $Iteration_time"
+
+# 输出训练精度mAP,需要模型审视修改
+mAP=$(grep -a "mmdet - INFO - Epoch(val)" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log  |tail -1|awk -F "mAP: " '{print $2}' |awk -F ", " '{print $1}'| awk -F ", " '{print $1}')
+
+# 打印，不需要修改
+echo "mAP : ${mAP}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# 训练总时长
+TrainingTime=`grep -a 'Time'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "Time: " '{print $2}'|awk -F "," '{print $1}'| awk '{a+=$1} END {printf("%.3f",a)}'`
+
+# 关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainmAP = ${mAP}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "Iteration time = ${Iteration_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
\ No newline at end of file
diff --git a/model_examples/MapTR/test/train_8p_performance.sh b/model_examples/MapTR/test/train_8p_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..defe40e8a9d99010e673694a95a88f34ed875455
--- /dev/null
+++ b/model_examples/MapTR/test/train_8p_performance.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+################基础配置参数，需要模型审视修改##################
+# 网络名称，同目录名称
+Network="MapTR"
+WORLD_SIZE=8
+WORK_DIR=""
+LOAD_FROM=""
+
+NNODES=${NNODES:-1}
+NODE_RANK=${NODE_RANK:-0}
+PORT=${PORT:-29500}
+MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+
+###############指定训练脚本执行路径###############
+# cd到与test文件夹同层级目录下执行脚本，提高兼容性；test_path_dir为包含test文件夹的路径
+cur_path=$(pwd)
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+  test_path_dir=${cur_path}
+  cd ..
+  cur_path=$(pwd)
+else
+  test_path_dir=${cur_path}/test
+fi
+
+ASCEND_DEVICE_ID=0
+
+if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ]; then
+  rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+  mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+else
+  mkdir -p ${cur_path}/test/output/${ASCEND_DEVICE_ID}
+fi
+
+start_time=$(date +%s)
+# 非平台场景时source 环境变量
+check_etp_flag=$(env | grep etp_running_flag)
+etp_flag=$(echo ${acheck_etp_flag#*=})
+if [ x"${etp_flag}" != x"true" ]; then
+  source ${test_path_dir}/env_npu.sh
+fi
+
+bash ./tools/dist_train.sh ./projects/configs/maptr/maptr_tiny_r50_24e_bevformer.py 8 \
+    >$cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+wait
+
+
+# 训练结束时间，不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# 训练用例信息，不需要修改
+BatchSize=8
+DeviceType=$(uname -m)
+CaseName=${Network}_bs${BatchSize}_${WORLD_SIZE}'p'_'acc'
+
+# 结果打印，不需要修改
+echo "------------------ Final result ------------------"
+# 输出性能FPS，需要模型审视修改
+avg_time=`grep -a 'mmdet - INFO - Epoch '  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "time: " '{print $2}' | awk -F ", " '{print $1}' | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'`
+Iteration_time=$avg_time
+# 打印，不需要修改
+echo "Iteration time : $Iteration_time"
+
+# 输出训练精度mAP,需要模型审视修改
+mAP=$(grep -a "mmdet - INFO - Epoch(val)" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log  |tail -1|awk -F "mAP: " '{print $2}' |awk -F ", " '{print $1}'| awk -F ", " '{print $1}')
+
+# 打印，不需要修改
+echo "mAP : ${mAP}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# 训练总时长
+TrainingTime=`grep -a 'Time'  ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "Time: " '{print $2}'|awk -F "," '{print $1}'| awk '{a+=$1} END {printf("%.3f",a)}'`
+
+# 关键信息打印到${CaseName}.log中，不需要修改
+echo "Network = ${Network}" >${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${WORLD_SIZE}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainmAP = ${mAP}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "Iteration time = ${Iteration_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "TrainingTime = ${TrainingTime}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
+echo "E2ETrainingTime = ${e2e_time}" >>${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}_perf_report.log
\ No newline at end of file
diff --git a/model_examples/MapTR/tools/analysis_tools/__init__.py b/model_examples/MapTR/tools/analysis_tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_examples/MapTR/tools/analysis_tools/analyze_logs.py b/model_examples/MapTR/tools/analysis_tools/analyze_logs.py
new file mode 100644
index 0000000000000000000000000000000000000000..806175f34c0ce6c535167cc7db8470c69a6e243d
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/analyze_logs.py
@@ -0,0 +1,201 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import numpy as np
+import seaborn as sns
+from collections import defaultdict
+from matplotlib import pyplot as plt
+
+
+def cal_train_time(log_dicts, args):
+    for i, log_dict in enumerate(log_dicts):
+        print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+        all_times = []
+        for epoch in log_dict.keys():
+            if args.include_outliers:
+                all_times.append(log_dict[epoch]['time'])
+            else:
+                all_times.append(log_dict[epoch]['time'][1:])
+        all_times = np.array(all_times)
+        epoch_ave_time = all_times.mean(-1)
+        slowest_epoch = epoch_ave_time.argmax()
+        fastest_epoch = epoch_ave_time.argmin()
+        std_over_epoch = epoch_ave_time.std()
+        print(f'slowest epoch {slowest_epoch + 1}, '
+              f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
+        print(f'fastest epoch {fastest_epoch + 1}, '
+              f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
+        print(f'time std over epochs is {std_over_epoch:.4f}')
+        print(f'average iter time: {np.mean(all_times):.4f} s/iter')
+        print()
+
+
+def plot_curve(log_dicts, args):
+    if args.backend is not None:
+        plt.switch_backend(args.backend)
+    sns.set_style(args.style)
+    # if legend is None, use {filename}_{key} as legend
+    legend = args.legend
+    if legend is None:
+        legend = []
+        for json_log in args.json_logs:
+            for metric in args.keys:
+                legend.append(f'{json_log}_{metric}')
+    assert len(legend) == (len(args.json_logs) * len(args.keys))
+    metrics = args.keys
+
+    num_metrics = len(metrics)
+    for i, log_dict in enumerate(log_dicts):
+        epochs = list(log_dict.keys())
+        for j, metric in enumerate(metrics):
+            print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+            if metric not in log_dict[epochs[args.interval - 1]]:
+                raise KeyError(
+                    f'{args.json_logs[i]} does not contain metric {metric}')
+
+            if args.mode == 'eval':
+                if min(epochs) == args.interval:
+                    x0 = args.interval
+                else:
+                    # if current training is resumed from previous checkpoint
+                    # we lost information in early epochs
+                    # `xs` should start according to `min(epochs)`
+                    if min(epochs) % args.interval == 0:
+                        x0 = min(epochs)
+                    else:
+                        # find the first epoch that do eval
+                        x0 = min(epochs) + args.interval - \
+                            min(epochs) % args.interval
+                xs = np.arange(x0, max(epochs) + 1, args.interval)
+                ys = []
+                for epoch in epochs[args.interval - 1::args.interval]:
+                    ys += log_dict[epoch][metric]
+
+                # if training is aborted before eval of the last epoch
+                # `xs` and `ys` will have different length and cause an error
+                # check if `ys[-1]` is empty here
+                if not log_dict[epoch][metric]:
+                    xs = xs[:-1]
+
+                ax = plt.gca()
+                ax.set_xticks(xs)
+                plt.xlabel('epoch')
+                plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+            else:
+                xs = []
+                ys = []
+                num_iters_per_epoch = \
+                    log_dict[epochs[args.interval-1]]['iter'][-1]
+                for epoch in epochs[args.interval - 1::args.interval]:
+                    iters = log_dict[epoch]['iter']
+                    if log_dict[epoch]['mode'][-1] == 'val':
+                        iters = iters[:-1]
+                    xs.append(
+                        np.array(iters) + (epoch - 1) * num_iters_per_epoch)
+                    ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+                xs = np.concatenate(xs)
+                ys = np.concatenate(ys)
+                plt.xlabel('iter')
+                plt.plot(
+                    xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+            plt.legend()
+        if args.title is not None:
+            plt.title(args.title)
+    if args.out is None:
+        plt.show()
+    else:
+        print(f'save curve to: {args.out}')
+        plt.savefig(args.out)
+        plt.cla()
+
+
+def add_plot_parser(subparsers):
+    parser_plt = subparsers.add_parser(
+        'plot_curve', help='parser for plotting curves')
+    parser_plt.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_plt.add_argument(
+        '--keys',
+        type=str,
+        nargs='+',
+        default=['mAP_0.25'],
+        help='the metric that you want to plot')
+    parser_plt.add_argument('--title', type=str, help='title of figure')
+    parser_plt.add_argument(
+        '--legend',
+        type=str,
+        nargs='+',
+        default=None,
+        help='legend of each plot')
+    parser_plt.add_argument(
+        '--backend', type=str, default=None, help='backend of plt')
+    parser_plt.add_argument(
+        '--style', type=str, default='dark', help='style of plt')
+    parser_plt.add_argument('--out', type=str, default=None)
+    parser_plt.add_argument('--mode', type=str, default='train')
+    parser_plt.add_argument('--interval', type=int, default=1)
+
+
+def add_time_parser(subparsers):
+    parser_time = subparsers.add_parser(
+        'cal_train_time',
+        help='parser for computing the average time per training iteration')
+    parser_time.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_time.add_argument(
+        '--include-outliers',
+        action='store_true',
+        help='include the first value of every epoch when computing '
+        'the average time')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Analyze Json Log')
+    # currently only support plot curve and calculate average train time
+    subparsers = parser.add_subparsers(dest='task', help='task parser')
+    add_plot_parser(subparsers)
+    add_time_parser(subparsers)
+    args = parser.parse_args()
+    return args
+
+
+def load_json_logs(json_logs):
+    # load and convert json_logs to log_dict, key is epoch, value is a sub dict
+    # keys of sub dict is different metrics, e.g. memory, bbox_mAP
+    # value of sub dict is a list of corresponding values of all iterations
+    log_dicts = [dict() for _ in json_logs]
+    for json_log, log_dict in zip(json_logs, log_dicts):
+        with open(json_log, 'r') as log_file:
+            for line in log_file:
+                log = json.loads(line.strip())
+                # skip lines without `epoch` field
+                if 'epoch' not in log:
+                    continue
+                epoch = log.pop('epoch')
+                if epoch not in log_dict:
+                    log_dict[epoch] = defaultdict(list)
+                for k, v in log.items():
+                    log_dict[epoch][k].append(v)
+    return log_dicts
+
+
+def main():
+    args = parse_args()
+
+    json_logs = args.json_logs
+    for json_log in json_logs:
+        assert json_log.endswith('.json')
+
+    log_dicts = load_json_logs(json_logs)
+
+    eval(args.task)(log_dicts, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/analysis_tools/benchmark.py b/model_examples/MapTR/tools/analysis_tools/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..487a348935e3c949a8cde2c90a1747db769964c9
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/benchmark.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import time
+import torch
+from mmcv import Config
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import load_checkpoint, wrap_fp16_model
+import sys
+sys.path.append('.')
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from projects.mmdet3d_plugin.datasets import custom_build_dataset
+# from mmdet3d.datasets import build_dataloader, build_dataset
+from mmdet3d.models import build_detector
+#from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+    parser.add_argument('--samples', default=2000, help='samples to benchmark')
+    parser.add_argument(
+        '--log-interval', default=50, help='interval of logging')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase'
+        'the inference speed')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+
+    # build the dataloader
+    # TODO: support multiple images per gpu (only minor changes are needed)
+    print(cfg.data.test)
+    dataset = custom_build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=1,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=False,
+        shuffle=False)
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    if args.checkpoint is not None:
+        load_checkpoint(model, args.checkpoint, map_location='cpu')
+    #if args.fuse_conv_bn:
+    #    model = fuse_module(model)
+
+    model = MMDataParallel(model, device_ids=[0])
+
+    model.eval()
+
+    # the first several iterations may be very slow so skip them
+    num_warmup = 5
+    pure_inf_time = 0
+
+    # benchmark with several samples and take the average
+    for i, data in enumerate(data_loader):
+        torch.cuda.synchronize()
+        start_time = time.perf_counter()
+        with torch.no_grad():
+            model(return_loss=False, rescale=True, **data)
+
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - start_time
+
+        if i >= num_warmup:
+            pure_inf_time += elapsed
+            if (i + 1) % args.log_interval == 0:
+                fps = (i + 1 - num_warmup) / pure_inf_time
+                print(f'Done image [{i + 1:<3}/ {args.samples}], '
+                      f'fps: {fps:.1f} img / s')
+
+        if (i + 1) == args.samples:
+            pure_inf_time += elapsed
+            fps = (i + 1 - num_warmup) / pure_inf_time
+            print(f'Overall fps: {fps:.1f} img / s')
+            break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/analysis_tools/get_params.py b/model_examples/MapTR/tools/analysis_tools/get_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..2818900b260f31faf6249b6862b9bb8d80a4ad02
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/get_params.py
@@ -0,0 +1,10 @@
+import torch
+file_path = './ckpts/bevformer_v4.pth'
+model = torch.load(file_path, map_location='cpu')
+all = 0
+for key in list(model['state_dict'].keys()):
+    all += model['state_dict'][key].nelement()
+print(all)
+
+# smaller 63374123
+# v4 69140395
diff --git a/model_examples/MapTR/tools/analysis_tools/visual.py b/model_examples/MapTR/tools/analysis_tools/visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..648129897cadb7692cb7fccf18bf59b73d085e3f
--- /dev/null
+++ b/model_examples/MapTR/tools/analysis_tools/visual.py
@@ -0,0 +1,477 @@
+# Based on https://github.com/nutonomy/nuscenes-devkit
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+
+import mmcv
+from nuscenes.nuscenes import NuScenes
+from PIL import Image
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from typing import Tuple, List, Iterable
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+from matplotlib import rcParams
+from matplotlib.axes import Axes
+from pyquaternion import Quaternion
+from PIL import Image
+from matplotlib import rcParams
+from matplotlib.axes import Axes
+from pyquaternion import Quaternion
+from tqdm import tqdm
+from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box
+from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix
+from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox
+from nuscenes.eval.detection.data_classes import DetectionBox
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.eval.detection.render import visualize_sample
+
+
+
+
+cams = ['CAM_FRONT',
+ 'CAM_FRONT_RIGHT',
+ 'CAM_BACK_RIGHT',
+ 'CAM_BACK',
+ 'CAM_BACK_LEFT',
+ 'CAM_FRONT_LEFT']
+
+import numpy as np
+import matplotlib.pyplot as plt
+from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box
+from PIL import Image
+from matplotlib import rcParams
+
+
+def render_annotation(
+        anntoken: str,
+        margin: float = 10,
+        view: np.ndarray = np.eye(4),
+        box_vis_level: BoxVisibility = BoxVisibility.ANY,
+        out_path: str = 'render.png',
+        extra_info: bool = False) -> None:
+    """
+    Render selected annotation.
+    :param anntoken: Sample_annotation token.
+    :param margin: How many meters in each direction to include in LIDAR view.
+    :param view: LIDAR view point.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param out_path: Optional path to save the rendered figure to disk.
+    :param extra_info: Whether to render extra information below camera view.
+    """
+    ann_record = nusc.get('sample_annotation', anntoken)
+    sample_record = nusc.get('sample', ann_record['sample_token'])
+    assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.'
+
+    # Figure out which camera the object is fully visible in (this may return nothing).
+    boxes, cam = [], []
+    cams = [key for key in sample_record['data'].keys() if 'CAM' in key]
+    all_bboxes = []
+    select_cams = []
+    for cam in cams:
+        _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level,
+                                           selected_anntokens=[anntoken])
+        if len(boxes) > 0:
+            all_bboxes.append(boxes)
+            select_cams.append(cam)
+            # We found an image that matches. Let's abort.
+    # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \
+    #                      'Try using e.g. BoxVisibility.ANY.'
+    # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!'
+
+    num_cam = len(all_bboxes)
+
+    fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9))
+    select_cams = [sample_record['data'][cam] for cam in select_cams]
+    print('bbox in cams:', select_cams)
+    # Plot LIDAR view.
+    lidar = sample_record['data']['LIDAR_TOP']
+    data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken])
+    LidarPointCloud.from_file(data_path).render_height(axes[0], view=view)
+    for box in boxes:
+        c = np.array(get_color(box.name)) / 255.0
+        box.render(axes[0], view=view, colors=(c, c, c))
+        corners = view_points(boxes[0].corners(), view, False)[:2, :]
+        axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin])
+        axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin])
+        axes[0].axis('off')
+        axes[0].set_aspect('equal')
+
+    # Plot CAMERA view.
+    for i in range(1, num_cam + 1):
+        cam = select_cams[i - 1]
+        data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken])
+        im = Image.open(data_path)
+        axes[i].imshow(im)
+        axes[i].set_title(nusc.get('sample_data', cam)['channel'])
+        axes[i].axis('off')
+        axes[i].set_aspect('equal')
+        for box in boxes:
+            c = np.array(get_color(box.name)) / 255.0
+            box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+
+        # Print extra information about the annotation below the camera view.
+        axes[i].set_xlim(0, im.size[0])
+        axes[i].set_ylim(im.size[1], 0)
+
+    if extra_info:
+        rcParams['font.family'] = 'monospace'
+
+        w, l, h = ann_record['size']
+        category = ann_record['category_name']
+        lidar_points = ann_record['num_lidar_pts']
+        radar_points = ann_record['num_radar_pts']
+
+        sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP'])
+        pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token'])
+        dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation']))
+
+        information = ' \n'.join(['category: {}'.format(category),
+                                  '',
+                                  '# lidar points: {0:>4}'.format(lidar_points),
+                                  '# radar points: {0:>4}'.format(radar_points),
+                                  '',
+                                  'distance: {:>7.3f}m'.format(dist),
+                                  '',
+                                  'width:  {:>7.3f}m'.format(w),
+                                  'length: {:>7.3f}m'.format(l),
+                                  'height: {:>7.3f}m'.format(h)])
+
+        plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top')
+
+    if out_path is not None:
+        plt.savefig(out_path)
+
+
+
+def get_sample_data(sample_data_token: str,
+                    box_vis_level: BoxVisibility = BoxVisibility.ANY,
+                    selected_anntokens=None,
+                    use_flat_vehicle_coordinates: bool = False):
+    """
+    Returns the data path as well as all annotations related to that sample_data.
+    Note that the boxes are transformed into the current sensor's coordinate frame.
+    :param sample_data_token: Sample_data token.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param selected_anntokens: If provided only return the selected annotation.
+    :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+                                         aligned to z-plane in the world.
+    :return: (data_path, boxes, camera_intrinsic <np.array: 3, 3>)
+    """
+
+    # Retrieve sensor & pose records
+    sd_record = nusc.get('sample_data', sample_data_token)
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+    data_path = nusc.get_sample_data_path(sample_data_token)
+
+    if sensor_record['modality'] == 'camera':
+        cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+        imsize = (sd_record['width'], sd_record['height'])
+    else:
+        cam_intrinsic = None
+        imsize = None
+
+    # Retrieve all sample annotations and map to sensor coordinate system.
+    if selected_anntokens is not None:
+        boxes = list(map(nusc.get_box, selected_anntokens))
+    else:
+        boxes = nusc.get_boxes(sample_data_token)
+
+    # Make list of Box objects including coord system transforms.
+    box_list = []
+    for box in boxes:
+        if use_flat_vehicle_coordinates:
+            # Move box to ego vehicle coord system parallel to world z plane.
+            yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+        else:
+            # Move box to ego vehicle coord system.
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+            #  Move box to sensor coord system.
+            box.translate(-np.array(cs_record['translation']))
+            box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        if sensor_record['modality'] == 'camera' and not \
+                box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+            continue
+
+        box_list.append(box)
+
+    return data_path, box_list, cam_intrinsic
+
+
+
+def get_predicted_data(sample_data_token: str,
+                       box_vis_level: BoxVisibility = BoxVisibility.ANY,
+                       selected_anntokens=None,
+                       use_flat_vehicle_coordinates: bool = False,
+                       pred_anns=None
+                       ):
+    """
+    Returns the data path as well as all annotations related to that sample_data.
+    Note that the boxes are transformed into the current sensor's coordinate frame.
+    :param sample_data_token: Sample_data token.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param selected_anntokens: If provided only return the selected annotation.
+    :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+                                         aligned to z-plane in the world.
+    :return: (data_path, boxes, camera_intrinsic <np.array: 3, 3>)
+    """
+
+    # Retrieve sensor & pose records
+    sd_record = nusc.get('sample_data', sample_data_token)
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    sensor_record = nusc.get('sensor', cs_record['sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+
+    data_path = nusc.get_sample_data_path(sample_data_token)
+
+    if sensor_record['modality'] == 'camera':
+        cam_intrinsic = np.array(cs_record['camera_intrinsic'])
+        imsize = (sd_record['width'], sd_record['height'])
+    else:
+        cam_intrinsic = None
+        imsize = None
+
+    # Retrieve all sample annotations and map to sensor coordinate system.
+    # if selected_anntokens is not None:
+    #    boxes = list(map(nusc.get_box, selected_anntokens))
+    # else:
+    #    boxes = nusc.get_boxes(sample_data_token)
+    boxes = pred_anns
+    # Make list of Box objects including coord system transforms.
+    box_list = []
+    for box in boxes:
+        if use_flat_vehicle_coordinates:
+            # Move box to ego vehicle coord system parallel to world z plane.
+            yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0]
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse)
+        else:
+            # Move box to ego vehicle coord system.
+            box.translate(-np.array(pose_record['translation']))
+            box.rotate(Quaternion(pose_record['rotation']).inverse)
+
+            #  Move box to sensor coord system.
+            box.translate(-np.array(cs_record['translation']))
+            box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        if sensor_record['modality'] == 'camera' and not \
+                box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level):
+            continue
+        box_list.append(box)
+
+    return data_path, box_list, cam_intrinsic
+
+
+
+
+def lidiar_render(sample_token, data,out_path=None):
+    bbox_gt_list = []
+    bbox_pred_list = []
+    anns = nusc.get('sample', sample_token)['anns']
+    for ann in anns:
+        content = nusc.get('sample_annotation', ann)
+        try:
+            bbox_gt_list.append(DetectionBox(
+                sample_token=content['sample_token'],
+                translation=tuple(content['translation']),
+                size=tuple(content['size']),
+                rotation=tuple(content['rotation']),
+                velocity=nusc.box_velocity(content['token'])[:2],
+                ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+                else tuple(content['ego_translation']),
+                num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+                detection_name=category_to_detection_name(content['category_name']),
+                detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+                attribute_name=''))
+        except:
+            pass
+
+    bbox_anns = data['results'][sample_token]
+    for content in bbox_anns:
+        bbox_pred_list.append(DetectionBox(
+            sample_token=content['sample_token'],
+            translation=tuple(content['translation']),
+            size=tuple(content['size']),
+            rotation=tuple(content['rotation']),
+            velocity=tuple(content['velocity']),
+            ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
+            else tuple(content['ego_translation']),
+            num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
+            detection_name=content['detection_name'],
+            detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
+            attribute_name=content['attribute_name']))
+    gt_annotations = EvalBoxes()
+    pred_annotations = EvalBoxes()
+    gt_annotations.add_boxes(sample_token, bbox_gt_list)
+    pred_annotations.add_boxes(sample_token, bbox_pred_list)
+    print('green is ground truth')
+    print('blue is the predited result')
+    visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, savepath=out_path+'_bev')
+
+
+def get_color(category_name: str):
+    """
+    Provides the default colors based on the category names.
+    This method works for the general nuScenes categories, as well as the nuScenes detection categories.
+    """
+    a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+     'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+     'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+     'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+     'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+     'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+     'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+     'vehicle.ego']
+    class_names = [
+        'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+        'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+    ]
+    #print(category_name)
+    if category_name == 'bicycle':
+        return nusc.colormap['vehicle.bicycle']
+    elif category_name == 'construction_vehicle':
+        return nusc.colormap['vehicle.construction']
+    elif category_name == 'traffic_cone':
+        return nusc.colormap['movable_object.trafficcone']
+
+    for key in nusc.colormap.keys():
+        if category_name in key:
+            return nusc.colormap[key]
+    return [0, 0, 0]
+
+
+def render_sample_data(
+        sample_toekn: str,
+        with_anns: bool = True,
+        box_vis_level: BoxVisibility = BoxVisibility.ANY,
+        axes_limit: float = 40,
+        ax=None,
+        nsweeps: int = 1,
+        out_path: str = None,
+        underlay_map: bool = True,
+        use_flat_vehicle_coordinates: bool = True,
+        show_lidarseg: bool = False,
+        show_lidarseg_legend: bool = False,
+        filter_lidarseg_labels=None,
+        lidarseg_preds_bin_path: str = None,
+        verbose: bool = True,
+        show_panoptic: bool = False,
+        pred_data=None,
+      ) -> None:
+    """
+    Render sample data onto axis.
+    :param sample_data_token: Sample_data token.
+    :param with_anns: Whether to draw box annotations.
+    :param box_vis_level: If sample_data is an image, this sets required visibility for boxes.
+    :param axes_limit: Axes limit for lidar and radar (measured in meters).
+    :param ax: Axes onto which to render.
+    :param nsweeps: Number of sweeps for lidar and radar.
+    :param out_path: Optional path to save the rendered figure to disk.
+    :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow.
+    :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is
+        aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which
+        can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new
+        setting is more correct and rotates the plot by ~90 degrees.
+    :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set
+        to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+    :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame.
+    :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. If None
+        or the list is empty, all classes will be displayed.
+    :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation
+                                    predictions for the sample.
+    :param verbose: Whether to display the image after it is rendered.
+    :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set
+        to False, the colors of the lidar data represent the distance from the center of the ego vehicle.
+        If show_lidarseg is True, show_panoptic will be set to False.
+    """
+    lidiar_render(sample_toekn, pred_data, out_path=out_path)
+    sample = nusc.get('sample', sample_toekn)
+    # sample = data['results'][sample_token_list[0]][0]
+    cams = [
+        'CAM_FRONT_LEFT',
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_BACK_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_RIGHT',
+    ]
+    if ax is None:
+        _, ax = plt.subplots(4, 3, figsize=(24, 18))
+    j = 0
+    for ind, cam in enumerate(cams):
+        sample_data_token = sample['data'][cam]
+
+        sd_record = nusc.get('sample_data', sample_data_token)
+        sensor_modality = sd_record['sensor_modality']
+
+        if sensor_modality in ['lidar', 'radar']:
+            assert False
+        elif sensor_modality == 'camera':
+            # Load boxes and image.
+            boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']),
+                         name=record['detection_name'], token='predicted') for record in
+                     pred_data['results'][sample_toekn] if record['detection_score'] > 0.2]
+
+            data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token,
+                                                                         box_vis_level=box_vis_level, pred_anns=boxes)
+            _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=box_vis_level)
+            if ind == 3:
+                j += 1
+            ind = ind % 3
+            data = Image.open(data_path)
+            # mmcv.imwrite(np.array(data)[:,:,::-1], f'{cam}.png')
+            # Init axes.
+
+            # Show image.
+            ax[j, ind].imshow(data)
+            ax[j + 2, ind].imshow(data)
+
+            # Show boxes.
+            if with_anns:
+                for box in boxes_pred:
+                    c = np.array(get_color(box.name)) / 255.0
+                    box.render(ax[j, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+                for box in boxes_gt:
+                    c = np.array(get_color(box.name)) / 255.0
+                    box.render(ax[j + 2, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c))
+
+            # Limit visible range.
+            ax[j, ind].set_xlim(0, data.size[0])
+            ax[j, ind].set_ylim(data.size[1], 0)
+            ax[j + 2, ind].set_xlim(0, data.size[0])
+            ax[j + 2, ind].set_ylim(data.size[1], 0)
+
+        else:
+            raise ValueError("Error: Unknown sensor modality!")
+
+        ax[j, ind].axis('off')
+        ax[j, ind].set_title('PRED: {} {labels_type}'.format(
+            sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else ''))
+        ax[j, ind].set_aspect('equal')
+
+        ax[j + 2, ind].axis('off')
+        ax[j + 2, ind].set_title('GT:{} {labels_type}'.format(
+            sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else ''))
+        ax[j + 2, ind].set_aspect('equal')
+
+    if out_path is not None:
+        plt.savefig(out_path+'_camera', bbox_inches='tight', pad_inches=0, dpi=200)
+    if verbose:
+        plt.show()
+    plt.close()
+
+if __name__ == '__main__':
+    nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True)
+    # render_annotation('7603b030b42a4b1caa8c443ccc1a7d52')
+    bevformer_results = mmcv.load('test/bevformer_base/Thu_Jun__9_16_22_37_2022/pts_bbox/results_nusc.json')
+    sample_token_list = list(bevformer_results['results'].keys())
+    for id in range(0, 10):
+        render_sample_data(sample_token_list[id], pred_data=bevformer_results, out_path=sample_token_list[id])
diff --git a/model_examples/MapTR/tools/create_data.py b/model_examples/MapTR/tools/create_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2b0cc10f1fafa77a39cd8fbd9c1ac9386d2af72
--- /dev/null
+++ b/model_examples/MapTR/tools/create_data.py
@@ -0,0 +1,305 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+from data_converter.create_gt_database import create_groundtruth_database
+from data_converter import nuscenes_converter as nuscenes_converter
+from data_converter import lyft_converter as lyft_converter
+from data_converter import kitti_converter as kitti
+from data_converter import indoor_converter as indoor
+import argparse
+from os import path as osp
+import sys
+sys.path.append('.')
+
+
+def kitti_data_prep(root_path, info_prefix, version, out_dir):
+    """Prepare data related to Kitti dataset.
+
+    Related data consists of '.pkl' files recording basic infos,
+    2D annotations and groundtruth database.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        version (str): Dataset version.
+        out_dir (str): Output directory of the groundtruth database info.
+    """
+    kitti.create_kitti_info_file(root_path, info_prefix)
+    kitti.create_reduced_point_cloud(root_path, info_prefix)
+
+    info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl')
+    info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl')
+    info_trainval_path = osp.join(root_path,
+                                  f'{info_prefix}_infos_trainval.pkl')
+    info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl')
+    kitti.export_2d_annotation(root_path, info_train_path)
+    kitti.export_2d_annotation(root_path, info_val_path)
+    kitti.export_2d_annotation(root_path, info_trainval_path)
+    kitti.export_2d_annotation(root_path, info_test_path)
+
+    create_groundtruth_database(
+        'KittiDataset',
+        root_path,
+        info_prefix,
+        f'{out_dir}/{info_prefix}_infos_train.pkl',
+        relative_path=False,
+        mask_anno_path='instances_train.json',
+        with_mask=(version == 'mask'))
+
+
+def nuscenes_data_prep(root_path,
+                       can_bus_root_path,
+                       info_prefix,
+                       version,
+                       dataset_name,
+                       out_dir,
+                       max_sweeps=10):
+    """Prepare data related to nuScenes dataset.
+
+    Related data consists of '.pkl' files recording basic infos,
+    2D annotations and groundtruth database.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        version (str): Dataset version.
+        dataset_name (str): The dataset class name.
+        out_dir (str): Output directory of the groundtruth database info.
+        max_sweeps (int): Number of input consecutive frames. Default: 10
+    """
+    nuscenes_converter.create_nuscenes_infos(
+        root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+    if version == 'v1.0-test':
+        info_test_path = osp.join(
+            out_dir, f'{info_prefix}_infos_temporal_test.pkl')
+        nuscenes_converter.export_2d_annotation(
+            root_path, info_test_path, version=version)
+    else:
+        info_train_path = osp.join(
+            out_dir, f'{info_prefix}_infos_temporal_train.pkl')
+        info_val_path = osp.join(
+            out_dir, f'{info_prefix}_infos_temporal_val.pkl')
+        nuscenes_converter.export_2d_annotation(
+            root_path, info_train_path, version=version)
+        nuscenes_converter.export_2d_annotation(
+            root_path, info_val_path, version=version)
+        # create_groundtruth_database(dataset_name, root_path, info_prefix,
+        #                             f'{out_dir}/{info_prefix}_infos_train.pkl')
+
+
+def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10):
+    """Prepare data related to Lyft dataset.
+
+    Related data consists of '.pkl' files recording basic infos.
+    Although the ground truth database and 2D annotations are not used in
+    Lyft, it can also be generated like nuScenes.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        version (str): Dataset version.
+        max_sweeps (int, optional): Number of input consecutive frames.
+            Defaults to 10.
+    """
+    lyft_converter.create_lyft_infos(
+        root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+
+def scannet_data_prep(root_path, info_prefix, out_dir, workers):
+    """Prepare the info file for scannet dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+    """
+    indoor.create_indoor_info_file(
+        root_path, info_prefix, out_dir, workers=workers)
+
+
+def s3dis_data_prep(root_path, info_prefix, out_dir, workers):
+    """Prepare the info file for s3dis dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+    """
+    indoor.create_indoor_info_file(
+        root_path, info_prefix, out_dir, workers=workers)
+
+
+def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers):
+    """Prepare the info file for sunrgbd dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+    """
+    indoor.create_indoor_info_file(
+        root_path, info_prefix, out_dir, workers=workers)
+
+
+def waymo_data_prep(root_path,
+                    info_prefix,
+                    version,
+                    out_dir,
+                    workers,
+                    max_sweeps=5):
+    """Prepare the info file for waymo dataset.
+
+    Args:
+        root_path (str): Path of dataset root.
+        info_prefix (str): The prefix of info filenames.
+        out_dir (str): Output directory of the generated info file.
+        workers (int): Number of threads to be used.
+        max_sweeps (int): Number of input consecutive frames. Default: 5 \
+            Here we store pose information of these frames for later use.
+    """
+    from tools.data_converter import waymo_converter as waymo
+
+    splits = ['training', 'validation', 'testing']
+
+    for i, split in enumerate(splits):
+        load_dir = osp.join(root_path, 'waymo_format', split)
+        if split == 'validation':
+            save_dir = osp.join(out_dir, 'kitti_format', 'training')
+        else:
+            save_dir = osp.join(out_dir, 'kitti_format', split)
+        converter = waymo.Waymo2KITTI(
+            load_dir,
+            save_dir,
+            prefix=str(i),
+            workers=workers,
+            test_mode=(split == 'test'))
+        converter.convert()
+    # Generate waymo infos
+    out_dir = osp.join(out_dir, 'kitti_format')
+    kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps)
+
+    create_groundtruth_database(
+        'WaymoDataset',
+        out_dir,
+        info_prefix,
+        f'{out_dir}/{info_prefix}_infos_train.pkl',
+        relative_path=False,
+        with_mask=False)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+    '--root-path',
+    type=str,
+    default='./data/kitti',
+    help='specify the root path of dataset')
+parser.add_argument(
+    '--canbus',
+    type=str,
+    default='./data',
+    help='specify the root path of nuScenes canbus')
+parser.add_argument(
+    '--version',
+    type=str,
+    default='v1.0',
+    required=False,
+    help='specify the dataset version, no need for kitti')
+parser.add_argument(
+    '--max-sweeps',
+    type=int,
+    default=10,
+    required=False,
+    help='specify sweeps of lidar per example')
+parser.add_argument(
+    '--out-dir',
+    type=str,
+    default='./data/kitti',
+    required='False',
+    help='name of info pkl')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+    '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    if args.dataset == 'kitti':
+        kitti_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=args.version,
+            out_dir=args.out_dir)
+    elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+        train_version = f'{args.version}-trainval'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
+        test_version = f'{args.version}-test'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=test_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
+    elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+        train_version = f'{args.version}'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
+    elif args.dataset == 'lyft':
+        train_version = f'{args.version}-train'
+        lyft_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            max_sweeps=args.max_sweeps)
+        test_version = f'{args.version}-test'
+        lyft_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=test_version,
+            max_sweeps=args.max_sweeps)
+    elif args.dataset == 'waymo':
+        waymo_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            version=args.version,
+            out_dir=args.out_dir,
+            workers=args.workers,
+            max_sweeps=args.max_sweeps)
+    elif args.dataset == 'scannet':
+        scannet_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            out_dir=args.out_dir,
+            workers=args.workers)
+    elif args.dataset == 's3dis':
+        s3dis_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            out_dir=args.out_dir,
+            workers=args.workers)
+    elif args.dataset == 'sunrgbd':
+        sunrgbd_data_prep(
+            root_path=args.root_path,
+            info_prefix=args.extra_tag,
+            out_dir=args.out_dir,
+            workers=args.workers)
diff --git a/model_examples/MapTR/tools/data_converter/__init__.py b/model_examples/MapTR/tools/data_converter/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/model_examples/MapTR/tools/data_converter/av2_converter.py b/model_examples/MapTR/tools/data_converter/av2_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..024e33399bf460afeceeb52aa857de975505afc5
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/av2_converter.py
@@ -0,0 +1,204 @@
+from functools import partial
+from multiprocessing import Pool
+import multiprocessing
+from random import sample
+import time
+import mmcv
+import logging
+from pathlib import Path
+from os import path as osp
+import os
+from av2.datasets.sensor.av2_sensor_dataloader import AV2SensorDataLoader
+from av2.map.lane_segment import LaneMarkType, LaneSegment
+from av2.map.map_api import ArgoverseStaticMap
+from tqdm import tqdm
+import argparse
+
+CAM_NAMES = ['ring_front_center', 'ring_front_right', 'ring_front_left',
+    'ring_rear_right','ring_rear_left', 'ring_side_right', 'ring_side_left',
+    # 'stereo_front_left', 'stereo_front_right',
+    ]
+# some fail logs as stated in av2
+# https://github.com/argoverse/av2-api/blob/05b7b661b7373adb5115cf13378d344d2ee43906/src/av2/map/README.md#training-online-map-inference-models
+FAIL_LOGS = [
+    '75e8adad-50a6-3245-8726-5e612db3d165',
+    '54bc6dbc-ebfb-3fba-b5b3-57f88b4b79ca',
+    'af170aac-8465-3d7b-82c5-64147e94af7d',
+    '6e106cf8-f6dd-38f6-89c8-9be7a71e7275',
+    '01bb304d-7bd8-35f8-bbef-7086b688e35e',
+    '453e5558-6363-38e3-bf9b-42b5ba0a6f1d'
+]
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Data converter arg parser')
+    parser.add_argument(
+        '--data-root',
+        type=str,
+        help='specify the root path of dataset')
+    parser.add_argument(
+        '--nproc',
+        type=int,
+        default=64,
+        required=False,
+        help='workers to process data')
+    args_p = parser.parse_args()
+    return args_p
+
+def create_av2_infos_mp(root_path,
+                        info_prefix,
+                        dest_path=None,
+                        split='train',
+                        num_multithread=64):
+    """Create info file of av2 dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        root_path (str): Path of the data root.
+        info_prefix (str): Prefix of the info file to be generated.
+        dest_path (str): Path to store generated file, default to root_path
+        split (str): Split of the data.
+            Default: 'train'
+    """
+    root_path = osp.join(root_path, split)
+    if dest_path is None:
+        dest_path = root_path
+    
+    loader = AV2SensorDataLoader(Path(root_path), Path(root_path))
+    log_ids = list(loader.get_log_ids())
+    # import pdb;pdb.set_trace()
+    for l in FAIL_LOGS:
+        if l in log_ids:
+            log_ids.remove(l)
+
+    print('collecting samples...')
+    start_time = time.time()
+    print('num cpu:', multiprocessing.cpu_count())
+    print(f'using {num_multithread} threads')
+
+    # to supress logging from av2.utils.synchronization_database
+    sdb_logger = logging.getLogger('av2.utils.synchronization_database')
+    prev_level = sdb_logger.level
+    sdb_logger.setLevel(logging.CRITICAL)
+
+    # FIXME: need to check the order
+    pool = Pool(num_multithread)
+    fn = partial(get_data_from_logid, loader=loader, data_root=root_path)
+    rt = pool.map_async(fn, log_ids)
+    pool.close()
+    pool.join()
+    results = rt.get()
+
+    samples = []
+    discarded = 0
+    sample_idx = 0
+    for _samples, _discarded in results:
+        for i in range(len(_samples)):
+            _samples[i]['sample_idx'] = sample_idx
+            sample_idx += 1
+        samples += _samples
+        discarded += _discarded
+    
+    sdb_logger.setLevel(prev_level)
+    print(f'{len(samples)} available samples, {discarded} samples discarded')
+
+    id2map = {}
+    for log_id in log_ids:
+        log_map_dirpath = Path(osp.join(root_path, log_id, "map"))
+        vector_data_fnames = sorted(log_map_dirpath.glob("log_map_archive_*.json"))
+        # vector_data_fnames = sorted(log_map_dirpath.glob("log_map_archive_*.json"))
+        if not len(vector_data_fnames) == 1:
+            raise RuntimeError(f"JSON file containing vector map data is missing (searched in {log_map_dirpath})")
+        vector_data_fname = vector_data_fnames[0]
+        vector_data_json_path = vector_data_fname
+        avm = ArgoverseStaticMap.from_json(vector_data_json_path)
+        # import pdb;pdb.set_trace()
+        map_elements = {}
+        map_elements['divider'] = get_divider(avm)
+        map_elements['ped_crossing'] = get_ped(avm)
+        map_elements['boundary'] = get_boundary(avm)
+
+        # map_fname = osp.join(map_path_dir, map_fname)
+        id2map[log_id] = map_elements
+
+    print('collected in {}s'.format(time.time()-start_time))
+    infos = dict(samples=samples, id2map=id2map)
+
+    info_path = osp.join(dest_path,
+                                 '{}_map_infos_{}.pkl'.format(info_prefix, split))
+    print(f'saving results to {info_path}')
+    mmcv.dump(infos, info_path)
+    # mmcv.dump(samples, info_path)
+
+def get_divider(avm):
+    divider_list = []
+    for ls in avm.get_scenario_lane_segments():
+            for bound_type, bound_city in zip([ls.left_mark_type, ls.right_mark_type], [ls.left_lane_boundary, ls.right_lane_boundary]):
+                if bound_type not in [LaneMarkType.NONE,]:
+                    divider_list.append(bound_city.xyz)
+    return divider_list
+
+def get_boundary(avm):
+    boundary_list = []
+    for da in avm.get_scenario_vector_drivable_areas():
+        boundary_list.append(da.xyz)
+    return boundary_list
+
+def get_ped(avm):
+    ped_list = []
+    for pc in avm.get_scenario_ped_crossings():
+        ped_list.append(pc.polygon)
+    return ped_list
+
+def get_data_from_logid(log_id, loader: AV2SensorDataLoader, data_root):
+    samples = []
+    discarded = 0
+    
+    # We use lidar timestamps to query all sensors.
+    # The frequency is 10Hz
+    cam_timestamps = loader._sdb.per_log_lidar_timestamps_index[log_id]
+    for ts in cam_timestamps:
+        cam_ring_fpath = [loader.get_closest_img_fpath(
+                log_id, cam_name, ts
+            ) for cam_name in CAM_NAMES]
+        lidar_fpath = loader.get_closest_lidar_fpath(log_id, ts)
+
+        # If bad sensor synchronization, discard the sample
+        if None in cam_ring_fpath or lidar_fpath is None:
+            discarded += 1
+            continue
+
+        cams = {}
+        for i, cam_name in enumerate(CAM_NAMES):
+            pinhole_cam = loader.get_log_pinhole_camera(log_id, cam_name)
+            cams[cam_name] = dict(
+                img_fpath=str(cam_ring_fpath[i]),
+                intrinsics=pinhole_cam.intrinsics.K,
+                extrinsics=pinhole_cam.extrinsics,
+            )
+        
+        city_SE3_ego = loader.get_city_SE3_ego(log_id, int(ts))
+        e2g_translation = city_SE3_ego.translation
+        e2g_rotation = city_SE3_ego.rotation
+        
+        samples.append(dict(
+            e2g_translation=e2g_translation,
+            e2g_rotation=e2g_rotation,
+            cams=cams, 
+            lidar_fpath=str(lidar_fpath),
+            # map_fpath=map_fname,
+            timestamp=str(ts),
+            log_id=log_id,
+            token=str(log_id+'_'+str(ts))))
+
+    return samples, discarded
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    for name in ['train', 'val', 'test']:
+        create_av2_infos_mp(
+            root_path=args.data_root,
+            split=name,
+            info_prefix='av2',
+            dest_path=args.data_root,)
\ No newline at end of file
diff --git a/model_examples/MapTR/tools/data_converter/create_gt_database.py b/model_examples/MapTR/tools/data_converter/create_gt_database.py
new file mode 100644
index 0000000000000000000000000000000000000000..7317cedd08377643018b7d4a72f7b5c96397b59c
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/create_gt_database.py
@@ -0,0 +1,338 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import pickle
+from mmcv import track_iter_progress
+from mmcv.ops import roi_align
+from os import path as osp
+from pycocotools import mask as maskUtils
+from pycocotools.coco import COCO
+
+from mmdet3d.core.bbox import box_np_ops as box_np_ops
+from mmdet3d.datasets import build_dataset
+from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
+
+
+def _poly2mask(mask_ann, img_h, img_w):
+    if isinstance(mask_ann, list):
+        # polygon -- a single object might consist of multiple parts
+        # we merge all parts into one mask rle code
+        rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+        rle = maskUtils.merge(rles)
+    elif isinstance(mask_ann['counts'], list):
+        # uncompressed RLE
+        rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+    else:
+        # rle
+        rle = mask_ann
+    mask = maskUtils.decode(rle)
+    return mask
+
+
+def _parse_coco_ann_info(ann_info):
+    gt_bboxes = []
+    gt_labels = []
+    gt_bboxes_ignore = []
+    gt_masks_ann = []
+
+    for i, ann in enumerate(ann_info):
+        if ann.get('ignore', False):
+            continue
+        x1, y1, w, h = ann['bbox']
+        if ann['area'] <= 0:
+            continue
+        bbox = [x1, y1, x1 + w, y1 + h]
+        if ann.get('iscrowd', False):
+            gt_bboxes_ignore.append(bbox)
+        else:
+            gt_bboxes.append(bbox)
+            gt_masks_ann.append(ann['segmentation'])
+
+    if gt_bboxes:
+        gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+        gt_labels = np.array(gt_labels, dtype=np.int64)
+    else:
+        gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+        gt_labels = np.array([], dtype=np.int64)
+
+    if gt_bboxes_ignore:
+        gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+    else:
+        gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+    ann = dict(
+        bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann)
+
+    return ann
+
+
+def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks):
+    import torch
+    from torch.nn.modules.utils import _pair
+    device = pos_proposals.device
+    num_pos = pos_proposals.size(0)
+    fake_inds = (
+        torch.arange(num_pos,
+                     device=device).to(dtype=pos_proposals.dtype)[:, None])
+    rois = torch.cat([fake_inds, pos_proposals], dim=1)  # Nx5
+    mask_size = _pair(28)
+    rois = rois.to(device=device)
+    gt_masks_th = (
+        torch.from_numpy(gt_masks).to(device).index_select(
+            0, pos_assigned_gt_inds).to(dtype=rois.dtype))
+    # Use RoIAlign could apparently accelerate the training (~0.1s/iter)
+    targets = (
+        roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1))
+    return targets
+
+
+def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img):
+    num_pos = pos_proposals.shape[0]
+    masks = []
+    img_patches = []
+    for i in range(num_pos):
+        gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+        bbox = pos_proposals[i, :].astype(np.int32)
+        x1, y1, x2, y2 = bbox
+        w = np.maximum(x2 - x1 + 1, 1)
+        h = np.maximum(y2 - y1 + 1, 1)
+
+        mask_patch = gt_mask[y1:y1 + h, x1:x1 + w]
+        masked_img = gt_mask[..., None] * org_img
+        img_patch = masked_img[y1:y1 + h, x1:x1 + w]
+
+        img_patches.append(img_patch)
+        masks.append(mask_patch)
+    return img_patches, masks
+
+
+def create_groundtruth_database(dataset_class_name,
+                                data_path,
+                                info_prefix,
+                                info_path=None,
+                                mask_anno_path=None,
+                                used_classes=None,
+                                database_save_path=None,
+                                db_info_save_path=None,
+                                relative_path=True,
+                                add_rgb=False,
+                                lidar_only=False,
+                                bev_only=False,
+                                coors_range=None,
+                                with_mask=False):
+    """Given the raw data, generate the ground truth database.
+
+    Args:
+        dataset_class_name （str): Name of the input dataset.
+        data_path (str): Path of the data.
+        info_prefix (str): Prefix of the info file.
+        info_path (str): Path of the info file.
+            Default: None.
+        mask_anno_path (str): Path of the mask_anno.
+            Default: None.
+        used_classes (list[str]): Classes have been used.
+            Default: None.
+        database_save_path (str): Path to save database.
+            Default: None.
+        db_info_save_path (str): Path to save db_info.
+            Default: None.
+        relative_path (bool): Whether to use relative path.
+            Default: True.
+        with_mask (bool): Whether to use mask.
+            Default: False.
+    """
+    print(f'Create GT Database of {dataset_class_name}')
+    dataset_cfg = dict(
+        type=dataset_class_name, data_root=data_path, ann_file=info_path)
+    if dataset_class_name == 'KittiDataset':
+        file_client_args = dict(backend='disk')
+        dataset_cfg.update(
+            test_mode=False,
+            split='training',
+            modality=dict(
+                use_lidar=True,
+                use_depth=False,
+                use_lidar_intensity=True,
+                use_camera=with_mask,
+            ),
+            pipeline=[
+                dict(
+                    type='LoadPointsFromFile',
+                    coord_type='LIDAR',
+                    load_dim=4,
+                    use_dim=4,
+                    file_client_args=file_client_args),
+                dict(
+                    type='LoadAnnotations3D',
+                    with_bbox_3d=True,
+                    with_label_3d=True,
+                    file_client_args=file_client_args)
+            ])
+
+    elif dataset_class_name == 'NuScenesDataset':
+        dataset_cfg.update(
+            use_valid_flag=True,
+            pipeline=[
+                dict(
+                    type='LoadPointsFromFile',
+                    coord_type='LIDAR',
+                    load_dim=5,
+                    use_dim=5),
+                dict(
+                    type='LoadPointsFromMultiSweeps',
+                    sweeps_num=10,
+                    use_dim=[0, 1, 2, 3, 4],
+                    pad_empty_sweeps=True,
+                    remove_close=True),
+                dict(
+                    type='LoadAnnotations3D',
+                    with_bbox_3d=True,
+                    with_label_3d=True)
+            ])
+
+    elif dataset_class_name == 'WaymoDataset':
+        file_client_args = dict(backend='disk')
+        dataset_cfg.update(
+            test_mode=False,
+            split='training',
+            modality=dict(
+                use_lidar=True,
+                use_depth=False,
+                use_lidar_intensity=True,
+                use_camera=False,
+            ),
+            pipeline=[
+                dict(
+                    type='LoadPointsFromFile',
+                    coord_type='LIDAR',
+                    load_dim=6,
+                    use_dim=5,
+                    file_client_args=file_client_args),
+                dict(
+                    type='LoadAnnotations3D',
+                    with_bbox_3d=True,
+                    with_label_3d=True,
+                    file_client_args=file_client_args)
+            ])
+
+    dataset = build_dataset(dataset_cfg)
+
+    if database_save_path is None:
+        database_save_path = osp.join(data_path, f'{info_prefix}_gt_database')
+    if db_info_save_path is None:
+        db_info_save_path = osp.join(data_path,
+                                     f'{info_prefix}_dbinfos_train.pkl')
+    mmcv.mkdir_or_exist(database_save_path)
+    all_db_infos = dict()
+    if with_mask:
+        coco = COCO(osp.join(data_path, mask_anno_path))
+        imgIds = coco.getImgIds()
+        file2id = dict()
+        for i in imgIds:
+            info = coco.loadImgs([i])[0]
+            file2id.update({info['file_name']: i})
+
+    group_counter = 0
+    for j in track_iter_progress(list(range(len(dataset)))):
+        input_dict = dataset.get_data_info(j)
+        dataset.pre_pipeline(input_dict)
+        example = dataset.pipeline(input_dict)
+        annos = example['ann_info']
+        image_idx = example['sample_idx']
+        points = example['points'].tensor.numpy()
+        gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy()
+        names = annos['gt_names']
+        group_dict = dict()
+        if 'group_ids' in annos:
+            group_ids = annos['group_ids']
+        else:
+            group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64)
+        difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32)
+        if 'difficulty' in annos:
+            difficulty = annos['difficulty']
+
+        num_obj = gt_boxes_3d.shape[0]
+        point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d)
+
+        if with_mask:
+            # prepare masks
+            gt_boxes = annos['gt_bboxes']
+            img_path = osp.split(example['img_info']['filename'])[-1]
+            if img_path not in file2id.keys():
+                print(f'skip image {img_path} for empty mask')
+                continue
+            img_id = file2id[img_path]
+            kins_annIds = coco.getAnnIds(imgIds=img_id)
+            kins_raw_info = coco.loadAnns(kins_annIds)
+            kins_ann_info = _parse_coco_ann_info(kins_raw_info)
+            h, w = annos['img_shape'][:2]
+            gt_masks = [
+                _poly2mask(mask, h, w) for mask in kins_ann_info['masks']
+            ]
+            # get mask inds based on iou mapping
+            bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes)
+            mask_inds = bbox_iou.argmax(axis=0)
+            valid_inds = (bbox_iou.max(axis=0) > 0.5)
+
+            # mask the image
+            # use more precise crop when it is ready
+            # object_img_patches = np.ascontiguousarray(
+            #     np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2))
+            # crop image patches using roi_align
+            # object_img_patches = crop_image_patch_v2(
+            #     torch.Tensor(gt_boxes),
+            #     torch.Tensor(mask_inds).long(), object_img_patches)
+            object_img_patches, object_masks = crop_image_patch(
+                gt_boxes, gt_masks, mask_inds, annos['img'])
+
+        for i in range(num_obj):
+            filename = f'{image_idx}_{names[i]}_{i}.bin'
+            abs_filepath = osp.join(database_save_path, filename)
+            rel_filepath = osp.join(f'{info_prefix}_gt_database', filename)
+
+            # save point clouds and image patches for each object
+            gt_points = points[point_indices[:, i]]
+            gt_points[:, :3] -= gt_boxes_3d[i, :3]
+
+            if with_mask:
+                if object_masks[i].sum() == 0 or not valid_inds[i]:
+                    # Skip object for empty or invalid mask
+                    continue
+                img_patch_path = abs_filepath + '.png'
+                mask_patch_path = abs_filepath + '.mask.png'
+                mmcv.imwrite(object_img_patches[i], img_patch_path)
+                mmcv.imwrite(object_masks[i], mask_patch_path)
+
+            with open(abs_filepath, 'w') as f:
+                gt_points.tofile(f)
+
+            if (used_classes is None) or names[i] in used_classes:
+                db_info = {
+                    'name': names[i],
+                    'path': rel_filepath,
+                    'image_idx': image_idx,
+                    'gt_idx': i,
+                    'box3d_lidar': gt_boxes_3d[i],
+                    'num_points_in_gt': gt_points.shape[0],
+                    'difficulty': difficulty[i],
+                }
+                local_group_id = group_ids[i]
+                # if local_group_id >= 0:
+                if local_group_id not in group_dict:
+                    group_dict[local_group_id] = group_counter
+                    group_counter += 1
+                db_info['group_id'] = group_dict[local_group_id]
+                if 'score' in annos:
+                    db_info['score'] = annos['score'][i]
+                if with_mask:
+                    db_info.update({'box2d_camera': gt_boxes[i]})
+                if names[i] in all_db_infos:
+                    all_db_infos[names[i]].append(db_info)
+                else:
+                    all_db_infos[names[i]] = [db_info]
+
+    for k, v in all_db_infos.items():
+        print(f'load {len(v)} {k} database infos')
+
+    with open(db_info_save_path, 'wb') as f:
+        pickle.dump(all_db_infos, f)
diff --git a/model_examples/MapTR/tools/data_converter/indoor_converter.py b/model_examples/MapTR/tools/data_converter/indoor_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..4072397605992869f63889c1b8d1dda5ad44817c
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/indoor_converter.py
@@ -0,0 +1,108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+
+from tools.data_converter.s3dis_data_utils import S3DISData, S3DISSegData
+from tools.data_converter.scannet_data_utils import ScanNetData, ScanNetSegData
+from tools.data_converter.sunrgbd_data_utils import SUNRGBDData
+
+
+def create_indoor_info_file(data_path,
+                            pkl_prefix='sunrgbd',
+                            save_path=None,
+                            use_v1=False,
+                            workers=4):
+    """Create indoor information file.
+
+    Get information of the raw data and save it to the pkl file.
+
+    Args:
+        data_path (str): Path of the data.
+        pkl_prefix (str): Prefix of the pkl to be saved. Default: 'sunrgbd'.
+        save_path (str): Path of the pkl to be saved. Default: None.
+        use_v1 (bool): Whether to use v1. Default: False.
+        workers (int): Number of threads to be used. Default: 4.
+    """
+    assert os.path.exists(data_path)
+    assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \
+        f'unsupported indoor dataset {pkl_prefix}'
+    save_path = data_path if save_path is None else save_path
+    assert os.path.exists(save_path)
+
+    # generate infos for both detection and segmentation task
+    if pkl_prefix in ['sunrgbd', 'scannet']:
+        train_filename = os.path.join(save_path,
+                                      f'{pkl_prefix}_infos_train.pkl')
+        val_filename = os.path.join(save_path, f'{pkl_prefix}_infos_val.pkl')
+        if pkl_prefix == 'sunrgbd':
+            # SUN RGB-D has a train-val split
+            train_dataset = SUNRGBDData(
+                root_path=data_path, split='train', use_v1=use_v1)
+            val_dataset = SUNRGBDData(
+                root_path=data_path, split='val', use_v1=use_v1)
+        else:
+            # ScanNet has a train-val-test split
+            train_dataset = ScanNetData(root_path=data_path, split='train')
+            val_dataset = ScanNetData(root_path=data_path, split='val')
+            test_dataset = ScanNetData(root_path=data_path, split='test')
+            test_filename = os.path.join(save_path,
+                                         f'{pkl_prefix}_infos_test.pkl')
+
+        infos_train = train_dataset.get_infos(
+            num_workers=workers, has_label=True)
+        mmcv.dump(infos_train, train_filename, 'pkl')
+        print(f'{pkl_prefix} info train file is saved to {train_filename}')
+
+        infos_val = val_dataset.get_infos(num_workers=workers, has_label=True)
+        mmcv.dump(infos_val, val_filename, 'pkl')
+        print(f'{pkl_prefix} info val file is saved to {val_filename}')
+
+    if pkl_prefix == 'scannet':
+        infos_test = test_dataset.get_infos(
+            num_workers=workers, has_label=False)
+        mmcv.dump(infos_test, test_filename, 'pkl')
+        print(f'{pkl_prefix} info test file is saved to {test_filename}')
+
+    # generate infos for the semantic segmentation task
+    # e.g. re-sampled scene indexes and label weights
+    # scene indexes are used to re-sample rooms with different number of points
+    # label weights are used to balance classes with different number of points
+    if pkl_prefix == 'scannet':
+        # label weight computation function is adopted from
+        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+        train_dataset = ScanNetSegData(
+            data_root=data_path,
+            ann_file=train_filename,
+            split='train',
+            num_points=8192,
+            label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+        # TODO: do we need to generate on val set?
+        val_dataset = ScanNetSegData(
+            data_root=data_path,
+            ann_file=val_filename,
+            split='val',
+            num_points=8192,
+            label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+        # no need to generate for test set
+        train_dataset.get_seg_infos()
+        val_dataset.get_seg_infos()
+    elif pkl_prefix == 's3dis':
+        # S3DIS doesn't have a fixed train-val split
+        # it has 6 areas instead, so we generate info file for each of them
+        # in training, we will use dataset to wrap different areas
+        splits = [f'Area_{i}' for i in [1, 2, 3, 4, 5, 6]]
+        for split in splits:
+            dataset = S3DISData(root_path=data_path, split=split)
+            info = dataset.get_infos(num_workers=workers, has_label=True)
+            filename = os.path.join(save_path,
+                                    f'{pkl_prefix}_infos_{split}.pkl')
+            mmcv.dump(info, filename, 'pkl')
+            print(f'{pkl_prefix} info {split} file is saved to {filename}')
+            seg_dataset = S3DISSegData(
+                data_root=data_path,
+                ann_file=filename,
+                split=split,
+                num_points=4096,
+                label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
+            seg_dataset.get_seg_infos()
diff --git a/model_examples/MapTR/tools/data_converter/kitti_converter.py b/model_examples/MapTR/tools/data_converter/kitti_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..47eec6abc0de1ee62283b056d901ef6b4c592960
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/kitti_converter.py
@@ -0,0 +1,546 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from collections import OrderedDict
+from nuscenes.utils.geometry_utils import view_points
+from pathlib import Path
+
+from mmdet3d.core.bbox import box_np_ops
+from .kitti_data_utils import get_kitti_image_info, get_waymo_image_info
+from .nuscenes_converter import post_process_coords
+
+kitti_categories = ('Pedestrian', 'Cyclist', 'Car')
+
+
+def convert_to_kitti_info_version2(info):
+    """convert kitti info v1 to v2 if possible.
+
+    Args:
+        info (dict): Info of the input kitti data.
+            - image (dict): image info
+            - calib (dict): calibration info
+            - point_cloud (dict): point cloud info
+    """
+    if 'image' not in info or 'calib' not in info or 'point_cloud' not in info:
+        info['image'] = {
+            'image_shape': info['img_shape'],
+            'image_idx': info['image_idx'],
+            'image_path': info['img_path'],
+        }
+        info['calib'] = {
+            'R0_rect': info['calib/R0_rect'],
+            'Tr_velo_to_cam': info['calib/Tr_velo_to_cam'],
+            'P2': info['calib/P2'],
+        }
+        info['point_cloud'] = {
+            'velodyne_path': info['velodyne_path'],
+        }
+
+
+def _read_imageset_file(path):
+    with open(path, 'r') as f:
+        lines = f.readlines()
+    return [int(line) for line in lines]
+
+
+def _calculate_num_points_in_gt(data_path,
+                                infos,
+                                relative_path,
+                                remove_outside=True,
+                                num_features=4):
+    for info in mmcv.track_iter_progress(infos):
+        pc_info = info['point_cloud']
+        image_info = info['image']
+        calib = info['calib']
+        if relative_path:
+            v_path = str(Path(data_path) / pc_info['velodyne_path'])
+        else:
+            v_path = pc_info['velodyne_path']
+        points_v = np.fromfile(
+            v_path, dtype=np.float32, count=-1).reshape([-1, num_features])
+        rect = calib['R0_rect']
+        Trv2c = calib['Tr_velo_to_cam']
+        P2 = calib['P2']
+        if remove_outside:
+            points_v = box_np_ops.remove_outside_points(
+                points_v, rect, Trv2c, P2, image_info['image_shape'])
+
+        # points_v = points_v[points_v[:, 0] > 0]
+        annos = info['annos']
+        num_obj = len([n for n in annos['name'] if n != 'DontCare'])
+        # annos = kitti.filter_kitti_anno(annos, ['DontCare'])
+        dims = annos['dimensions'][:num_obj]
+        loc = annos['location'][:num_obj]
+        rots = annos['rotation_y'][:num_obj]
+        gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]],
+                                         axis=1)
+        gt_boxes_lidar = box_np_ops.box_camera_to_lidar(
+            gt_boxes_camera, rect, Trv2c)
+        indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar)
+        num_points_in_gt = indices.sum(0)
+        num_ignored = len(annos['dimensions']) - num_obj
+        num_points_in_gt = np.concatenate(
+            [num_points_in_gt, -np.ones([num_ignored])])
+        annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32)
+
+
+def create_kitti_info_file(data_path,
+                           pkl_prefix='kitti',
+                           save_path=None,
+                           relative_path=True):
+    """Create info file of KITTI dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        data_path (str): Path of the data root.
+        pkl_prefix (str): Prefix of the info file to be generated.
+        save_path (str): Path to save the info file.
+        relative_path (bool): Whether to use relative path.
+    """
+    imageset_folder = Path(data_path) / 'ImageSets'
+    train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
+
+    val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
+    test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
+
+    print('Generate info. this may take several minutes.')
+    if save_path is None:
+        save_path = Path(data_path)
+    else:
+        save_path = Path(save_path)
+    kitti_infos_train = get_kitti_image_info(
+        data_path,
+        training=True,
+        velodyne=True,
+        calib=True,
+        image_ids=train_img_ids,
+        relative_path=relative_path)
+    _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path)
+    filename = save_path / f'{pkl_prefix}_infos_train.pkl'
+    print(f'Kitti info train file is saved to {filename}')
+    mmcv.dump(kitti_infos_train, filename)
+    kitti_infos_val = get_kitti_image_info(
+        data_path,
+        training=True,
+        velodyne=True,
+        calib=True,
+        image_ids=val_img_ids,
+        relative_path=relative_path)
+    _calculate_num_points_in_gt(data_path, kitti_infos_val, relative_path)
+    filename = save_path / f'{pkl_prefix}_infos_val.pkl'
+    print(f'Kitti info val file is saved to {filename}')
+    mmcv.dump(kitti_infos_val, filename)
+    filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
+    print(f'Kitti info trainval file is saved to {filename}')
+    mmcv.dump(kitti_infos_train + kitti_infos_val, filename)
+
+    kitti_infos_test = get_kitti_image_info(
+        data_path,
+        training=False,
+        label_info=False,
+        velodyne=True,
+        calib=True,
+        image_ids=test_img_ids,
+        relative_path=relative_path)
+    filename = save_path / f'{pkl_prefix}_infos_test.pkl'
+    print(f'Kitti info test file is saved to {filename}')
+    mmcv.dump(kitti_infos_test, filename)
+
+
+def create_waymo_info_file(data_path,
+                           pkl_prefix='waymo',
+                           save_path=None,
+                           relative_path=True,
+                           max_sweeps=5):
+    """Create info file of waymo dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        data_path (str): Path of the data root.
+        pkl_prefix (str): Prefix of the info file to be generated.
+        save_path (str | None): Path to save the info file.
+        relative_path (bool): Whether to use relative path.
+        max_sweeps (int): Max sweeps before the detection frame to be used.
+    """
+    imageset_folder = Path(data_path) / 'ImageSets'
+    train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
+    # val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
+    # test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
+    train_img_ids = [each for each in train_img_ids if each % 5 == 0]
+    print('Generate info. this may take several minutes.')
+    if save_path is None:
+        save_path = Path(data_path)
+    else:
+        save_path = Path(save_path)
+    waymo_infos_train = get_waymo_image_info(
+        data_path,
+        training=True,
+        velodyne=True,
+        calib=True,
+        pose=True,
+        image_ids=train_img_ids,
+        relative_path=relative_path,
+        max_sweeps=max_sweeps)
+    _calculate_num_points_in_gt(
+        data_path,
+        waymo_infos_train,
+        relative_path,
+        num_features=6,
+        remove_outside=False)
+    filename = save_path / f'{pkl_prefix}_infos_train.pkl'
+    print(f'Waymo info train file is saved to {filename}')
+    mmcv.dump(waymo_infos_train, filename)
+    #
+    # waymo_infos_val = get_waymo_image_info(
+    #     data_path,
+    #     training=True,
+    #     velodyne=True,
+    #     calib=True,
+    #     pose=True,
+    #     image_ids=val_img_ids,
+    #     relative_path=relative_path,
+    #     max_sweeps=max_sweeps)
+    # _calculate_num_points_in_gt(
+    #     data_path,
+    #     waymo_infos_val,
+    #     relative_path,
+    #     num_features=6,
+    #     remove_outside=False)
+    # filename = save_path / f'{pkl_prefix}_infos_val.pkl'
+    # print(f'Waymo info val file is saved to {filename}')
+    # mmcv.dump(waymo_infos_val, filename)
+    # filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
+    # print(f'Waymo info trainval file is saved to {filename}')
+    # mmcv.dump(waymo_infos_train + waymo_infos_val, filename)
+    # waymo_infos_test = get_waymo_image_info(
+    #     data_path,
+    #     training=False,
+    #     label_info=False,
+    #     velodyne=True,
+    #     calib=True,
+    #     pose=True,
+    #     image_ids=test_img_ids,
+    #     relative_path=relative_path,
+    #     max_sweeps=max_sweeps)
+    # filename = save_path / f'{pkl_prefix}_infos_test.pkl'
+    # print(f'Waymo info test file is saved to {filename}')
+    # mmcv.dump(waymo_infos_test, filename)
+
+
+def _create_reduced_point_cloud(data_path,
+                                info_path,
+                                save_path=None,
+                                back=False,
+                                num_features=4,
+                                front_camera_id=2):
+    """Create reduced point clouds for given info.
+
+    Args:
+        data_path (str): Path of original data.
+        info_path (str): Path of data info.
+        save_path (str | None): Path to save reduced point cloud data.
+            Default: None.
+        back (bool): Whether to flip the points to back.
+        num_features (int): Number of point features. Default: 4.
+        front_camera_id (int): The referenced/front camera ID. Default: 2.
+    """
+    kitti_infos = mmcv.load(info_path)
+
+    for info in mmcv.track_iter_progress(kitti_infos):
+        pc_info = info['point_cloud']
+        image_info = info['image']
+        calib = info['calib']
+
+        v_path = pc_info['velodyne_path']
+        v_path = Path(data_path) / v_path
+        points_v = np.fromfile(
+            str(v_path), dtype=np.float32,
+            count=-1).reshape([-1, num_features])
+        rect = calib['R0_rect']
+        if front_camera_id == 2:
+            P2 = calib['P2']
+        else:
+            P2 = calib[f'P{str(front_camera_id)}']
+        Trv2c = calib['Tr_velo_to_cam']
+        # first remove z < 0 points
+        # keep = points_v[:, -1] > 0
+        # points_v = points_v[keep]
+        # then remove outside.
+        if back:
+            points_v[:, 0] = -points_v[:, 0]
+        points_v = box_np_ops.remove_outside_points(points_v, rect, Trv2c, P2,
+                                                    image_info['image_shape'])
+        if save_path is None:
+            save_dir = v_path.parent.parent / (v_path.parent.stem + '_reduced')
+            if not save_dir.exists():
+                save_dir.mkdir()
+            save_filename = save_dir / v_path.name
+            # save_filename = str(v_path) + '_reduced'
+            if back:
+                save_filename += '_back'
+        else:
+            save_filename = str(Path(save_path) / v_path.name)
+            if back:
+                save_filename += '_back'
+        with open(save_filename, 'w') as f:
+            points_v.tofile(f)
+
+
+def create_reduced_point_cloud(data_path,
+                               pkl_prefix,
+                               train_info_path=None,
+                               val_info_path=None,
+                               test_info_path=None,
+                               save_path=None,
+                               with_back=False):
+    """Create reduced point clouds for training/validation/testing.
+
+    Args:
+        data_path (str): Path of original data.
+        pkl_prefix (str): Prefix of info files.
+        train_info_path (str | None): Path of training set info.
+            Default: None.
+        val_info_path (str | None): Path of validation set info.
+            Default: None.
+        test_info_path (str | None): Path of test set info.
+            Default: None.
+        save_path (str | None): Path to save reduced point cloud data.
+        with_back (bool): Whether to flip the points to back.
+    """
+    if train_info_path is None:
+        train_info_path = Path(data_path) / f'{pkl_prefix}_infos_train.pkl'
+    if val_info_path is None:
+        val_info_path = Path(data_path) / f'{pkl_prefix}_infos_val.pkl'
+    if test_info_path is None:
+        test_info_path = Path(data_path) / f'{pkl_prefix}_infos_test.pkl'
+
+    print('create reduced point cloud for training set')
+    _create_reduced_point_cloud(data_path, train_info_path, save_path)
+    print('create reduced point cloud for validation set')
+    _create_reduced_point_cloud(data_path, val_info_path, save_path)
+    print('create reduced point cloud for testing set')
+    _create_reduced_point_cloud(data_path, test_info_path, save_path)
+    if with_back:
+        _create_reduced_point_cloud(
+            data_path, train_info_path, save_path, back=True)
+        _create_reduced_point_cloud(
+            data_path, val_info_path, save_path, back=True)
+        _create_reduced_point_cloud(
+            data_path, test_info_path, save_path, back=True)
+
+
+def export_2d_annotation(root_path, info_path, mono3d=True):
+    """Export 2d annotation from the info file and raw data.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        info_path (str): Path of the info file.
+        mono3d (bool): Whether to export mono3d annotation. Default: True.
+    """
+    # get bbox annotations for camera
+    kitti_infos = mmcv.load(info_path)
+    cat2Ids = [
+        dict(id=kitti_categories.index(cat_name), name=cat_name)
+        for cat_name in kitti_categories
+    ]
+    coco_ann_id = 0
+    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+    from os import path as osp
+    for info in mmcv.track_iter_progress(kitti_infos):
+        coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d)
+        (height, width,
+         _) = mmcv.imread(osp.join(root_path,
+                                   info['image']['image_path'])).shape
+        coco_2d_dict['images'].append(
+            dict(
+                file_name=info['image']['image_path'],
+                id=info['image']['image_idx'],
+                Tri2v=info['calib']['Tr_imu_to_velo'],
+                Trv2c=info['calib']['Tr_velo_to_cam'],
+                rect=info['calib']['R0_rect'],
+                cam_intrinsic=info['calib']['P2'],
+                width=width,
+                height=height))
+        for coco_info in coco_infos:
+            if coco_info is None:
+                continue
+            # add an empty key for coco format
+            coco_info['segmentation'] = []
+            coco_info['id'] = coco_ann_id
+            coco_2d_dict['annotations'].append(coco_info)
+            coco_ann_id += 1
+    if mono3d:
+        json_prefix = f'{info_path[:-4]}_mono3d'
+    else:
+        json_prefix = f'{info_path[:-4]}'
+    mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(info, occluded, mono3d=True):
+    """Get the 2D annotation records for a given info.
+
+    Args:
+        info: Information of the given sample data.
+        occluded: Integer (0, 1, 2, 3) indicating occlusion state: \
+            0 = fully visible, 1 = partly occluded, 2 = largely occluded, \
+            3 = unknown, -1 = DontCare
+        mono3d (bool): Whether to get boxes with mono3d annotation.
+
+    Return:
+        list[dict]: List of 2D annotation record that belongs to the input
+            `sample_data_token`.
+    """
+    # Get calibration information
+    P2 = info['calib']['P2']
+
+    repro_recs = []
+    # if no annotations in info (test dataset), then return
+    if 'annos' not in info:
+        return repro_recs
+
+    # Get all the annotation with the specified visibilties.
+    ann_dicts = info['annos']
+    mask = [(ocld in occluded) for ocld in ann_dicts['occluded']]
+    for k in ann_dicts.keys():
+        ann_dicts[k] = ann_dicts[k][mask]
+
+    # convert dict of list to list of dict
+    ann_recs = []
+    for i in range(len(ann_dicts['occluded'])):
+        ann_rec = {}
+        for k in ann_dicts.keys():
+            ann_rec[k] = ann_dicts[k][i]
+        ann_recs.append(ann_rec)
+
+    for ann_idx, ann_rec in enumerate(ann_recs):
+        # Augment sample_annotation with token information.
+        ann_rec['sample_annotation_token'] = \
+            f"{info['image']['image_idx']}.{ann_idx}"
+        ann_rec['sample_data_token'] = info['image']['image_idx']
+        sample_data_token = info['image']['image_idx']
+
+        loc = ann_rec['location'][np.newaxis, :]
+        dim = ann_rec['dimensions'][np.newaxis, :]
+        rot = ann_rec['rotation_y'][np.newaxis, np.newaxis]
+        # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5]
+        dst = np.array([0.5, 0.5, 0.5])
+        src = np.array([0.5, 1.0, 0.5])
+        loc = loc + dim * (dst - src)
+        offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \
+            / info['calib']['P2'][0, 0]
+        loc_3d = np.copy(loc)
+        loc_3d[0, 0] += offset
+        gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32)
+
+        # Filter out the corners that are not in front of the calibrated
+        # sensor.
+        corners_3d = box_np_ops.center_to_corner_box3d(
+            gt_bbox_3d[:, :3],
+            gt_bbox_3d[:, 3:6],
+            gt_bbox_3d[:, 6], [0.5, 0.5, 0.5],
+            axis=1)
+        corners_3d = corners_3d[0].T  # (1, 8, 3) -> (3, 8)
+        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+        corners_3d = corners_3d[:, in_front]
+
+        # Project 3d box to 2d.
+        camera_intrinsic = P2
+        corner_coords = view_points(corners_3d, camera_intrinsic,
+                                    True).T[:, :2].tolist()
+
+        # Keep only corners that fall within the image.
+        final_coords = post_process_coords(corner_coords)
+
+        # Skip if the convex hull of the re-projected corners
+        # does not intersect the image canvas.
+        if final_coords is None:
+            continue
+        else:
+            min_x, min_y, max_x, max_y = final_coords
+
+        # Generate dictionary record to be included in the .json file.
+        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+                                    sample_data_token,
+                                    info['image']['image_path'])
+
+        # If mono3d=True, add 3D annotations in camera coordinates
+        if mono3d and (repro_rec is not None):
+            repro_rec['bbox_cam3d'] = np.concatenate(
+                [loc_3d, dim, rot],
+                axis=1).astype(np.float32).squeeze().tolist()
+            repro_rec['velo_cam3d'] = -1  # no velocity in KITTI
+
+            center3d = np.array(loc).reshape([1, 3])
+            center2d = box_np_ops.points_cam2img(
+                center3d, camera_intrinsic, with_depth=True)
+            repro_rec['center2d'] = center2d.squeeze().tolist()
+            # normalized center2D + depth
+            # samples with depth < 0 will be removed
+            if repro_rec['center2d'][2] <= 0:
+                continue
+
+            repro_rec['attribute_name'] = -1  # no attribute in KITTI
+            repro_rec['attribute_id'] = -1
+
+        repro_recs.append(repro_rec)
+
+    return repro_recs
+
+
+def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename):
+    """Generate one 2D annotation record given various informations on top of
+    the 2D bounding box coordinates.
+
+    Args:
+        ann_rec (dict): Original 3d annotation record.
+        x1 (float): Minimum value of the x coordinate.
+        y1 (float): Minimum value of the y coordinate.
+        x2 (float): Maximum value of the x coordinate.
+        y2 (float): Maximum value of the y coordinate.
+        sample_data_token (str): Sample data token.
+        filename (str):The corresponding image file where the annotation
+            is present.
+
+    Returns:
+        dict: A sample 2D annotation record.
+            - file_name (str): flie name
+            - image_id (str): sample data token
+            - area (float): 2d box area
+            - category_name (str): category name
+            - category_id (int): category id
+            - bbox (list[float]): left x, top y, dx, dy of 2d box
+            - iscrowd (int): whether the area is crowd
+    """
+    repro_rec = OrderedDict()
+    repro_rec['sample_data_token'] = sample_data_token
+    coco_rec = dict()
+
+    key_mapping = {
+        'name': 'category_name',
+        'num_points_in_gt': 'num_lidar_pts',
+        'sample_annotation_token': 'sample_annotation_token',
+        'sample_data_token': 'sample_data_token',
+    }
+
+    for key, value in ann_rec.items():
+        if key in key_mapping.keys():
+            repro_rec[key_mapping[key]] = value
+
+    repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+    repro_rec['filename'] = filename
+
+    coco_rec['file_name'] = filename
+    coco_rec['image_id'] = sample_data_token
+    coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+    if repro_rec['category_name'] not in kitti_categories:
+        return None
+    cat_name = repro_rec['category_name']
+    coco_rec['category_name'] = cat_name
+    coco_rec['category_id'] = kitti_categories.index(cat_name)
+    coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+    coco_rec['iscrowd'] = 0
+
+    return coco_rec
diff --git a/model_examples/MapTR/tools/data_converter/kitti_data_utils.py b/model_examples/MapTR/tools/data_converter/kitti_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..01538e065bafe6bbae1ece233d09c87037fe4044
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/kitti_data_utils.py
@@ -0,0 +1,554 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+from collections import OrderedDict
+from concurrent import futures as futures
+from os import path as osp
+from pathlib import Path
+from skimage import io
+
+
+def get_image_index_str(img_idx, use_prefix_id=False):
+    if use_prefix_id:
+        return '{:07d}'.format(img_idx)
+    else:
+        return '{:06d}'.format(img_idx)
+
+
+def get_kitti_info_path(idx,
+                        prefix,
+                        info_type='image_2',
+                        file_tail='.png',
+                        training=True,
+                        relative_path=True,
+                        exist_check=True,
+                        use_prefix_id=False):
+    img_idx_str = get_image_index_str(idx, use_prefix_id)
+    img_idx_str += file_tail
+    prefix = Path(prefix)
+    if training:
+        file_path = Path('training') / info_type / img_idx_str
+    else:
+        file_path = Path('testing') / info_type / img_idx_str
+    if exist_check and not (prefix / file_path).exists():
+        raise ValueError('file not exist: {}'.format(file_path))
+    if relative_path:
+        return str(file_path)
+    else:
+        return str(prefix / file_path)
+
+
+def get_image_path(idx,
+                   prefix,
+                   training=True,
+                   relative_path=True,
+                   exist_check=True,
+                   info_type='image_2',
+                   use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, info_type, '.png', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_label_path(idx,
+                   prefix,
+                   training=True,
+                   relative_path=True,
+                   exist_check=True,
+                   info_type='label_2',
+                   use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, info_type, '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_velodyne_path(idx,
+                      prefix,
+                      training=True,
+                      relative_path=True,
+                      exist_check=True,
+                      use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, 'velodyne', '.bin', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_calib_path(idx,
+                   prefix,
+                   training=True,
+                   relative_path=True,
+                   exist_check=True,
+                   use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, 'calib', '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_pose_path(idx,
+                  prefix,
+                  training=True,
+                  relative_path=True,
+                  exist_check=True,
+                  use_prefix_id=False):
+    return get_kitti_info_path(idx, prefix, 'pose', '.txt', training,
+                               relative_path, exist_check, use_prefix_id)
+
+
+def get_label_anno(label_path):
+    annotations = {}
+    annotations.update({
+        'name': [],
+        'truncated': [],
+        'occluded': [],
+        'alpha': [],
+        'bbox': [],
+        'dimensions': [],
+        'location': [],
+        'rotation_y': []
+    })
+    with open(label_path, 'r') as f:
+        lines = f.readlines()
+    # if len(lines) == 0 or len(lines[0]) < 15:
+    #     content = []
+    # else:
+    content = [line.strip().split(' ') for line in lines]
+    num_objects = len([x[0] for x in content if x[0] != 'DontCare'])
+    annotations['name'] = np.array([x[0] for x in content])
+    num_gt = len(annotations['name'])
+    annotations['truncated'] = np.array([float(x[1]) for x in content])
+    annotations['occluded'] = np.array([int(x[2]) for x in content])
+    annotations['alpha'] = np.array([float(x[3]) for x in content])
+    annotations['bbox'] = np.array([[float(info) for info in x[4:8]]
+                                    for x in content]).reshape(-1, 4)
+    # dimensions will convert hwl format to standard lhw(camera) format.
+    annotations['dimensions'] = np.array([[float(info) for info in x[8:11]]
+                                          for x in content
+                                          ]).reshape(-1, 3)[:, [2, 0, 1]]
+    annotations['location'] = np.array([[float(info) for info in x[11:14]]
+                                        for x in content]).reshape(-1, 3)
+    annotations['rotation_y'] = np.array([float(x[14])
+                                          for x in content]).reshape(-1)
+    if len(content) != 0 and len(content[0]) == 16:  # have score
+        annotations['score'] = np.array([float(x[15]) for x in content])
+    else:
+        annotations['score'] = np.zeros((annotations['bbox'].shape[0], ))
+    index = list(range(num_objects)) + [-1] * (num_gt - num_objects)
+    annotations['index'] = np.array(index, dtype=np.int32)
+    annotations['group_ids'] = np.arange(num_gt, dtype=np.int32)
+    return annotations
+
+
+def _extend_matrix(mat):
+    mat = np.concatenate([mat, np.array([[0., 0., 0., 1.]])], axis=0)
+    return mat
+
+
+def get_kitti_image_info(path,
+                         training=True,
+                         label_info=True,
+                         velodyne=False,
+                         calib=False,
+                         image_ids=7481,
+                         extend_matrix=True,
+                         num_worker=8,
+                         relative_path=True,
+                         with_imageshape=True):
+    """
+    KITTI annotation format version 2:
+    {
+        [optional]points: [N, 3+] point cloud
+        [optional, for kitti]image: {
+            image_idx: ...
+            image_path: ...
+            image_shape: ...
+        }
+        point_cloud: {
+            num_features: 4
+            velodyne_path: ...
+        }
+        [optional, for kitti]calib: {
+            R0_rect: ...
+            Tr_velo_to_cam: ...
+            P2: ...
+        }
+        annos: {
+            location: [num_gt, 3] array
+            dimensions: [num_gt, 3] array
+            rotation_y: [num_gt] angle array
+            name: [num_gt] ground truth name array
+            [optional]difficulty: kitti difficulty
+            [optional]group_ids: used for multi-part object
+        }
+    }
+    """
+    root_path = Path(path)
+    if not isinstance(image_ids, list):
+        image_ids = list(range(image_ids))
+
+    def map_func(idx):
+        info = {}
+        pc_info = {'num_features': 4}
+        calib_info = {}
+
+        image_info = {'image_idx': idx}
+        annotations = None
+        if velodyne:
+            pc_info['velodyne_path'] = get_velodyne_path(
+                idx, path, training, relative_path)
+        image_info['image_path'] = get_image_path(idx, path, training,
+                                                  relative_path)
+        if with_imageshape:
+            img_path = image_info['image_path']
+            if relative_path:
+                img_path = str(root_path / img_path)
+            image_info['image_shape'] = np.array(
+                io.imread(img_path).shape[:2], dtype=np.int32)
+        if label_info:
+            label_path = get_label_path(idx, path, training, relative_path)
+            if relative_path:
+                label_path = str(root_path / label_path)
+            annotations = get_label_anno(label_path)
+        info['image'] = image_info
+        info['point_cloud'] = pc_info
+        if calib:
+            calib_path = get_calib_path(
+                idx, path, training, relative_path=False)
+            with open(calib_path, 'r') as f:
+                lines = f.readlines()
+            P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            if extend_matrix:
+                P0 = _extend_matrix(P0)
+                P1 = _extend_matrix(P1)
+                P2 = _extend_matrix(P2)
+                P3 = _extend_matrix(P3)
+            R0_rect = np.array([
+                float(info) for info in lines[4].split(' ')[1:10]
+            ]).reshape([3, 3])
+            if extend_matrix:
+                rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
+                rect_4x4[3, 3] = 1.
+                rect_4x4[:3, :3] = R0_rect
+            else:
+                rect_4x4 = R0_rect
+
+            Tr_velo_to_cam = np.array([
+                float(info) for info in lines[5].split(' ')[1:13]
+            ]).reshape([3, 4])
+            Tr_imu_to_velo = np.array([
+                float(info) for info in lines[6].split(' ')[1:13]
+            ]).reshape([3, 4])
+            if extend_matrix:
+                Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
+                Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo)
+            calib_info['P0'] = P0
+            calib_info['P1'] = P1
+            calib_info['P2'] = P2
+            calib_info['P3'] = P3
+            calib_info['R0_rect'] = rect_4x4
+            calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
+            calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo
+            info['calib'] = calib_info
+
+        if annotations is not None:
+            info['annos'] = annotations
+            add_difficulty_to_annos(info)
+        return info
+
+    with futures.ThreadPoolExecutor(num_worker) as executor:
+        image_infos = executor.map(map_func, image_ids)
+
+    return list(image_infos)
+
+
+def get_waymo_image_info(path,
+                         training=True,
+                         label_info=True,
+                         velodyne=False,
+                         calib=False,
+                         pose=False,
+                         image_ids=7481,
+                         extend_matrix=True,
+                         num_worker=8,
+                         relative_path=True,
+                         with_imageshape=True,
+                         max_sweeps=5):
+    """
+    Waymo annotation format version like KITTI:
+    {
+        [optional]points: [N, 3+] point cloud
+        [optional, for kitti]image: {
+            image_idx: ...
+            image_path: ...
+            image_shape: ...
+        }
+        point_cloud: {
+            num_features: 6
+            velodyne_path: ...
+        }
+        [optional, for kitti]calib: {
+            R0_rect: ...
+            Tr_velo_to_cam0: ...
+            P0: ...
+        }
+        annos: {
+            location: [num_gt, 3] array
+            dimensions: [num_gt, 3] array
+            rotation_y: [num_gt] angle array
+            name: [num_gt] ground truth name array
+            [optional]difficulty: kitti difficulty
+            [optional]group_ids: used for multi-part object
+        }
+    }
+    """
+    root_path = Path(path)
+    if not isinstance(image_ids, list):
+        image_ids = list(range(image_ids))
+
+    def map_func(idx):
+        info = {}
+        pc_info = {'num_features': 6}
+        calib_info = {}
+
+        image_info = {'image_idx': idx}
+        annotations = None
+        if velodyne:
+            pc_info['velodyne_path'] = get_velodyne_path(
+                idx, path, training, relative_path, use_prefix_id=True)
+            points = np.fromfile(
+                Path(path) / pc_info['velodyne_path'], dtype=np.float32)
+            points = np.copy(points).reshape(-1, pc_info['num_features'])
+            info['timestamp'] = np.int64(points[0, -1])
+            # values of the last dim are all the timestamp
+        image_info['image_path'] = get_image_path(
+            idx,
+            path,
+            training,
+            relative_path,
+            info_type='image_0',
+            use_prefix_id=True)
+        if with_imageshape:
+            img_path = image_info['image_path']
+            if relative_path:
+                img_path = str(root_path / img_path)
+            image_info['image_shape'] = np.array(
+                io.imread(img_path).shape[:2], dtype=np.int32)
+        if label_info:
+            label_path = get_label_path(
+                idx,
+                path,
+                training,
+                relative_path,
+                info_type='label_all',
+                use_prefix_id=True)
+            if relative_path:
+                label_path = str(root_path / label_path)
+            annotations = get_label_anno(label_path)
+        info['image'] = image_info
+        info['point_cloud'] = pc_info
+        if calib:
+            calib_path = get_calib_path(
+                idx, path, training, relative_path=False, use_prefix_id=True)
+            with open(calib_path, 'r') as f:
+                lines = f.readlines()
+            P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            P4 = np.array([float(info) for info in lines[4].split(' ')[1:13]
+                           ]).reshape([3, 4])
+            if extend_matrix:
+                P0 = _extend_matrix(P0)
+                P1 = _extend_matrix(P1)
+                P2 = _extend_matrix(P2)
+                P3 = _extend_matrix(P3)
+                P4 = _extend_matrix(P4)
+            R0_rect = np.array([
+                float(info) for info in lines[5].split(' ')[1:10]
+            ]).reshape([3, 3])
+            if extend_matrix:
+                rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
+                rect_4x4[3, 3] = 1.
+                rect_4x4[:3, :3] = R0_rect
+            else:
+                rect_4x4 = R0_rect
+
+            Tr_velo_to_cam = np.array([
+                float(info) for info in lines[6].split(' ')[1:13]
+            ]).reshape([3, 4])
+            if extend_matrix:
+                Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
+            calib_info['P0'] = P0
+            calib_info['P1'] = P1
+            calib_info['P2'] = P2
+            calib_info['P3'] = P3
+            calib_info['P4'] = P4
+            calib_info['R0_rect'] = rect_4x4
+            calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
+            info['calib'] = calib_info
+        if pose:
+            pose_path = get_pose_path(
+                idx, path, training, relative_path=False, use_prefix_id=True)
+            info['pose'] = np.loadtxt(pose_path)
+
+        if annotations is not None:
+            info['annos'] = annotations
+            info['annos']['camera_id'] = info['annos'].pop('score')
+            add_difficulty_to_annos(info)
+
+        sweeps = []
+        prev_idx = idx
+        while len(sweeps) < max_sweeps:
+            prev_info = {}
+            prev_idx -= 1
+            prev_info['velodyne_path'] = get_velodyne_path(
+                prev_idx,
+                path,
+                training,
+                relative_path,
+                exist_check=False,
+                use_prefix_id=True)
+            if_prev_exists = osp.exists(
+                Path(path) / prev_info['velodyne_path'])
+            if if_prev_exists:
+                prev_points = np.fromfile(
+                    Path(path) / prev_info['velodyne_path'], dtype=np.float32)
+                prev_points = np.copy(prev_points).reshape(
+                    -1, pc_info['num_features'])
+                prev_info['timestamp'] = np.int64(prev_points[0, -1])
+                prev_pose_path = get_pose_path(
+                    prev_idx,
+                    path,
+                    training,
+                    relative_path=False,
+                    use_prefix_id=True)
+                prev_info['pose'] = np.loadtxt(prev_pose_path)
+                sweeps.append(prev_info)
+            else:
+                break
+        info['sweeps'] = sweeps
+
+        return info
+
+    with futures.ThreadPoolExecutor(num_worker) as executor:
+        image_infos = executor.map(map_func, image_ids)
+
+    return list(image_infos)
+
+
+def kitti_anno_to_label_file(annos, folder):
+    folder = Path(folder)
+    for anno in annos:
+        image_idx = anno['metadata']['image_idx']
+        label_lines = []
+        for j in range(anno['bbox'].shape[0]):
+            label_dict = {
+                'name': anno['name'][j],
+                'alpha': anno['alpha'][j],
+                'bbox': anno['bbox'][j],
+                'location': anno['location'][j],
+                'dimensions': anno['dimensions'][j],
+                'rotation_y': anno['rotation_y'][j],
+                'score': anno['score'][j],
+            }
+            label_line = kitti_result_line(label_dict)
+            label_lines.append(label_line)
+        label_file = folder / f'{get_image_index_str(image_idx)}.txt'
+        label_str = '\n'.join(label_lines)
+        with open(label_file, 'w') as f:
+            f.write(label_str)
+
+
+def add_difficulty_to_annos(info):
+    min_height = [40, 25,
+                  25]  # minimum height for evaluated groundtruth/detections
+    max_occlusion = [
+        0, 1, 2
+    ]  # maximum occlusion level of the groundtruth used for evaluation
+    max_trunc = [
+        0.15, 0.3, 0.5
+    ]  # maximum truncation level of the groundtruth used for evaluation
+    annos = info['annos']
+    dims = annos['dimensions']  # lhw format
+    bbox = annos['bbox']
+    height = bbox[:, 3] - bbox[:, 1]
+    occlusion = annos['occluded']
+    truncation = annos['truncated']
+    diff = []
+    easy_mask = np.ones((len(dims), ), dtype=np.bool)
+    moderate_mask = np.ones((len(dims), ), dtype=np.bool)
+    hard_mask = np.ones((len(dims), ), dtype=np.bool)
+    i = 0
+    for h, o, t in zip(height, occlusion, truncation):
+        if o > max_occlusion[0] or h <= min_height[0] or t > max_trunc[0]:
+            easy_mask[i] = False
+        if o > max_occlusion[1] or h <= min_height[1] or t > max_trunc[1]:
+            moderate_mask[i] = False
+        if o > max_occlusion[2] or h <= min_height[2] or t > max_trunc[2]:
+            hard_mask[i] = False
+        i += 1
+    is_easy = easy_mask
+    is_moderate = np.logical_xor(easy_mask, moderate_mask)
+    is_hard = np.logical_xor(hard_mask, moderate_mask)
+
+    for i in range(len(dims)):
+        if is_easy[i]:
+            diff.append(0)
+        elif is_moderate[i]:
+            diff.append(1)
+        elif is_hard[i]:
+            diff.append(2)
+        else:
+            diff.append(-1)
+    annos['difficulty'] = np.array(diff, np.int32)
+    return diff
+
+
+def kitti_result_line(result_dict, precision=4):
+    prec_float = '{' + ':.{}f'.format(precision) + '}'
+    res_line = []
+    all_field_default = OrderedDict([
+        ('name', None),
+        ('truncated', -1),
+        ('occluded', -1),
+        ('alpha', -10),
+        ('bbox', None),
+        ('dimensions', [-1, -1, -1]),
+        ('location', [-1000, -1000, -1000]),
+        ('rotation_y', -10),
+        ('score', 0.0),
+    ])
+    res_dict = [(key, None) for key, val in all_field_default.items()]
+    res_dict = OrderedDict(res_dict)
+    for key, val in result_dict.items():
+        if all_field_default[key] is None and val is None:
+            raise ValueError('you must specify a value for {}'.format(key))
+        res_dict[key] = val
+
+    for key, val in res_dict.items():
+        if key == 'name':
+            res_line.append(val)
+        elif key in ['truncated', 'alpha', 'rotation_y', 'score']:
+            if val is None:
+                res_line.append(str(all_field_default[key]))
+            else:
+                res_line.append(prec_float.format(val))
+        elif key == 'occluded':
+            if val is None:
+                res_line.append(str(all_field_default[key]))
+            else:
+                res_line.append('{}'.format(val))
+        elif key in ['bbox', 'dimensions', 'location']:
+            if val is None:
+                res_line += [str(v) for v in all_field_default[key]]
+            else:
+                res_line += [prec_float.format(v) for v in val]
+        else:
+            raise ValueError('unknown key. supported key:{}'.format(
+                res_dict.keys()))
+    return ' '.join(res_line)
diff --git a/model_examples/MapTR/tools/data_converter/lyft_converter.py b/model_examples/MapTR/tools/data_converter/lyft_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fc1555a265b04aa3d32fed9d8efb90323943406
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/lyft_converter.py
@@ -0,0 +1,268 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+from logging import warning
+from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
+from os import path as osp
+from pyquaternion import Quaternion
+
+from mmdet3d.datasets import LyftDataset
+from .nuscenes_converter import (get_2d_boxes, get_available_scenes,
+                                 obtain_sensor2top)
+
+lyft_categories = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
+                   'motorcycle', 'bicycle', 'pedestrian', 'animal')
+
+
+def create_lyft_infos(root_path,
+                      info_prefix,
+                      version='v1.01-train',
+                      max_sweeps=10):
+    """Create info file of lyft dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        root_path (str): Path of the data root.
+        info_prefix (str): Prefix of the info file to be generated.
+        version (str): Version of the data.
+            Default: 'v1.01-train'
+        max_sweeps (int): Max number of sweeps.
+            Default: 10
+    """
+    lyft = Lyft(
+        data_path=osp.join(root_path, version),
+        json_path=osp.join(root_path, version, version),
+        verbose=True)
+    available_vers = ['v1.01-train', 'v1.01-test']
+    assert version in available_vers
+    if version == 'v1.01-train':
+        train_scenes = mmcv.list_from_file('data/lyft/train.txt')
+        val_scenes = mmcv.list_from_file('data/lyft/val.txt')
+    elif version == 'v1.01-test':
+        train_scenes = mmcv.list_from_file('data/lyft/test.txt')
+        val_scenes = []
+    else:
+        raise ValueError('unknown')
+
+    # filter existing scenes.
+    available_scenes = get_available_scenes(lyft)
+    available_scene_names = [s['name'] for s in available_scenes]
+    train_scenes = list(
+        filter(lambda x: x in available_scene_names, train_scenes))
+    val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+    train_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in train_scenes
+    ])
+    val_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in val_scenes
+    ])
+
+    test = 'test' in version
+    if test:
+        print(f'test scene: {len(train_scenes)}')
+    else:
+        print(f'train scene: {len(train_scenes)}, \
+                val scene: {len(val_scenes)}')
+    train_lyft_infos, val_lyft_infos = _fill_trainval_infos(
+        lyft, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+    metadata = dict(version=version)
+    if test:
+        print(f'test sample: {len(train_lyft_infos)}')
+        data = dict(infos=train_lyft_infos, metadata=metadata)
+        info_name = f'{info_prefix}_infos_test'
+        info_path = osp.join(root_path, f'{info_name}.pkl')
+        mmcv.dump(data, info_path)
+    else:
+        print(f'train sample: {len(train_lyft_infos)}, \
+                val sample: {len(val_lyft_infos)}')
+        data = dict(infos=train_lyft_infos, metadata=metadata)
+        train_info_name = f'{info_prefix}_infos_train'
+        info_path = osp.join(root_path, f'{train_info_name}.pkl')
+        mmcv.dump(data, info_path)
+        data['infos'] = val_lyft_infos
+        val_info_name = f'{info_prefix}_infos_val'
+        info_val_path = osp.join(root_path, f'{val_info_name}.pkl')
+        mmcv.dump(data, info_val_path)
+
+
+def _fill_trainval_infos(lyft,
+                         train_scenes,
+                         val_scenes,
+                         test=False,
+                         max_sweeps=10):
+    """Generate the train/val infos from the raw data.
+
+    Args:
+        lyft (:obj:`LyftDataset`): Dataset class in the Lyft dataset.
+        train_scenes (list[str]): Basic information of training scenes.
+        val_scenes (list[str]): Basic information of validation scenes.
+        test (bool): Whether use the test mode. In the test mode, no
+            annotations can be accessed. Default: False.
+        max_sweeps (int): Max number of sweeps. Default: 10.
+
+    Returns:
+        tuple[list[dict]]: Information of training set and
+            validation set that will be saved to the info file.
+    """
+    train_lyft_infos = []
+    val_lyft_infos = []
+
+    for sample in mmcv.track_iter_progress(lyft.sample):
+        lidar_token = sample['data']['LIDAR_TOP']
+        sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
+        cs_record = lyft.get('calibrated_sensor',
+                             sd_rec['calibrated_sensor_token'])
+        pose_record = lyft.get('ego_pose', sd_rec['ego_pose_token'])
+        abs_lidar_path, boxes, _ = lyft.get_sample_data(lidar_token)
+        # nuScenes devkit returns more convenient relative paths while
+        # lyft devkit returns absolute paths
+        abs_lidar_path = str(abs_lidar_path)  # absolute path
+        lidar_path = abs_lidar_path.split(f'{os.getcwd()}/')[-1]
+        # relative path
+
+        mmcv.check_file_exist(lidar_path)
+
+        info = {
+            'lidar_path': lidar_path,
+            'token': sample['token'],
+            'sweeps': [],
+            'cams': dict(),
+            'lidar2ego_translation': cs_record['translation'],
+            'lidar2ego_rotation': cs_record['rotation'],
+            'ego2global_translation': pose_record['translation'],
+            'ego2global_rotation': pose_record['rotation'],
+            'timestamp': sample['timestamp'],
+        }
+
+        l2e_r = info['lidar2ego_rotation']
+        l2e_t = info['lidar2ego_translation']
+        e2g_r = info['ego2global_rotation']
+        e2g_t = info['ego2global_translation']
+        l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+        e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+        # obtain 6 image's information per frame
+        camera_types = [
+            'CAM_FRONT',
+            'CAM_FRONT_RIGHT',
+            'CAM_FRONT_LEFT',
+            'CAM_BACK',
+            'CAM_BACK_LEFT',
+            'CAM_BACK_RIGHT',
+        ]
+        for cam in camera_types:
+            cam_token = sample['data'][cam]
+            cam_path, _, cam_intrinsic = lyft.get_sample_data(cam_token)
+            cam_info = obtain_sensor2top(lyft, cam_token, l2e_t, l2e_r_mat,
+                                         e2g_t, e2g_r_mat, cam)
+            cam_info.update(cam_intrinsic=cam_intrinsic)
+            info['cams'].update({cam: cam_info})
+
+        # obtain sweeps for a single key-frame
+        sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
+        sweeps = []
+        while len(sweeps) < max_sweeps:
+            if not sd_rec['prev'] == '':
+                sweep = obtain_sensor2top(lyft, sd_rec['prev'], l2e_t,
+                                          l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+                sweeps.append(sweep)
+                sd_rec = lyft.get('sample_data', sd_rec['prev'])
+            else:
+                break
+        info['sweeps'] = sweeps
+        # obtain annotation
+        if not test:
+            annotations = [
+                lyft.get('sample_annotation', token)
+                for token in sample['anns']
+            ]
+            locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+            dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+            rots = np.array([b.orientation.yaw_pitch_roll[0]
+                             for b in boxes]).reshape(-1, 1)
+
+            names = [b.name for b in boxes]
+            for i in range(len(names)):
+                if names[i] in LyftDataset.NameMapping:
+                    names[i] = LyftDataset.NameMapping[names[i]]
+            names = np.array(names)
+
+            # we need to convert rot to SECOND format.
+            gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1)
+            assert len(gt_boxes) == len(
+                annotations), f'{len(gt_boxes)}, {len(annotations)}'
+            info['gt_boxes'] = gt_boxes
+            info['gt_names'] = names
+            info['num_lidar_pts'] = np.array(
+                [a['num_lidar_pts'] for a in annotations])
+            info['num_radar_pts'] = np.array(
+                [a['num_radar_pts'] for a in annotations])
+
+        if sample['scene_token'] in train_scenes:
+            train_lyft_infos.append(info)
+        else:
+            val_lyft_infos.append(info)
+
+    return train_lyft_infos, val_lyft_infos
+
+
+def export_2d_annotation(root_path, info_path, version):
+    """Export 2d annotation from the info file and raw data.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        info_path (str): Path of the info file.
+        version (str): Dataset version.
+    """
+    warning.warn('DeprecationWarning: 2D annotations are not used on the '
+                 'Lyft dataset. The function export_2d_annotation will be '
+                 'deprecated.')
+    # get bbox annotations for camera
+    camera_types = [
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_FRONT_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_LEFT',
+        'CAM_BACK_RIGHT',
+    ]
+    lyft_infos = mmcv.load(info_path)['infos']
+    lyft = Lyft(
+        data_path=osp.join(root_path, version),
+        json_path=osp.join(root_path, version, version),
+        verbose=True)
+    # info_2d_list = []
+    cat2Ids = [
+        dict(id=lyft_categories.index(cat_name), name=cat_name)
+        for cat_name in lyft_categories
+    ]
+    coco_ann_id = 0
+    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+    for info in mmcv.track_iter_progress(lyft_infos):
+        for cam in camera_types:
+            cam_info = info['cams'][cam]
+            coco_infos = get_2d_boxes(
+                lyft,
+                cam_info['sample_data_token'],
+                visibilities=['', '1', '2', '3', '4'])
+            (height, width, _) = mmcv.imread(cam_info['data_path']).shape
+            coco_2d_dict['images'].append(
+                dict(
+                    file_name=cam_info['data_path'],
+                    id=cam_info['sample_data_token'],
+                    width=width,
+                    height=height))
+            for coco_info in coco_infos:
+                if coco_info is None:
+                    continue
+                # add an empty key for coco format
+                coco_info['segmentation'] = []
+                coco_info['id'] = coco_ann_id
+                coco_2d_dict['annotations'].append(coco_info)
+                coco_ann_id += 1
+    mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
diff --git a/model_examples/MapTR/tools/data_converter/lyft_data_fixer.py b/model_examples/MapTR/tools/data_converter/lyft_data_fixer.py
new file mode 100644
index 0000000000000000000000000000000000000000..42070490ca49e96a1b076ce2cb98ad4f55f7deaa
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/lyft_data_fixer.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import numpy as np
+import os
+
+
+def fix_lyft(root_folder='./data/lyft', version='v1.01'):
+    # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000  # noqa
+    lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin'
+    root_folder = os.path.join(root_folder, f'{version}-train')
+    lidar_path = os.path.join(root_folder, lidar_path)
+    assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \
+        f'dataset and make sure {lidar_path} is present.'
+    points = np.fromfile(lidar_path, dtype=np.float32, count=-1)
+    try:
+        points.reshape([-1, 5])
+        print(f'This fix is not required for version {version}.')
+    except ValueError:
+        new_points = np.array(list(points) + [100.0, 1.0], dtype='float32')
+        new_points.tofile(lidar_path)
+        print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.')
+
+
+parser = argparse.ArgumentParser(description='Lyft dataset fixer arg parser')
+parser.add_argument(
+    '--root-folder',
+    type=str,
+    default='./data/lyft',
+    help='specify the root path of Lyft dataset')
+parser.add_argument(
+    '--version',
+    type=str,
+    default='v1.01',
+    help='specify Lyft dataset version')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    fix_lyft(root_folder=args.root_folder, version=args.version)
diff --git a/model_examples/MapTR/tools/data_converter/nuimage_converter.py b/model_examples/MapTR/tools/data_converter/nuimage_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..92be1de3dbd1d57603c73b5c7b2ab443d44c9d11
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/nuimage_converter.py
@@ -0,0 +1,225 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import base64
+import mmcv
+import numpy as np
+from nuimages import NuImages
+from nuimages.utils.utils import mask_decode, name_to_index_mapping
+from os import path as osp
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+                  'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+                  'barrier')
+
+NAME_MAPPING = {
+    'movable_object.barrier': 'barrier',
+    'vehicle.bicycle': 'bicycle',
+    'vehicle.bus.bendy': 'bus',
+    'vehicle.bus.rigid': 'bus',
+    'vehicle.car': 'car',
+    'vehicle.construction': 'construction_vehicle',
+    'vehicle.motorcycle': 'motorcycle',
+    'human.pedestrian.adult': 'pedestrian',
+    'human.pedestrian.child': 'pedestrian',
+    'human.pedestrian.construction_worker': 'pedestrian',
+    'human.pedestrian.police_officer': 'pedestrian',
+    'movable_object.trafficcone': 'traffic_cone',
+    'vehicle.trailer': 'trailer',
+    'vehicle.truck': 'truck',
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Data converter arg parser')
+    parser.add_argument(
+        '--data-root',
+        type=str,
+        default='./data/nuimages',
+        help='specify the root path of dataset')
+    parser.add_argument(
+        '--version',
+        type=str,
+        nargs='+',
+        default=['v1.0-mini'],
+        required=False,
+        help='specify the dataset version')
+    parser.add_argument(
+        '--out-dir',
+        type=str,
+        default='./data/nuimages/annotations/',
+        required=False,
+        help='path to save the exported json')
+    parser.add_argument(
+        '--nproc',
+        type=int,
+        default=4,
+        required=False,
+        help='workers to process semantic masks')
+    parser.add_argument('--extra-tag', type=str, default='nuimages')
+    args = parser.parse_args()
+    return args
+
+
+def get_img_annos(nuim, img_info, cat2id, out_dir, data_root, seg_root):
+    """Get semantic segmentation map for an image.
+
+    Args:
+        nuim (obj:`NuImages`): NuImages dataset object
+        img_info (dict): Meta information of img
+
+    Returns:
+        np.ndarray: Semantic segmentation map of the image
+    """
+    sd_token = img_info['token']
+    image_id = img_info['id']
+    name_to_index = name_to_index_mapping(nuim.category)
+
+    # Get image data.
+    width, height = img_info['width'], img_info['height']
+    semseg_mask = np.zeros((height, width)).astype('uint8')
+
+    # Load stuff / surface regions.
+    surface_anns = [
+        o for o in nuim.surface_ann if o['sample_data_token'] == sd_token
+    ]
+
+    # Draw stuff / surface regions.
+    for ann in surface_anns:
+        # Get color and mask.
+        category_token = ann['category_token']
+        category_name = nuim.get('category', category_token)['name']
+        if ann['mask'] is None:
+            continue
+        mask = mask_decode(ann['mask'])
+
+        # Draw mask for semantic segmentation.
+        semseg_mask[mask == 1] = name_to_index[category_name]
+
+    # Load object instances.
+    object_anns = [
+        o for o in nuim.object_ann if o['sample_data_token'] == sd_token
+    ]
+
+    # Sort by token to ensure that objects always appear in the
+    # instance mask in the same order.
+    object_anns = sorted(object_anns, key=lambda k: k['token'])
+
+    # Draw object instances.
+    # The 0 index is reserved for background; thus, the instances
+    # should start from index 1.
+    annotations = []
+    for i, ann in enumerate(object_anns, start=1):
+        # Get color, box, mask and name.
+        category_token = ann['category_token']
+        category_name = nuim.get('category', category_token)['name']
+        if ann['mask'] is None:
+            continue
+        mask = mask_decode(ann['mask'])
+
+        # Draw masks for semantic segmentation and instance segmentation.
+        semseg_mask[mask == 1] = name_to_index[category_name]
+
+        if category_name in NAME_MAPPING:
+            cat_name = NAME_MAPPING[category_name]
+            cat_id = cat2id[cat_name]
+
+            x_min, y_min, x_max, y_max = ann['bbox']
+            # encode calibrated instance mask
+            mask_anno = dict()
+            mask_anno['counts'] = base64.b64decode(
+                ann['mask']['counts']).decode()
+            mask_anno['size'] = ann['mask']['size']
+
+            data_anno = dict(
+                image_id=image_id,
+                category_id=cat_id,
+                bbox=[x_min, y_min, x_max - x_min, y_max - y_min],
+                area=(x_max - x_min) * (y_max - y_min),
+                segmentation=mask_anno,
+                iscrowd=0)
+            annotations.append(data_anno)
+
+    # after process, save semantic masks
+    img_filename = img_info['file_name']
+    seg_filename = img_filename.replace('jpg', 'png')
+    seg_filename = osp.join(seg_root, seg_filename)
+    mmcv.imwrite(semseg_mask, seg_filename)
+    return annotations, np.max(semseg_mask)
+
+
+def export_nuim_to_coco(nuim, data_root, out_dir, extra_tag, version, nproc):
+    print('Process category information')
+    categories = []
+    categories = [
+        dict(id=nus_categories.index(cat_name), name=cat_name)
+        for cat_name in nus_categories
+    ]
+    cat2id = {k_v['name']: k_v['id'] for k_v in categories}
+
+    images = []
+    print('Process image meta information...')
+    for sample_info in mmcv.track_iter_progress(nuim.sample_data):
+        if sample_info['is_key_frame']:
+            img_idx = len(images)
+            images.append(
+                dict(
+                    id=img_idx,
+                    token=sample_info['token'],
+                    file_name=sample_info['filename'],
+                    width=sample_info['width'],
+                    height=sample_info['height']))
+
+    seg_root = f'{out_dir}semantic_masks'
+    mmcv.mkdir_or_exist(seg_root)
+    mmcv.mkdir_or_exist(osp.join(data_root, 'calibrated'))
+
+    global process_img_anno
+
+    def process_img_anno(img_info):
+        single_img_annos, max_cls_id = get_img_annos(nuim, img_info, cat2id,
+                                                     out_dir, data_root,
+                                                     seg_root)
+        return single_img_annos, max_cls_id
+
+    print('Process img annotations...')
+    if nproc > 1:
+        outputs = mmcv.track_parallel_progress(
+            process_img_anno, images, nproc=nproc)
+    else:
+        outputs = []
+        for img_info in mmcv.track_iter_progress(images):
+            outputs.append(process_img_anno(img_info))
+
+    # Determine the index of object annotation
+    print('Process annotation information...')
+    annotations = []
+    max_cls_ids = []
+    for single_img_annos, max_cls_id in outputs:
+        max_cls_ids.append(max_cls_id)
+        for img_anno in single_img_annos:
+            img_anno.update(id=len(annotations))
+            annotations.append(img_anno)
+
+    max_cls_id = max(max_cls_ids)
+    print(f'Max ID of class in the semantic map: {max_cls_id}')
+
+    coco_format_json = dict(
+        images=images, annotations=annotations, categories=categories)
+
+    mmcv.mkdir_or_exist(out_dir)
+    out_file = osp.join(out_dir, f'{extra_tag}_{version}.json')
+    print(f'Annotation dumped to {out_file}')
+    mmcv.dump(coco_format_json, out_file)
+
+
+def main():
+    args = parse_args()
+    for version in args.version:
+        nuim = NuImages(
+            dataroot=args.data_root, version=version, verbose=True, lazy=True)
+        export_nuim_to_coco(nuim, args.data_root, args.out_dir, args.extra_tag,
+                            version, args.nproc)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/data_converter/nuscenes_converter.py b/model_examples/MapTR/tools/data_converter/nuscenes_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a1d995cdbafbb731364080be2478ea65eab574e
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/nuscenes_converter.py
@@ -0,0 +1,675 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+import mmcv
+import numpy as np
+import os
+from collections import OrderedDict
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.geometry_utils import view_points
+from os import path as osp
+from pyquaternion import Quaternion
+from shapely.geometry import MultiPoint, box
+from typing import List, Tuple, Union
+
+from mmdet3d.core.bbox.box_np_ops import points_cam2img
+from mmdet3d.datasets import NuScenesDataset
+
+nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+                  'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
+                  'barrier')
+
+nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
+                  'pedestrian.moving', 'pedestrian.standing',
+                  'pedestrian.sitting_lying_down', 'vehicle.moving',
+                  'vehicle.parked', 'vehicle.stopped', 'None')
+
+
+def create_nuscenes_infos(root_path,
+                          out_path,
+                          can_bus_root_path,
+                          info_prefix,
+                          version='v1.0-trainval',
+                          max_sweeps=10):
+    """Create info file of nuscene dataset.
+
+    Given the raw data, generate its related info file in pkl format.
+
+    Args:
+        root_path (str): Path of the data root.
+        info_prefix (str): Prefix of the info file to be generated.
+        version (str): Version of the data.
+            Default: 'v1.0-trainval'
+        max_sweeps (int): Max number of sweeps.
+            Default: 10
+    """
+    from nuscenes.nuscenes import NuScenes
+    from nuscenes.can_bus.can_bus_api import NuScenesCanBus
+    print(version, root_path)
+    nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+    nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path)
+    from nuscenes.utils import splits
+    available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini']
+    assert version in available_vers
+    if version == 'v1.0-trainval':
+        train_scenes = splits.train
+        val_scenes = splits.val
+    elif version == 'v1.0-test':
+        train_scenes = splits.test
+        val_scenes = []
+    elif version == 'v1.0-mini':
+        train_scenes = splits.mini_train
+        val_scenes = splits.mini_val
+    else:
+        raise ValueError('unknown')
+
+    # filter existing scenes.
+    available_scenes = get_available_scenes(nusc)
+    available_scene_names = [s['name'] for s in available_scenes]
+    train_scenes = list(
+        filter(lambda x: x in available_scene_names, train_scenes))
+    val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
+    train_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in train_scenes
+    ])
+    val_scenes = set([
+        available_scenes[available_scene_names.index(s)]['token']
+        for s in val_scenes
+    ])
+
+    test = 'test' in version
+    if test:
+        print('test scene: {}'.format(len(train_scenes)))
+    else:
+        print('train scene: {}, val scene: {}'.format(
+            len(train_scenes), len(val_scenes)))
+
+    train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
+        nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
+
+    metadata = dict(version=version)
+    if test:
+        print('test sample: {}'.format(len(train_nusc_infos)))
+        data = dict(infos=train_nusc_infos, metadata=metadata)
+        info_path = osp.join(out_path,
+                             '{}_infos_temporal_test.pkl'.format(info_prefix))
+        mmcv.dump(data, info_path)
+    else:
+        print('train sample: {}, val sample: {}'.format(
+            len(train_nusc_infos), len(val_nusc_infos)))
+        data = dict(infos=train_nusc_infos, metadata=metadata)
+        info_path = osp.join(out_path,
+                             '{}_infos_temporal_train.pkl'.format(info_prefix))
+        mmcv.dump(data, info_path)
+        data['infos'] = val_nusc_infos
+        info_val_path = osp.join(out_path,
+                                 '{}_infos_temporal_val.pkl'.format(info_prefix))
+        mmcv.dump(data, info_val_path)
+
+
+def get_available_scenes(nusc):
+    """Get available scenes from the input nuscenes class.
+
+    Given the raw data, get the information of available scenes for
+    further info generation.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+
+    Returns:
+        available_scenes (list[dict]): List of basic information for the
+            available scenes.
+    """
+    available_scenes = []
+    print('total scene num: {}'.format(len(nusc.scene)))
+    for scene in nusc.scene:
+        scene_token = scene['token']
+        scene_rec = nusc.get('scene', scene_token)
+        sample_rec = nusc.get('sample', scene_rec['first_sample_token'])
+        sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+        has_more_frames = True
+        scene_not_exist = False
+        while has_more_frames:
+            lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token'])
+            lidar_path = str(lidar_path)
+            if os.getcwd() in lidar_path:
+                # path from lyftdataset is absolute path
+                lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1]
+                # relative path
+            if not mmcv.is_filepath(lidar_path):
+                scene_not_exist = True
+                break
+            else:
+                break
+        if scene_not_exist:
+            continue
+        available_scenes.append(scene)
+    print('exist scene num: {}'.format(len(available_scenes)))
+    return available_scenes
+
+
+def _get_can_bus_info(nusc, nusc_can_bus, sample):
+    scene_name = nusc.get('scene', sample['scene_token'])['name']
+    sample_timestamp = sample['timestamp']
+    try:
+        pose_list = nusc_can_bus.get_messages(scene_name, 'pose')
+    except:
+        return np.zeros(18)  # server scenes do not have can bus information.
+    can_bus = []
+    # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp
+    last_pose = pose_list[0]
+    for i, pose in enumerate(pose_list):
+        if pose['utime'] > sample_timestamp:
+            break
+        last_pose = pose
+    _ = last_pose.pop('utime')  # useless
+    pos = last_pose.pop('pos')
+    rotation = last_pose.pop('orientation')
+    can_bus.extend(pos)
+    can_bus.extend(rotation)
+    for key in last_pose.keys():
+        can_bus.extend(pose[key])  # 16 elements
+    can_bus.extend([0., 0.])
+    return np.array(can_bus)
+
+
+def _fill_trainval_infos(nusc,
+                         nusc_can_bus,
+                         train_scenes,
+                         val_scenes,
+                         test=False,
+                         max_sweeps=10):
+    """Generate the train/val infos from the raw data.
+
+    Args:
+        nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
+        train_scenes (list[str]): Basic information of training scenes.
+        val_scenes (list[str]): Basic information of validation scenes.
+        test (bool): Whether use the test mode. In the test mode, no
+            annotations can be accessed. Default: False.
+        max_sweeps (int): Max number of sweeps. Default: 10.
+
+    Returns:
+        tuple[list[dict]]: Information of training set and validation set
+            that will be saved to the info file.
+    """
+    train_nusc_infos = []
+    val_nusc_infos = []
+    frame_idx = 0
+    for sample in mmcv.track_iter_progress(nusc.sample):
+        map_location = nusc.get('log', nusc.get('scene', sample['scene_token'])['log_token'])['location']
+
+        lidar_token = sample['data']['LIDAR_TOP']
+        sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+        cs_record = nusc.get('calibrated_sensor',
+                             sd_rec['calibrated_sensor_token'])
+        pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+        lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
+
+        mmcv.check_file_exist(lidar_path)
+        can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample)
+        ##
+        info = {
+            'lidar_path': lidar_path,
+            'token': sample['token'],
+            'prev': sample['prev'],
+            'next': sample['next'],
+            'can_bus': can_bus,
+            'frame_idx': frame_idx,  # temporal related info
+            'sweeps': [],
+            'cams': dict(),
+            'map_location': map_location,
+            'scene_token': sample['scene_token'],  # temporal related info
+            'lidar2ego_translation': cs_record['translation'],
+            'lidar2ego_rotation': cs_record['rotation'],
+            'ego2global_translation': pose_record['translation'],
+            'ego2global_rotation': pose_record['rotation'],
+            'timestamp': sample['timestamp'],
+        }
+
+        if sample['next'] == '':
+            frame_idx = 0
+        else:
+            frame_idx += 1
+
+        l2e_r = info['lidar2ego_rotation']
+        l2e_t = info['lidar2ego_translation']
+        e2g_r = info['ego2global_rotation']
+        e2g_t = info['ego2global_translation']
+        l2e_r_mat = Quaternion(l2e_r).rotation_matrix
+        e2g_r_mat = Quaternion(e2g_r).rotation_matrix
+
+        # obtain 6 image's information per frame
+        camera_types = [
+            'CAM_FRONT',
+            'CAM_FRONT_RIGHT',
+            'CAM_FRONT_LEFT',
+            'CAM_BACK',
+            'CAM_BACK_LEFT',
+            'CAM_BACK_RIGHT',
+        ]
+        for cam in camera_types:
+            cam_token = sample['data'][cam]
+            cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
+            cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
+                                         e2g_t, e2g_r_mat, cam)
+            cam_info.update(cam_intrinsic=cam_intrinsic)
+            info['cams'].update({cam: cam_info})
+
+        # obtain sweeps for a single key-frame
+        sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
+        sweeps = []
+        while len(sweeps) < max_sweeps:
+            if not sd_rec['prev'] == '':
+                sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t,
+                                          l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
+                sweeps.append(sweep)
+                sd_rec = nusc.get('sample_data', sd_rec['prev'])
+            else:
+                break
+        info['sweeps'] = sweeps
+        # obtain annotation
+        if not test:
+            annotations = [
+                nusc.get('sample_annotation', token)
+                for token in sample['anns']
+            ]
+            locs = np.array([b.center for b in boxes]).reshape(-1, 3)
+            dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
+            rots = np.array([b.orientation.yaw_pitch_roll[0]
+                             for b in boxes]).reshape(-1, 1)
+            velocity = np.array(
+                [nusc.box_velocity(token)[:2] for token in sample['anns']])
+            valid_flag = np.array(
+                [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0
+                 for anno in annotations],
+                dtype=bool).reshape(-1)
+            # convert velo from global to lidar
+            for i in range(len(boxes)):
+                velo = np.array([*velocity[i], 0.0])
+                velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(
+                    l2e_r_mat).T
+                velocity[i] = velo[:2]
+
+            names = [b.name for b in boxes]
+            for i in range(len(names)):
+                if names[i] in NuScenesDataset.NameMapping:
+                    names[i] = NuScenesDataset.NameMapping[names[i]]
+            names = np.array(names)
+            # we need to convert rot to SECOND format.
+            gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1)
+            assert len(gt_boxes) == len(
+                annotations), f'{len(gt_boxes)}, {len(annotations)}'
+            info['gt_boxes'] = gt_boxes
+            info['gt_names'] = names
+            info['gt_velocity'] = velocity.reshape(-1, 2)
+            info['num_lidar_pts'] = np.array(
+                [a['num_lidar_pts'] for a in annotations])
+            info['num_radar_pts'] = np.array(
+                [a['num_radar_pts'] for a in annotations])
+            info['valid_flag'] = valid_flag
+
+        if sample['scene_token'] in train_scenes:
+            train_nusc_infos.append(info)
+        else:
+            val_nusc_infos.append(info)
+
+    return train_nusc_infos, val_nusc_infos
+
+
+def obtain_sensor2top(nusc,
+                      sensor_token,
+                      l2e_t,
+                      l2e_r_mat,
+                      e2g_t,
+                      e2g_r_mat,
+                      sensor_type='lidar'):
+    """Obtain the info with RT matric from general sensor to Top LiDAR.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+        sensor_token (str): Sample data token corresponding to the
+            specific sensor type.
+        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+            in shape (3, 3).
+        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+            in shape (3, 3).
+        sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+    Returns:
+        sweep (dict): Sweep information after transformation.
+    """
+    sd_rec = nusc.get('sample_data', sensor_token)
+    cs_record = nusc.get('calibrated_sensor',
+                         sd_rec['calibrated_sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+    data_path = str(nusc.get_sample_data_path(sd_rec['token']))
+    if os.getcwd() in data_path:  # path from lyftdataset is absolute path
+        data_path = data_path.split(f'{os.getcwd()}/')[-1]  # relative path
+    sweep = {
+        'data_path': data_path,
+        'type': sensor_type,
+        'sample_data_token': sd_rec['token'],
+        'sensor2ego_translation': cs_record['translation'],
+        'sensor2ego_rotation': cs_record['rotation'],
+        'ego2global_translation': pose_record['translation'],
+        'ego2global_rotation': pose_record['rotation'],
+        'timestamp': sd_rec['timestamp']
+    }
+
+    l2e_r_s = sweep['sensor2ego_rotation']
+    l2e_t_s = sweep['sensor2ego_translation']
+    e2g_r_s = sweep['ego2global_rotation']
+    e2g_t_s = sweep['ego2global_translation']
+
+    # obtain the RT from sensor to Top LiDAR
+    # sweep->ego->global->ego'->lidar
+    l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
+    e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
+    R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
+        np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+    T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
+        np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+    T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
+                  ) + l2e_t @ np.linalg.inv(l2e_r_mat).T
+    sweep['sensor2lidar_rotation'] = R.T  # points @ R.T + T
+    sweep['sensor2lidar_translation'] = T
+    return sweep
+
+
+def export_2d_annotation(root_path, info_path, version, mono3d=True):
+    """Export 2d annotation from the info file and raw data.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        info_path (str): Path of the info file.
+        version (str): Dataset version.
+        mono3d (bool): Whether to export mono3d annotation. Default: True.
+    """
+    # get bbox annotations for camera
+    camera_types = [
+        'CAM_FRONT',
+        'CAM_FRONT_RIGHT',
+        'CAM_FRONT_LEFT',
+        'CAM_BACK',
+        'CAM_BACK_LEFT',
+        'CAM_BACK_RIGHT',
+    ]
+    nusc_infos = mmcv.load(info_path)['infos']
+    nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
+    # info_2d_list = []
+    cat2Ids = [
+        dict(id=nus_categories.index(cat_name), name=cat_name)
+        for cat_name in nus_categories
+    ]
+    coco_ann_id = 0
+    coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
+    for info in mmcv.track_iter_progress(nusc_infos):
+        for cam in camera_types:
+            cam_info = info['cams'][cam]
+            coco_infos = get_2d_boxes(
+                nusc,
+                cam_info['sample_data_token'],
+                visibilities=['', '1', '2', '3', '4'],
+                mono3d=mono3d)
+            (height, width, _) = mmcv.imread(cam_info['data_path']).shape
+            coco_2d_dict['images'].append(
+                dict(
+                    file_name=cam_info['data_path'].split('data/nuscenes/')
+                    [-1],
+                    id=cam_info['sample_data_token'],
+                    token=info['token'],
+                    cam2ego_rotation=cam_info['sensor2ego_rotation'],
+                    cam2ego_translation=cam_info['sensor2ego_translation'],
+                    ego2global_rotation=info['ego2global_rotation'],
+                    ego2global_translation=info['ego2global_translation'],
+                    cam_intrinsic=cam_info['cam_intrinsic'],
+                    width=width,
+                    height=height))
+            for coco_info in coco_infos:
+                if coco_info is None:
+                    continue
+                # add an empty key for coco format
+                coco_info['segmentation'] = []
+                coco_info['id'] = coco_ann_id
+                coco_2d_dict['annotations'].append(coco_info)
+                coco_ann_id += 1
+    if mono3d:
+        json_prefix = f'{info_path[:-4]}_mono3d'
+    else:
+        json_prefix = f'{info_path[:-4]}'
+    mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
+
+
+def get_2d_boxes(nusc,
+                 sample_data_token: str,
+                 visibilities: List[str],
+                 mono3d=True):
+    """Get the 2D annotation records for a given `sample_data_token`.
+
+    Args:
+        sample_data_token (str): Sample data token belonging to a camera \
+            keyframe.
+        visibilities (list[str]): Visibility filter.
+        mono3d (bool): Whether to get boxes with mono3d annotation.
+
+    Return:
+        list[dict]: List of 2D annotation record that belongs to the input
+            `sample_data_token`.
+    """
+
+    # Get the sample data and the sample corresponding to that sample data.
+    sd_rec = nusc.get('sample_data', sample_data_token)
+
+    assert sd_rec[
+        'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
+        ' for camera sample_data!'
+    if not sd_rec['is_key_frame']:
+        raise ValueError(
+            'The 2D re-projections are available only for keyframes.')
+
+    s_rec = nusc.get('sample', sd_rec['sample_token'])
+
+    # Get the calibrated sensor and ego pose
+    # record to get the transformation matrices.
+    cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
+    pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
+    camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
+
+    # Get all the annotation with the specified visibilties.
+    ann_recs = [
+        nusc.get('sample_annotation', token) for token in s_rec['anns']
+    ]
+    ann_recs = [
+        ann_rec for ann_rec in ann_recs
+        if (ann_rec['visibility_token'] in visibilities)
+    ]
+
+    repro_recs = []
+
+    for ann_rec in ann_recs:
+        # Augment sample_annotation with token information.
+        ann_rec['sample_annotation_token'] = ann_rec['token']
+        ann_rec['sample_data_token'] = sample_data_token
+
+        # Get the box in global coordinates.
+        box = nusc.get_box(ann_rec['token'])
+
+        # Move them to the ego-pose frame.
+        box.translate(-np.array(pose_rec['translation']))
+        box.rotate(Quaternion(pose_rec['rotation']).inverse)
+
+        # Move them to the calibrated sensor frame.
+        box.translate(-np.array(cs_rec['translation']))
+        box.rotate(Quaternion(cs_rec['rotation']).inverse)
+
+        # Filter out the corners that are not in front of the calibrated
+        # sensor.
+        corners_3d = box.corners()
+        in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
+        corners_3d = corners_3d[:, in_front]
+
+        # Project 3d box to 2d.
+        corner_coords = view_points(corners_3d, camera_intrinsic,
+                                    True).T[:, :2].tolist()
+
+        # Keep only corners that fall within the image.
+        final_coords = post_process_coords(corner_coords)
+
+        # Skip if the convex hull of the re-projected corners
+        # does not intersect the image canvas.
+        if final_coords is None:
+            continue
+        else:
+            min_x, min_y, max_x, max_y = final_coords
+
+        # Generate dictionary record to be included in the .json file.
+        repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
+                                    sample_data_token, sd_rec['filename'])
+
+        # If mono3d=True, add 3D annotations in camera coordinates
+        if mono3d and (repro_rec is not None):
+            loc = box.center.tolist()
+
+            dim = box.wlh
+            dim[[0, 1, 2]] = dim[[1, 2, 0]]  # convert wlh to our lhw
+            dim = dim.tolist()
+
+            rot = box.orientation.yaw_pitch_roll[0]
+            rot = [-rot]  # convert the rot to our cam coordinate
+
+            global_velo2d = nusc.box_velocity(box.token)[:2]
+            global_velo3d = np.array([*global_velo2d, 0.0])
+            e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
+            c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
+            cam_velo3d = global_velo3d @ np.linalg.inv(
+                e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
+            velo = cam_velo3d[0::2].tolist()
+
+            repro_rec['bbox_cam3d'] = loc + dim + rot
+            repro_rec['velo_cam3d'] = velo
+
+            center3d = np.array(loc).reshape([1, 3])
+            center2d = points_cam2img(
+                center3d, camera_intrinsic, with_depth=True)
+            repro_rec['center2d'] = center2d.squeeze().tolist()
+            # normalized center2D + depth
+            # if samples with depth < 0 will be removed
+            if repro_rec['center2d'][2] <= 0:
+                continue
+
+            ann_token = nusc.get('sample_annotation',
+                                 box.token)['attribute_tokens']
+            if len(ann_token) == 0:
+                attr_name = 'None'
+            else:
+                attr_name = nusc.get('attribute', ann_token[0])['name']
+            attr_id = nus_attributes.index(attr_name)
+            repro_rec['attribute_name'] = attr_name
+            repro_rec['attribute_id'] = attr_id
+
+        repro_recs.append(repro_rec)
+
+    return repro_recs
+
+
+def post_process_coords(
+    corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
+) -> Union[Tuple[float, float, float, float], None]:
+    """Get the intersection of the convex hull of the reprojected bbox corners
+    and the image canvas, return None if no intersection.
+
+    Args:
+        corner_coords (list[int]): Corner coordinates of reprojected
+            bounding box.
+        imsize (tuple[int]): Size of the image canvas.
+
+    Return:
+        tuple [float]: Intersection of the convex hull of the 2D box
+            corners and the image canvas.
+    """
+    polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
+    img_canvas = box(0, 0, imsize[0], imsize[1])
+
+    if polygon_from_2d_box.intersects(img_canvas):
+        img_intersection = polygon_from_2d_box.intersection(img_canvas)
+        intersection_coords = np.array(
+            [coord for coord in img_intersection.exterior.coords])
+
+        min_x = min(intersection_coords[:, 0])
+        min_y = min(intersection_coords[:, 1])
+        max_x = max(intersection_coords[:, 0])
+        max_y = max(intersection_coords[:, 1])
+
+        return min_x, min_y, max_x, max_y
+    else:
+        return None
+
+
+def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
+                    sample_data_token: str, filename: str) -> OrderedDict:
+    """Generate one 2D annotation record given various informations on top of
+    the 2D bounding box coordinates.
+
+    Args:
+        ann_rec (dict): Original 3d annotation record.
+        x1 (float): Minimum value of the x coordinate.
+        y1 (float): Minimum value of the y coordinate.
+        x2 (float): Maximum value of the x coordinate.
+        y2 (float): Maximum value of the y coordinate.
+        sample_data_token (str): Sample data token.
+        filename (str):The corresponding image file where the annotation
+            is present.
+
+    Returns:
+        dict: A sample 2D annotation record.
+            - file_name (str): flie name
+            - image_id (str): sample data token
+            - area (float): 2d box area
+            - category_name (str): category name
+            - category_id (int): category id
+            - bbox (list[float]): left x, top y, dx, dy of 2d box
+            - iscrowd (int): whether the area is crowd
+    """
+    repro_rec = OrderedDict()
+    repro_rec['sample_data_token'] = sample_data_token
+    coco_rec = dict()
+
+    relevant_keys = [
+        'attribute_tokens',
+        'category_name',
+        'instance_token',
+        'next',
+        'num_lidar_pts',
+        'num_radar_pts',
+        'prev',
+        'sample_annotation_token',
+        'sample_data_token',
+        'visibility_token',
+    ]
+
+    for key, value in ann_rec.items():
+        if key in relevant_keys:
+            repro_rec[key] = value
+
+    repro_rec['bbox_corners'] = [x1, y1, x2, y2]
+    repro_rec['filename'] = filename
+
+    coco_rec['file_name'] = filename
+    coco_rec['image_id'] = sample_data_token
+    coco_rec['area'] = (y2 - y1) * (x2 - x1)
+
+    if repro_rec['category_name'] not in NuScenesDataset.NameMapping:
+        return None
+    cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']]
+    coco_rec['category_name'] = cat_name
+    coco_rec['category_id'] = nus_categories.index(cat_name)
+    coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
+    coco_rec['iscrowd'] = 0
+
+    return coco_rec
diff --git a/model_examples/MapTR/tools/data_converter/s3dis_data_utils.py b/model_examples/MapTR/tools/data_converter/s3dis_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2b6b773e9c8b19f0df6d1b8c274ccca22c31532
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/s3dis_data_utils.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+from concurrent import futures as futures
+from os import path as osp
+
+
+class S3DISData(object):
+    """S3DIS data.
+
+    Generate s3dis infos for s3dis_converter.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        split (str): Set split type of the data. Default: 'Area_1'.
+    """
+
+    def __init__(self, root_path, split='Area_1'):
+        self.root_dir = root_path
+        self.split = split
+        self.data_dir = osp.join(root_path,
+                                 'Stanford3dDataset_v1.2_Aligned_Version')
+
+        # Following `GSDN <https://arxiv.org/abs/2006.12356>`_, use 5 furniture
+        # classes for detection: table, chair, sofa, bookcase, board.
+        self.cat_ids = np.array([7, 8, 9, 10, 11])
+        self.cat_ids2class = {
+            cat_id: i
+            for i, cat_id in enumerate(list(self.cat_ids))
+        }
+
+        assert split in [
+            'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5', 'Area_6'
+        ]
+        self.sample_id_list = os.listdir(osp.join(self.data_dir,
+                                                  split))  # conferenceRoom_1
+        for sample_id in self.sample_id_list:
+            if os.path.isfile(osp.join(self.data_dir, split, sample_id)):
+                self.sample_id_list.remove(sample_id)
+
+    def __len__(self):
+        return len(self.sample_id_list)
+
+    def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+        """Get data infos.
+
+        This method gets information from the raw data.
+
+        Args:
+            num_workers (int): Number of threads to be used. Default: 4.
+            has_label (bool): Whether the data has label. Default: True.
+            sample_id_list (list[int]): Index list of the sample.
+                Default: None.
+
+        Returns:
+            infos (list[dict]): Information of the raw data.
+        """
+
+        def process_single_scene(sample_idx):
+            print(f'{self.split} sample_idx: {sample_idx}')
+            info = dict()
+            pc_info = {
+                'num_features': 6,
+                'lidar_idx': f'{self.split}_{sample_idx}'
+            }
+            info['point_cloud'] = pc_info
+            pts_filename = osp.join(self.root_dir, 's3dis_data',
+                                    f'{self.split}_{sample_idx}_point.npy')
+            pts_instance_mask_path = osp.join(
+                self.root_dir, 's3dis_data',
+                f'{self.split}_{sample_idx}_ins_label.npy')
+            pts_semantic_mask_path = osp.join(
+                self.root_dir, 's3dis_data',
+                f'{self.split}_{sample_idx}_sem_label.npy')
+
+            points = np.load(pts_filename).astype(np.float32)
+            pts_instance_mask = np.load(pts_instance_mask_path).astype(np.int)
+            pts_semantic_mask = np.load(pts_semantic_mask_path).astype(np.int)
+
+            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
+            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))
+
+            points.tofile(
+                osp.join(self.root_dir, 'points',
+                         f'{self.split}_{sample_idx}.bin'))
+            pts_instance_mask.tofile(
+                osp.join(self.root_dir, 'instance_mask',
+                         f'{self.split}_{sample_idx}.bin'))
+            pts_semantic_mask.tofile(
+                osp.join(self.root_dir, 'semantic_mask',
+                         f'{self.split}_{sample_idx}.bin'))
+
+            info['pts_path'] = osp.join('points',
+                                        f'{self.split}_{sample_idx}.bin')
+            info['pts_instance_mask_path'] = osp.join(
+                'instance_mask', f'{self.split}_{sample_idx}.bin')
+            info['pts_semantic_mask_path'] = osp.join(
+                'semantic_mask', f'{self.split}_{sample_idx}.bin')
+            info['annos'] = self.get_bboxes(points, pts_instance_mask,
+                                            pts_semantic_mask)
+
+            return info
+
+        sample_id_list = sample_id_list if sample_id_list is not None \
+            else self.sample_id_list
+        with futures.ThreadPoolExecutor(num_workers) as executor:
+            infos = executor.map(process_single_scene, sample_id_list)
+        return list(infos)
+
+    def get_bboxes(self, points, pts_instance_mask, pts_semantic_mask):
+        """Convert instance masks to axis-aligned bounding boxes.
+
+        Args:
+            points (np.array): Scene points of shape (n, 6).
+            pts_instance_mask (np.ndarray): Instance labels of shape (n,).
+            pts_semantic_mask (np.ndarray): Semantic labels of shape (n,).
+
+        Returns:
+            dict: A dict containing detection infos with following keys:
+
+                - gt_boxes_upright_depth (np.ndarray): Bounding boxes
+                    of shape (n, 6)
+                - class (np.ndarray): Box labels of shape (n,)
+                - gt_num (int): Number of boxes.
+        """
+        bboxes, labels = [], []
+        for i in range(1, pts_instance_mask.max()):
+            ids = pts_instance_mask == i
+            # check if all instance points have same semantic label
+            assert pts_semantic_mask[ids].min() == pts_semantic_mask[ids].max()
+            label = pts_semantic_mask[ids][0]
+            # keep only furniture objects
+            if label in self.cat_ids2class:
+                labels.append(self.cat_ids2class[pts_semantic_mask[ids][0]])
+                pts = points[:, :3][ids]
+                min_pts = pts.min(axis=0)
+                max_pts = pts.max(axis=0)
+                locations = (min_pts + max_pts) / 2
+                dimensions = max_pts - min_pts
+                bboxes.append(np.concatenate((locations, dimensions)))
+        annotation = dict()
+        # follow ScanNet and SUN RGB-D keys
+        annotation['gt_boxes_upright_depth'] = np.array(bboxes)
+        annotation['class'] = np.array(labels)
+        annotation['gt_num'] = len(labels)
+        return annotation
+
+
+class S3DISSegData(object):
+    """S3DIS dataset used to generate infos for semantic segmentation task.
+
+    Args:
+        data_root (str): Root path of the raw data.
+        ann_file (str): The generated scannet infos.
+        split (str): Set split type of the data. Default: 'train'.
+        num_points (int): Number of points in each data input. Default: 8192.
+        label_weight_func (function): Function to compute the label weight.
+            Default: None.
+    """
+
+    def __init__(self,
+                 data_root,
+                 ann_file,
+                 split='Area_1',
+                 num_points=4096,
+                 label_weight_func=None):
+        self.data_root = data_root
+        self.data_infos = mmcv.load(ann_file)
+        self.split = split
+        self.num_points = num_points
+
+        self.all_ids = np.arange(13)  # all possible ids
+        self.cat_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+                                 12])  # used for seg task
+        self.ignore_index = len(self.cat_ids)
+
+        self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \
+            self.ignore_index
+        for i, cat_id in enumerate(self.cat_ids):
+            self.cat_id2class[cat_id] = i
+
+        # label weighting function is taken from
+        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+        self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
+            label_weight_func is None else label_weight_func
+
+    def get_seg_infos(self):
+        scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
+        save_folder = osp.join(self.data_root, 'seg_info')
+        mmcv.mkdir_or_exist(save_folder)
+        np.save(
+            osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
+            scene_idxs)
+        np.save(
+            osp.join(save_folder, f'{self.split}_label_weight.npy'),
+            label_weight)
+        print(f'{self.split} resampled scene index and label weight saved')
+
+    def _convert_to_label(self, mask):
+        """Convert class_id in loaded segmentation mask to label."""
+        if isinstance(mask, str):
+            if mask.endswith('npy'):
+                mask = np.load(mask)
+            else:
+                mask = np.fromfile(mask, dtype=np.long)
+        label = self.cat_id2class[mask]
+        return label
+
+    def get_scene_idxs_and_label_weight(self):
+        """Compute scene_idxs for data sampling and label weight for loss \
+        calculation.
+
+        We sample more times for scenes with more points. Label_weight is
+        inversely proportional to number of class points.
+        """
+        num_classes = len(self.cat_ids)
+        num_point_all = []
+        label_weight = np.zeros((num_classes + 1, ))  # ignore_index
+        for data_info in self.data_infos:
+            label = self._convert_to_label(
+                osp.join(self.data_root, data_info['pts_semantic_mask_path']))
+            num_point_all.append(label.shape[0])
+            class_count, _ = np.histogram(label, range(num_classes + 2))
+            label_weight += class_count
+
+        # repeat scene_idx for num_scene_point // num_sample_point times
+        sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
+        num_iter = int(np.sum(num_point_all) / float(self.num_points))
+        scene_idxs = []
+        for idx in range(len(self.data_infos)):
+            scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
+        scene_idxs = np.array(scene_idxs).astype(np.int32)
+
+        # calculate label weight, adopted from PointNet++
+        label_weight = label_weight[:-1].astype(np.float32)
+        label_weight = label_weight / label_weight.sum()
+        label_weight = self.label_weight_func(label_weight).astype(np.float32)
+
+        return scene_idxs, label_weight
diff --git a/model_examples/MapTR/tools/data_converter/scannet_data_utils.py b/model_examples/MapTR/tools/data_converter/scannet_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a437fe01ce6857d016001589203c3da2e802d6aa
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/scannet_data_utils.py
@@ -0,0 +1,293 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+import os
+from concurrent import futures as futures
+from os import path as osp
+
+
+class ScanNetData(object):
+    """ScanNet data.
+
+    Generate scannet infos for scannet_converter.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        split (str): Set split type of the data. Default: 'train'.
+    """
+
+    def __init__(self, root_path, split='train'):
+        self.root_dir = root_path
+        self.split = split
+        self.split_dir = osp.join(root_path)
+        self.classes = [
+            'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+            'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+            'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+            'garbagebin'
+        ]
+        self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
+        self.label2cat = {self.cat2label[t]: t for t in self.cat2label}
+        self.cat_ids = np.array(
+            [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39])
+        self.cat_ids2class = {
+            nyu40id: i
+            for i, nyu40id in enumerate(list(self.cat_ids))
+        }
+        assert split in ['train', 'val', 'test']
+        split_file = osp.join(self.root_dir, 'meta_data',
+                              f'scannetv2_{split}.txt')
+        mmcv.check_file_exist(split_file)
+        self.sample_id_list = mmcv.list_from_file(split_file)
+        self.test_mode = (split == 'test')
+
+    def __len__(self):
+        return len(self.sample_id_list)
+
+    def get_aligned_box_label(self, idx):
+        box_file = osp.join(self.root_dir, 'scannet_instance_data',
+                            f'{idx}_aligned_bbox.npy')
+        mmcv.check_file_exist(box_file)
+        return np.load(box_file)
+
+    def get_unaligned_box_label(self, idx):
+        box_file = osp.join(self.root_dir, 'scannet_instance_data',
+                            f'{idx}_unaligned_bbox.npy')
+        mmcv.check_file_exist(box_file)
+        return np.load(box_file)
+
+    def get_axis_align_matrix(self, idx):
+        matrix_file = osp.join(self.root_dir, 'scannet_instance_data',
+                               f'{idx}_axis_align_matrix.npy')
+        mmcv.check_file_exist(matrix_file)
+        return np.load(matrix_file)
+
+    def get_images(self, idx):
+        paths = []
+        path = osp.join(self.root_dir, 'posed_images', idx)
+        for file in sorted(os.listdir(path)):
+            if file.endswith('.jpg'):
+                paths.append(osp.join('posed_images', idx, file))
+        return paths
+
+    def get_extrinsics(self, idx):
+        extrinsics = []
+        path = osp.join(self.root_dir, 'posed_images', idx)
+        for file in sorted(os.listdir(path)):
+            if file.endswith('.txt') and not file == 'intrinsic.txt':
+                extrinsics.append(np.loadtxt(osp.join(path, file)))
+        return extrinsics
+
+    def get_intrinsics(self, idx):
+        matrix_file = osp.join(self.root_dir, 'posed_images', idx,
+                               'intrinsic.txt')
+        mmcv.check_file_exist(matrix_file)
+        return np.loadtxt(matrix_file)
+
+    def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+        """Get data infos.
+
+        This method gets information from the raw data.
+
+        Args:
+            num_workers (int): Number of threads to be used. Default: 4.
+            has_label (bool): Whether the data has label. Default: True.
+            sample_id_list (list[int]): Index list of the sample.
+                Default: None.
+
+        Returns:
+            infos (list[dict]): Information of the raw data.
+        """
+
+        def process_single_scene(sample_idx):
+            print(f'{self.split} sample_idx: {sample_idx}')
+            info = dict()
+            pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
+            info['point_cloud'] = pc_info
+            pts_filename = osp.join(self.root_dir, 'scannet_instance_data',
+                                    f'{sample_idx}_vert.npy')
+            points = np.load(pts_filename)
+            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+            points.tofile(
+                osp.join(self.root_dir, 'points', f'{sample_idx}.bin'))
+            info['pts_path'] = osp.join('points', f'{sample_idx}.bin')
+
+            # update with RGB image paths if exist
+            if os.path.exists(osp.join(self.root_dir, 'posed_images')):
+                info['intrinsics'] = self.get_intrinsics(sample_idx)
+                all_extrinsics = self.get_extrinsics(sample_idx)
+                all_img_paths = self.get_images(sample_idx)
+                # some poses in ScanNet are invalid
+                extrinsics, img_paths = [], []
+                for extrinsic, img_path in zip(all_extrinsics, all_img_paths):
+                    if np.all(np.isfinite(extrinsic)):
+                        img_paths.append(img_path)
+                        extrinsics.append(extrinsic)
+                info['extrinsics'] = extrinsics
+                info['img_paths'] = img_paths
+
+            if not self.test_mode:
+                pts_instance_mask_path = osp.join(
+                    self.root_dir, 'scannet_instance_data',
+                    f'{sample_idx}_ins_label.npy')
+                pts_semantic_mask_path = osp.join(
+                    self.root_dir, 'scannet_instance_data',
+                    f'{sample_idx}_sem_label.npy')
+
+                pts_instance_mask = np.load(pts_instance_mask_path).astype(
+                    np.long)
+                pts_semantic_mask = np.load(pts_semantic_mask_path).astype(
+                    np.long)
+
+                mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
+                mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))
+
+                pts_instance_mask.tofile(
+                    osp.join(self.root_dir, 'instance_mask',
+                             f'{sample_idx}.bin'))
+                pts_semantic_mask.tofile(
+                    osp.join(self.root_dir, 'semantic_mask',
+                             f'{sample_idx}.bin'))
+
+                info['pts_instance_mask_path'] = osp.join(
+                    'instance_mask', f'{sample_idx}.bin')
+                info['pts_semantic_mask_path'] = osp.join(
+                    'semantic_mask', f'{sample_idx}.bin')
+
+            if has_label:
+                annotations = {}
+                # box is of shape [k, 6 + class]
+                aligned_box_label = self.get_aligned_box_label(sample_idx)
+                unaligned_box_label = self.get_unaligned_box_label(sample_idx)
+                annotations['gt_num'] = aligned_box_label.shape[0]
+                if annotations['gt_num'] != 0:
+                    aligned_box = aligned_box_label[:, :-1]  # k, 6
+                    unaligned_box = unaligned_box_label[:, :-1]
+                    classes = aligned_box_label[:, -1]  # k
+                    annotations['name'] = np.array([
+                        self.label2cat[self.cat_ids2class[classes[i]]]
+                        for i in range(annotations['gt_num'])
+                    ])
+                    # default names are given to aligned bbox for compatibility
+                    # we also save unaligned bbox info with marked names
+                    annotations['location'] = aligned_box[:, :3]
+                    annotations['dimensions'] = aligned_box[:, 3:6]
+                    annotations['gt_boxes_upright_depth'] = aligned_box
+                    annotations['unaligned_location'] = unaligned_box[:, :3]
+                    annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
+                    annotations[
+                        'unaligned_gt_boxes_upright_depth'] = unaligned_box
+                    annotations['index'] = np.arange(
+                        annotations['gt_num'], dtype=np.int32)
+                    annotations['class'] = np.array([
+                        self.cat_ids2class[classes[i]]
+                        for i in range(annotations['gt_num'])
+                    ])
+                axis_align_matrix = self.get_axis_align_matrix(sample_idx)
+                annotations['axis_align_matrix'] = axis_align_matrix  # 4x4
+                info['annos'] = annotations
+            return info
+
+        sample_id_list = sample_id_list if sample_id_list is not None \
+            else self.sample_id_list
+        with futures.ThreadPoolExecutor(num_workers) as executor:
+            infos = executor.map(process_single_scene, sample_id_list)
+        return list(infos)
+
+
+class ScanNetSegData(object):
+    """ScanNet dataset used to generate infos for semantic segmentation task.
+
+    Args:
+        data_root (str): Root path of the raw data.
+        ann_file (str): The generated scannet infos.
+        split (str): Set split type of the data. Default: 'train'.
+        num_points (int): Number of points in each data input. Default: 8192.
+        label_weight_func (function): Function to compute the label weight.
+            Default: None.
+    """
+
+    def __init__(self,
+                 data_root,
+                 ann_file,
+                 split='train',
+                 num_points=8192,
+                 label_weight_func=None):
+        self.data_root = data_root
+        self.data_infos = mmcv.load(ann_file)
+        self.split = split
+        assert split in ['train', 'val', 'test']
+        self.num_points = num_points
+
+        self.all_ids = np.arange(41)  # all possible ids
+        self.cat_ids = np.array([
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36,
+            39
+        ])  # used for seg task
+        self.ignore_index = len(self.cat_ids)
+
+        self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \
+            self.ignore_index
+        for i, cat_id in enumerate(self.cat_ids):
+            self.cat_id2class[cat_id] = i
+
+        # label weighting function is taken from
+        # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
+        self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
+            label_weight_func is None else label_weight_func
+
+    def get_seg_infos(self):
+        if self.split == 'test':
+            return
+        scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
+        save_folder = osp.join(self.data_root, 'seg_info')
+        mmcv.mkdir_or_exist(save_folder)
+        np.save(
+            osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
+            scene_idxs)
+        np.save(
+            osp.join(save_folder, f'{self.split}_label_weight.npy'),
+            label_weight)
+        print(f'{self.split} resampled scene index and label weight saved')
+
+    def _convert_to_label(self, mask):
+        """Convert class_id in loaded segmentation mask to label."""
+        if isinstance(mask, str):
+            if mask.endswith('npy'):
+                mask = np.load(mask)
+            else:
+                mask = np.fromfile(mask, dtype=np.long)
+        label = self.cat_id2class[mask]
+        return label
+
+    def get_scene_idxs_and_label_weight(self):
+        """Compute scene_idxs for data sampling and label weight for loss \
+        calculation.
+
+        We sample more times for scenes with more points. Label_weight is
+        inversely proportional to number of class points.
+        """
+        num_classes = len(self.cat_ids)
+        num_point_all = []
+        label_weight = np.zeros((num_classes + 1, ))  # ignore_index
+        for data_info in self.data_infos:
+            label = self._convert_to_label(
+                osp.join(self.data_root, data_info['pts_semantic_mask_path']))
+            num_point_all.append(label.shape[0])
+            class_count, _ = np.histogram(label, range(num_classes + 2))
+            label_weight += class_count
+
+        # repeat scene_idx for num_scene_point // num_sample_point times
+        sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
+        num_iter = int(np.sum(num_point_all) / float(self.num_points))
+        scene_idxs = []
+        for idx in range(len(self.data_infos)):
+            scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
+        scene_idxs = np.array(scene_idxs).astype(np.int32)
+
+        # calculate label weight, adopted from PointNet++
+        label_weight = label_weight[:-1].astype(np.float32)
+        label_weight = label_weight / label_weight.sum()
+        label_weight = self.label_weight_func(label_weight).astype(np.float32)
+
+        return scene_idxs, label_weight
diff --git a/model_examples/MapTR/tools/data_converter/sunrgbd_data_utils.py b/model_examples/MapTR/tools/data_converter/sunrgbd_data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f8a502e901a1ce9f066f8a1546d9d4f4b787b89
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/sunrgbd_data_utils.py
@@ -0,0 +1,221 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from concurrent import futures as futures
+from os import path as osp
+from scipy import io as sio
+
+
+def random_sampling(points, num_points, replace=None, return_choices=False):
+    """Random sampling.
+
+    Sampling point cloud to a certain number of points.
+
+    Args:
+        points (ndarray): Point cloud.
+        num_points (int): The number of samples.
+        replace (bool): Whether the sample is with or without replacement.
+        return_choices (bool): Whether to return choices.
+
+    Returns:
+        points (ndarray): Point cloud after sampling.
+    """
+
+    if replace is None:
+        replace = (points.shape[0] < num_points)
+    choices = np.random.choice(points.shape[0], num_points, replace=replace)
+    if return_choices:
+        return points[choices], choices
+    else:
+        return points[choices]
+
+
+class SUNRGBDInstance(object):
+
+    def __init__(self, line):
+        data = line.split(' ')
+        data[1:] = [float(x) for x in data[1:]]
+        self.classname = data[0]
+        self.xmin = data[1]
+        self.ymin = data[2]
+        self.xmax = data[1] + data[3]
+        self.ymax = data[2] + data[4]
+        self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax])
+        self.centroid = np.array([data[5], data[6], data[7]])
+        self.w = data[8]
+        self.l = data[9]  # noqa: E741
+        self.h = data[10]
+        self.orientation = np.zeros((3, ))
+        self.orientation[0] = data[11]
+        self.orientation[1] = data[12]
+        self.heading_angle = -1 * np.arctan2(self.orientation[1],
+                                             self.orientation[0])
+        self.box3d = np.concatenate([
+            self.centroid,
+            np.array([self.l * 2, self.w * 2, self.h * 2, self.heading_angle])
+        ])
+
+
+class SUNRGBDData(object):
+    """SUNRGBD data.
+
+    Generate scannet infos for sunrgbd_converter.
+
+    Args:
+        root_path (str): Root path of the raw data.
+        split (str): Set split type of the data. Default: 'train'.
+        use_v1 (bool): Whether to use v1. Default: False.
+    """
+
+    def __init__(self, root_path, split='train', use_v1=False):
+        self.root_dir = root_path
+        self.split = split
+        self.split_dir = osp.join(root_path, 'sunrgbd_trainval')
+        self.classes = [
+            'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+            'night_stand', 'bookshelf', 'bathtub'
+        ]
+        self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
+        self.label2cat = {
+            label: self.classes[label]
+            for label in range(len(self.classes))
+        }
+        assert split in ['train', 'val', 'test']
+        split_file = osp.join(self.split_dir, f'{split}_data_idx.txt')
+        mmcv.check_file_exist(split_file)
+        self.sample_id_list = map(int, mmcv.list_from_file(split_file))
+        self.image_dir = osp.join(self.split_dir, 'image')
+        self.calib_dir = osp.join(self.split_dir, 'calib')
+        self.depth_dir = osp.join(self.split_dir, 'depth')
+        if use_v1:
+            self.label_dir = osp.join(self.split_dir, 'label_v1')
+        else:
+            self.label_dir = osp.join(self.split_dir, 'label')
+
+    def __len__(self):
+        return len(self.sample_id_list)
+
+    def get_image(self, idx):
+        img_filename = osp.join(self.image_dir, f'{idx:06d}.jpg')
+        return mmcv.imread(img_filename)
+
+    def get_image_shape(self, idx):
+        image = self.get_image(idx)
+        return np.array(image.shape[:2], dtype=np.int32)
+
+    def get_depth(self, idx):
+        depth_filename = osp.join(self.depth_dir, f'{idx:06d}.mat')
+        depth = sio.loadmat(depth_filename)['instance']
+        return depth
+
+    def get_calibration(self, idx):
+        calib_filepath = osp.join(self.calib_dir, f'{idx:06d}.txt')
+        lines = [line.rstrip() for line in open(calib_filepath)]
+        Rt = np.array([float(x) for x in lines[0].split(' ')])
+        Rt = np.reshape(Rt, (3, 3), order='F').astype(np.float32)
+        K = np.array([float(x) for x in lines[1].split(' ')])
+        K = np.reshape(K, (3, 3), order='F').astype(np.float32)
+        return K, Rt
+
+    def get_label_objects(self, idx):
+        label_filename = osp.join(self.label_dir, f'{idx:06d}.txt')
+        lines = [line.rstrip() for line in open(label_filename)]
+        objects = [SUNRGBDInstance(line) for line in lines]
+        return objects
+
+    def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
+        """Get data infos.
+
+        This method gets information from the raw data.
+
+        Args:
+            num_workers (int): Number of threads to be used. Default: 4.
+            has_label (bool): Whether the data has label. Default: True.
+            sample_id_list (list[int]): Index list of the sample.
+                Default: None.
+
+        Returns:
+            infos (list[dict]): Information of the raw data.
+        """
+
+        def process_single_scene(sample_idx):
+            print(f'{self.split} sample_idx: {sample_idx}')
+            # convert depth to points
+            SAMPLE_NUM = 50000
+            # TODO: Check whether can move the point
+            #  sampling process during training.
+            pc_upright_depth = self.get_depth(sample_idx)
+            pc_upright_depth_subsampled = random_sampling(
+                pc_upright_depth, SAMPLE_NUM)
+
+            info = dict()
+            pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
+            info['point_cloud'] = pc_info
+
+            mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
+            pc_upright_depth_subsampled.tofile(
+                osp.join(self.root_dir, 'points', f'{sample_idx:06d}.bin'))
+
+            info['pts_path'] = osp.join('points', f'{sample_idx:06d}.bin')
+            img_path = osp.join('image', f'{sample_idx:06d}.jpg')
+            image_info = {
+                'image_idx': sample_idx,
+                'image_shape': self.get_image_shape(sample_idx),
+                'image_path': img_path
+            }
+            info['image'] = image_info
+
+            K, Rt = self.get_calibration(sample_idx)
+            calib_info = {'K': K, 'Rt': Rt}
+            info['calib'] = calib_info
+
+            if has_label:
+                obj_list = self.get_label_objects(sample_idx)
+                annotations = {}
+                annotations['gt_num'] = len([
+                    obj.classname for obj in obj_list
+                    if obj.classname in self.cat2label.keys()
+                ])
+                if annotations['gt_num'] != 0:
+                    annotations['name'] = np.array([
+                        obj.classname for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])
+                    annotations['bbox'] = np.concatenate([
+                        obj.box2d.reshape(1, 4) for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ],
+                                                         axis=0)
+                    annotations['location'] = np.concatenate([
+                        obj.centroid.reshape(1, 3) for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ],
+                                                             axis=0)
+                    annotations['dimensions'] = 2 * np.array([
+                        [obj.l, obj.w, obj.h] for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])  # lwh (depth) format
+                    annotations['rotation_y'] = np.array([
+                        obj.heading_angle for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])
+                    annotations['index'] = np.arange(
+                        len(obj_list), dtype=np.int32)
+                    annotations['class'] = np.array([
+                        self.cat2label[obj.classname] for obj in obj_list
+                        if obj.classname in self.cat2label.keys()
+                    ])
+                    annotations['gt_boxes_upright_depth'] = np.stack(
+                        [
+                            obj.box3d for obj in obj_list
+                            if obj.classname in self.cat2label.keys()
+                        ],
+                        axis=0)  # (K,8)
+                info['annos'] = annotations
+            return info
+
+        sample_id_list = sample_id_list if \
+            sample_id_list is not None else self.sample_id_list
+        with futures.ThreadPoolExecutor(num_workers) as executor:
+            infos = executor.map(process_single_scene, sample_id_list)
+        return list(infos)
diff --git a/model_examples/MapTR/tools/data_converter/waymo_converter.py b/model_examples/MapTR/tools/data_converter/waymo_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..94fcae1a3795f3a1e5ff8a31dfa238948ac9e87f
--- /dev/null
+++ b/model_examples/MapTR/tools/data_converter/waymo_converter.py
@@ -0,0 +1,519 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Adapted from `Waymo to KITTI converter
+    <https://github.com/caizhongang/waymo_kitti_converter>`_.
+"""
+
+try:
+    from waymo_open_dataset import dataset_pb2
+except ImportError:
+    raise ImportError(
+        'Please run "pip install waymo-open-dataset-tf-2-2-0==1.2.0" '
+        'to install the official devkit first.')
+
+import mmcv
+import numpy as np
+import tensorflow as tf
+from glob import glob
+from os.path import join
+from waymo_open_dataset.utils import range_image_utils, transform_utils
+from waymo_open_dataset.utils.frame_utils import \
+    parse_range_image_and_camera_projection
+
+
+class Waymo2KITTI(object):
+    """Waymo to KITTI converter.
+
+    This class serves as the converter to change the waymo raw data to KITTI
+    format.
+
+    Args:
+        load_dir (str): Directory to load waymo raw data.
+        save_dir (str): Directory to save data in KITTI format.
+        prefix (str): Prefix of filename. In general, 0 for training, 1 for
+            validation and 2 for testing.
+        workers (str): Number of workers for the parallel process.
+        test_mode (bool): Whether in the test_mode. Default: False.
+    """
+
+    def __init__(self,
+                 load_dir,
+                 save_dir,
+                 prefix,
+                 workers=64,
+                 test_mode=False):
+        self.filter_empty_3dboxes = True
+        self.filter_no_label_zone_points = True
+
+        self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST']
+
+        # Only data collected in specific locations will be converted
+        # If set None, this filter is disabled
+        # Available options: location_sf (main dataset)
+        self.selected_waymo_locations = None
+        self.save_track_id = False
+
+        # turn on eager execution for older tensorflow versions
+        if int(tf.__version__.split('.')[0]) < 2:
+            tf.enable_eager_execution()
+
+        self.lidar_list = [
+            '_FRONT', '_FRONT_RIGHT', '_FRONT_LEFT', '_SIDE_RIGHT',
+            '_SIDE_LEFT'
+        ]
+        self.type_list = [
+            'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST'
+        ]
+        self.waymo_to_kitti_class_map = {
+            'UNKNOWN': 'DontCare',
+            'PEDESTRIAN': 'Pedestrian',
+            'VEHICLE': 'Car',
+            'CYCLIST': 'Cyclist',
+            'SIGN': 'Sign'  # not in kitti
+        }
+
+        self.load_dir = load_dir
+        self.save_dir = save_dir
+        self.prefix = prefix
+        self.workers = int(workers)
+        self.test_mode = test_mode
+
+        self.tfrecord_pathnames = sorted(
+            glob(join(self.load_dir, '*.tfrecord')))
+
+        self.label_save_dir = f'{self.save_dir}/label_'
+        self.label_all_save_dir = f'{self.save_dir}/label_all'
+        self.image_save_dir = f'{self.save_dir}/image_'
+        self.calib_save_dir = f'{self.save_dir}/calib'
+        self.point_cloud_save_dir = f'{self.save_dir}/velodyne'
+        self.pose_save_dir = f'{self.save_dir}/pose'
+
+        self.create_folder()
+
+    def convert(self):
+        """Convert action."""
+        print('Start converting ...')
+        mmcv.track_parallel_progress(self.convert_one, range(len(self)),
+                                     self.workers)
+        print('\nFinished ...')
+
+    def convert_one(self, file_idx):
+        """Convert action for single file.
+
+        Args:
+            file_idx (int): Index of the file to be converted.
+        """
+        pathname = self.tfrecord_pathnames[file_idx]
+        dataset = tf.data.TFRecordDataset(pathname, compression_type='')
+
+        for frame_idx, data in enumerate(dataset):
+
+            if frame_idx % 5 != 0:
+                continue
+            # print(frame_idx)
+            frame = dataset_pb2.Frame()
+            frame.ParseFromString(bytearray(data.numpy()))
+            if (self.selected_waymo_locations is not None
+                    and frame.context.stats.location
+                    not in self.selected_waymo_locations):
+                continue
+
+            self.save_image(frame, file_idx, frame_idx)
+            self.save_calib(frame, file_idx, frame_idx)
+            self.save_lidar(frame, file_idx, frame_idx)
+            self.save_pose(frame, file_idx, frame_idx)
+
+            if not self.test_mode:
+                self.save_label(frame, file_idx, frame_idx)
+
+    def __len__(self):
+        """Length of the filename list."""
+        return len(self.tfrecord_pathnames)
+
+    def save_image(self, frame, file_idx, frame_idx):
+        """Parse and save the images in png format.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame proto.
+            file_idx (int): Current file index.
+            frame_idx (int): Current frame index.
+        """
+        for img in frame.images:
+            img_path = f'{self.image_save_dir}{str(img.name - 1)}/' + \
+                f'{self.prefix}{str(file_idx).zfill(3)}' + \
+                f'{str(frame_idx).zfill(3)}.png'
+            img = mmcv.imfrombytes(img.image)
+            mmcv.imwrite(img, img_path)
+
+    def save_calib(self, frame, file_idx, frame_idx):
+        """Parse and save the calibration data.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame proto.
+            file_idx (int): Current file index.
+            frame_idx (int): Current frame index.
+        """
+        # waymo front camera to kitti reference camera
+        T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
+                                       [1.0, 0.0, 0.0]])
+        camera_calibs = []
+        R0_rect = [f'{i:e}' for i in np.eye(3).flatten()]
+        Tr_velo_to_cams = []
+        calib_context = ''
+
+        for camera in frame.context.camera_calibrations:
+            # extrinsic parameters
+            T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(
+                4, 4)
+            T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle)
+            Tr_velo_to_cam = \
+                self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam
+            if camera.name == 1:  # FRONT = 1, see dataset.proto for details
+                self.T_velo_to_front_cam = Tr_velo_to_cam.copy()
+            Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, ))
+            Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam])
+
+            # intrinsic parameters
+            camera_calib = np.zeros((3, 4))
+            camera_calib[0, 0] = camera.intrinsic[0]
+            camera_calib[1, 1] = camera.intrinsic[1]
+            camera_calib[0, 2] = camera.intrinsic[2]
+            camera_calib[1, 2] = camera.intrinsic[3]
+            camera_calib[2, 2] = 1
+            camera_calib = list(camera_calib.reshape(12))
+            camera_calib = [f'{i:e}' for i in camera_calib]
+            camera_calibs.append(camera_calib)
+
+        # all camera ids are saved as id-1 in the result because
+        # camera 0 is unknown in the proto
+        for i in range(5):
+            calib_context += 'P' + str(i) + ': ' + \
+                ' '.join(camera_calibs[i]) + '\n'
+        calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n'
+        for i in range(5):
+            calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \
+                ' '.join(Tr_velo_to_cams[i]) + '\n'
+
+        with open(
+                f'{self.calib_save_dir}/{self.prefix}' +
+                f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt',
+                'w+') as fp_calib:
+            fp_calib.write(calib_context)
+            fp_calib.close()
+
+    def save_lidar(self, frame, file_idx, frame_idx):
+        """Parse and save the lidar data in psd format.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame proto.
+            file_idx (int): Current file index.
+            frame_idx (int): Current frame index.
+        """
+        range_images, camera_projections, range_image_top_pose = \
+            parse_range_image_and_camera_projection(frame)
+
+        # First return
+        points_0, cp_points_0, intensity_0, elongation_0 = \
+            self.convert_range_image_to_point_cloud(
+                frame,
+                range_images,
+                camera_projections,
+                range_image_top_pose,
+                ri_index=0
+            )
+        points_0 = np.concatenate(points_0, axis=0)
+        intensity_0 = np.concatenate(intensity_0, axis=0)
+        elongation_0 = np.concatenate(elongation_0, axis=0)
+
+        # Second return
+        points_1, cp_points_1, intensity_1, elongation_1 = \
+            self.convert_range_image_to_point_cloud(
+                frame,
+                range_images,
+                camera_projections,
+                range_image_top_pose,
+                ri_index=1
+            )
+        points_1 = np.concatenate(points_1, axis=0)
+        intensity_1 = np.concatenate(intensity_1, axis=0)
+        elongation_1 = np.concatenate(elongation_1, axis=0)
+
+        points = np.concatenate([points_0, points_1], axis=0)
+        intensity = np.concatenate([intensity_0, intensity_1], axis=0)
+        elongation = np.concatenate([elongation_0, elongation_1], axis=0)
+        timestamp = frame.timestamp_micros * np.ones_like(intensity)
+
+        # concatenate x,y,z, intensity, elongation, timestamp (6-dim)
+        point_cloud = np.column_stack(
+            (points, intensity, elongation, timestamp))
+
+        pc_path = f'{self.point_cloud_save_dir}/{self.prefix}' + \
+            f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin'
+        point_cloud.astype(np.float32).tofile(pc_path)
+
+    def save_label(self, frame, file_idx, frame_idx):
+        """Parse and save the label data in txt format.
+        The relation between waymo and kitti coordinates is noteworthy:
+        1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti)
+        2. x-y-z: front-left-up (waymo) -> right-down-front(kitti)
+        3. bbox origin at volumetric center (waymo) -> bottom center (kitti)
+        4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo)
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame proto.
+            file_idx (int): Current file index.
+            frame_idx (int): Current frame index.
+        """
+        fp_label_all = open(
+            f'{self.label_all_save_dir}/{self.prefix}' +
+            f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'w+')
+        id_to_bbox = dict()
+        id_to_name = dict()
+        for labels in frame.projected_lidar_labels:
+            name = labels.name
+            for label in labels.labels:
+                # TODO: need a workaround as bbox may not belong to front cam
+                bbox = [
+                    label.box.center_x - label.box.length / 2,
+                    label.box.center_y - label.box.width / 2,
+                    label.box.center_x + label.box.length / 2,
+                    label.box.center_y + label.box.width / 2
+                ]
+                id_to_bbox[label.id] = bbox
+                id_to_name[label.id] = name - 1
+
+        for obj in frame.laser_labels:
+            bounding_box = None
+            name = None
+            id = obj.id
+            for lidar in self.lidar_list:
+                if id + lidar in id_to_bbox:
+                    bounding_box = id_to_bbox.get(id + lidar)
+                    name = str(id_to_name.get(id + lidar))
+                    break
+
+            if bounding_box is None or name is None:
+                name = '0'
+                bounding_box = (0, 0, 0, 0)
+
+            my_type = self.type_list[obj.type]
+
+            if my_type not in self.selected_waymo_classes:
+                continue
+
+            if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
+                continue
+
+            my_type = self.waymo_to_kitti_class_map[my_type]
+
+            height = obj.box.height
+            width = obj.box.width
+            length = obj.box.length
+
+            x = obj.box.center_x
+            y = obj.box.center_y
+            z = obj.box.center_z - height / 2
+
+            # project bounding box to the virtual reference frame
+            pt_ref = self.T_velo_to_front_cam @ \
+                np.array([x, y, z, 1]).reshape((4, 1))
+            x, y, z, _ = pt_ref.flatten().tolist()
+
+            rotation_y = -obj.box.heading - np.pi / 2
+            track_id = obj.id
+
+            # not available
+            truncated = 0
+            occluded = 0
+            alpha = -10
+
+            line = my_type + \
+                ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format(
+                    round(truncated, 2), occluded, round(alpha, 2),
+                    round(bounding_box[0], 2), round(bounding_box[1], 2),
+                    round(bounding_box[2], 2), round(bounding_box[3], 2),
+                    round(height, 2), round(width, 2), round(length, 2),
+                    round(x, 2), round(y, 2), round(z, 2),
+                    round(rotation_y, 2))
+
+            if self.save_track_id:
+                line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n'
+            else:
+                line_all = line[:-1] + ' ' + name + '\n'
+
+            fp_label = open(
+                f'{self.label_save_dir}{name}/{self.prefix}' +
+                f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'a')
+            fp_label.write(line)
+            fp_label.close()
+
+            fp_label_all.write(line_all)
+
+        fp_label_all.close()
+
+    def save_pose(self, frame, file_idx, frame_idx):
+        """Parse and save the pose data.
+
+        Note that SDC's own pose is not included in the regular training
+        of KITTI dataset. KITTI raw dataset contains ego motion files
+        but are not often used. Pose is important for algorithms that
+        take advantage of the temporal information.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame proto.
+            file_idx (int): Current file index.
+            frame_idx (int): Current frame index.
+        """
+        pose = np.array(frame.pose.transform).reshape(4, 4)
+        np.savetxt(
+            join(f'{self.pose_save_dir}/{self.prefix}' +
+                 f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'),
+            pose)
+
+    def create_folder(self):
+        """Create folder for data preprocessing."""
+        if not self.test_mode:
+            dir_list1 = [
+                self.label_all_save_dir, self.calib_save_dir,
+                self.point_cloud_save_dir, self.pose_save_dir
+            ]
+            dir_list2 = [self.label_save_dir, self.image_save_dir]
+        else:
+            dir_list1 = [
+                self.calib_save_dir, self.point_cloud_save_dir,
+                self.pose_save_dir
+            ]
+            dir_list2 = [self.image_save_dir]
+        for d in dir_list1:
+            mmcv.mkdir_or_exist(d)
+        for d in dir_list2:
+            for i in range(5):
+                mmcv.mkdir_or_exist(f'{d}{str(i)}')
+
+    def convert_range_image_to_point_cloud(self,
+                                           frame,
+                                           range_images,
+                                           camera_projections,
+                                           range_image_top_pose,
+                                           ri_index=0):
+        """Convert range images to point cloud.
+
+        Args:
+            frame (:obj:`Frame`): Open dataset frame.
+            range_images (dict): Mapping from laser_name to list of two
+                range images corresponding with two returns.
+            camera_projections (dict): Mapping from laser_name to list of two
+                camera projections corresponding with two returns.
+            range_image_top_pose (:obj:`Transform`): Range image pixel pose for
+                top lidar.
+            ri_index (int): 0 for the first return, 1 for the second return.
+                Default: 0.
+
+        Returns:
+            tuple[list[np.ndarray]]: (List of points with shape [N, 3],
+                camera projections of points with shape [N, 6], intensity
+                with shape [N, 1], elongation with shape [N, 1]). All the
+                lists have the length of lidar numbers (5).
+        """
+        calibrations = sorted(
+            frame.context.laser_calibrations, key=lambda c: c.name)
+        points = []
+        cp_points = []
+        intensity = []
+        elongation = []
+
+        frame_pose = tf.convert_to_tensor(
+            value=np.reshape(np.array(frame.pose.transform), [4, 4]))
+        # [H, W, 6]
+        range_image_top_pose_tensor = tf.reshape(
+            tf.convert_to_tensor(value=range_image_top_pose.data),
+            range_image_top_pose.shape.dims)
+        # [H, W, 3, 3]
+        range_image_top_pose_tensor_rotation = \
+            transform_utils.get_rotation_matrix(
+                range_image_top_pose_tensor[..., 0],
+                range_image_top_pose_tensor[..., 1],
+                range_image_top_pose_tensor[..., 2])
+        range_image_top_pose_tensor_translation = \
+            range_image_top_pose_tensor[..., 3:]
+        range_image_top_pose_tensor = transform_utils.get_transform(
+            range_image_top_pose_tensor_rotation,
+            range_image_top_pose_tensor_translation)
+        for c in calibrations:
+            range_image = range_images[c.name][ri_index]
+            if len(c.beam_inclinations) == 0:
+                beam_inclinations = range_image_utils.compute_inclination(
+                    tf.constant(
+                        [c.beam_inclination_min, c.beam_inclination_max]),
+                    height=range_image.shape.dims[0])
+            else:
+                beam_inclinations = tf.constant(c.beam_inclinations)
+
+            beam_inclinations = tf.reverse(beam_inclinations, axis=[-1])
+            extrinsic = np.reshape(np.array(c.extrinsic.transform), [4, 4])
+
+            range_image_tensor = tf.reshape(
+                tf.convert_to_tensor(value=range_image.data),
+                range_image.shape.dims)
+            pixel_pose_local = None
+            frame_pose_local = None
+            if c.name == dataset_pb2.LaserName.TOP:
+                pixel_pose_local = range_image_top_pose_tensor
+                pixel_pose_local = tf.expand_dims(pixel_pose_local, axis=0)
+                frame_pose_local = tf.expand_dims(frame_pose, axis=0)
+            range_image_mask = range_image_tensor[..., 0] > 0
+
+            if self.filter_no_label_zone_points:
+                nlz_mask = range_image_tensor[..., 3] != 1.0  # 1.0: in NLZ
+                range_image_mask = range_image_mask & nlz_mask
+
+            range_image_cartesian = \
+                range_image_utils.extract_point_cloud_from_range_image(
+                    tf.expand_dims(range_image_tensor[..., 0], axis=0),
+                    tf.expand_dims(extrinsic, axis=0),
+                    tf.expand_dims(tf.convert_to_tensor(
+                        value=beam_inclinations), axis=0),
+                    pixel_pose=pixel_pose_local,
+                    frame_pose=frame_pose_local)
+
+            range_image_cartesian = tf.squeeze(range_image_cartesian, axis=0)
+            points_tensor = tf.gather_nd(range_image_cartesian,
+                                         tf.compat.v1.where(range_image_mask))
+
+            cp = camera_projections[c.name][ri_index]
+            cp_tensor = tf.reshape(
+                tf.convert_to_tensor(value=cp.data), cp.shape.dims)
+            cp_points_tensor = tf.gather_nd(
+                cp_tensor, tf.compat.v1.where(range_image_mask))
+            points.append(points_tensor.numpy())
+            cp_points.append(cp_points_tensor.numpy())
+
+            intensity_tensor = tf.gather_nd(range_image_tensor[..., 1],
+                                            tf.where(range_image_mask))
+            intensity.append(intensity_tensor.numpy())
+
+            elongation_tensor = tf.gather_nd(range_image_tensor[..., 2],
+                                             tf.where(range_image_mask))
+            elongation.append(elongation_tensor.numpy())
+
+        return points, cp_points, intensity, elongation
+
+    def cart_to_homo(self, mat):
+        """Convert transformation matrix in Cartesian coordinates to
+        homogeneous format.
+
+        Args:
+            mat (np.ndarray): Transformation matrix in Cartesian.
+                The input matrix shape is 3x3 or 3x4.
+
+        Returns:
+            np.ndarray: Transformation matrix in homogeneous format.
+                The matrix shape is 4x4.
+        """
+        ret = np.eye(4)
+        if mat.shape == (3, 3):
+            ret[:3, :3] = mat
+        elif mat.shape == (3, 4):
+            ret[:3, :] = mat
+        else:
+            raise ValueError(mat.shape)
+        return ret
diff --git a/model_examples/MapTR/tools/dist_test.sh b/model_examples/MapTR/tools/dist_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0fb6a1ca62072644ee98e35cf35b8753ba15a8a1
--- /dev/null
+++ b/model_examples/MapTR/tools/dist_test.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+PORT=${PORT:-29503}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox
diff --git a/model_examples/MapTR/tools/dist_test_map.sh b/model_examples/MapTR/tools/dist_test_map.sh
new file mode 100644
index 0000000000000000000000000000000000000000..01ee3b73417caa6230bdcd47f25614c728db7c38
--- /dev/null
+++ b/model_examples/MapTR/tools/dist_test_map.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+PORT=${PORT:-29503}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval chamfer
diff --git a/model_examples/MapTR/tools/dist_train.sh b/model_examples/MapTR/tools/dist_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ea94f6bf4e8ee27dc8d9fa62ea4d8ce982408d78
--- /dev/null
+++ b/model_examples/MapTR/tools/dist_train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28509}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
diff --git a/model_examples/MapTR/tools/fp16/dist_train.sh b/model_examples/MapTR/tools/fp16/dist_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4ac9a15c86100e210feeb6fb9432d88c34492d51
--- /dev/null
+++ b/model_examples/MapTR/tools/fp16/dist_train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28508}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
diff --git a/model_examples/MapTR/tools/fp16/train.py b/model_examples/MapTR/tools/fp16/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6f0270b4ac45fd8aff3f70ba08a08f513f3b4da
--- /dev/null
+++ b/model_examples/MapTR/tools/fp16/train.py
@@ -0,0 +1,271 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import division
+
+import argparse
+import copy
+import mmcv
+import os
+import time
+import torch
+import warnings
+from mmcv import Config, DictAction
+from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model
+from os import path as osp
+
+from mmdet import __version__ as mmdet_version
+from mmdet3d import __version__ as mmdet3d_version
+#from mmdet3d.apis import train_model
+
+from mmdet3d.datasets import build_dataset
+from mmdet3d.models import build_model
+from mmdet3d.utils import collect_env, get_root_logger
+from mmdet.apis import set_random_seed
+from mmseg import __version__ as mmseg_version
+
+from mmcv.utils import TORCH_VERSION, digit_version
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--resume-from', help='the checkpoint file to resume from')
+    parser.add_argument(
+        '--no-validate',
+        action='store_true',
+        help='whether not to evaluate the checkpoint during training')
+    group_gpus = parser.add_mutually_exclusive_group()
+    group_gpus.add_argument(
+        '--gpus',
+        type=int,
+        help='number of gpus to use '
+        '(only applicable to non-distributed training)')
+    group_gpus.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='ids of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=0, help='random seed')
+    parser.add_argument(
+        '--deterministic',
+        action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file (deprecate), '
+        'change to --cfg-options instead.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    parser.add_argument(
+        '--autoscale-lr',
+        action='store_true',
+        help='automatically scale lr with the number of gpus')
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    if args.options and args.cfg_options:
+        raise ValueError(
+            '--options and --cfg-options cannot be both specified, '
+            '--options is deprecated in favor of --cfg-options')
+    if args.options:
+        warnings.warn('--options is deprecated in favor of --cfg-options')
+        args.cfg_options = args.options
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # import modules from string list.
+    if cfg.get('custom_imports', None):
+        from mmcv.utils import import_modules_from_strings
+        import_modules_from_strings(**cfg['custom_imports'])
+
+    # import modules from plguin/xx, registry will be updated
+    if hasattr(cfg, 'plugin'):
+        if cfg.plugin:
+            import importlib
+            if hasattr(cfg, 'plugin_dir'):
+                plugin_dir = cfg.plugin_dir
+                _module_dir = os.path.dirname(plugin_dir)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+            else:
+                # import dir is the dirpath for the config file
+                _module_dir = os.path.dirname(args.config)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+            
+            from projects.mmdet3d_plugin.bevformer.apis import custom_train_model
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+    #if args.resume_from is not None:
+
+    if args.resume_from is not None and osp.isfile(args.resume_from): 
+        cfg.resume_from = args.resume_from
+
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids
+    else:
+        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+    if digit_version(TORCH_VERSION) != digit_version('1.8.1'):
+        cfg.optimizer['type'] = 'AdamW'
+    if args.autoscale_lr:
+        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        assert False, 'DOT NOT SUPPORT!!!'
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+        # re-set gpu_ids with distributed training mode
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)
+
+    # create work_dir
+    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+    # dump config
+    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+    # init the logger before other steps
+    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+    # specify logger name, if we still use 'mmdet', the output info will be
+    # filtered and won't be saved in the log_file
+    # TODO: ugly workaround to judge whether we are training det or seg model
+    if cfg.model.type in ['EncoderDecoder3D']:
+        logger_name = 'mmseg'
+    else:
+        logger_name = 'mmdet'
+    logger = get_root_logger(
+        log_file=log_file, log_level=cfg.log_level, name=logger_name)
+
+    # init the meta dict to record some important information such as
+    # environment info and seed, which will be logged
+    meta = dict()
+    # log env info
+    env_info_dict = collect_env()
+    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+    dash_line = '-' * 60 + '\n'
+    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                dash_line)
+    meta['env_info'] = env_info
+    meta['config'] = cfg.pretty_text
+
+    # log some basic info
+    logger.info(f'Distributed training: {distributed}')
+    logger.info(f'Config:\n{cfg.pretty_text}')
+
+    # set random seeds
+    if args.seed is not None:
+        logger.info(f'Set random seed to {args.seed}, '
+                    f'deterministic: {args.deterministic}')
+        set_random_seed(args.seed, deterministic=args.deterministic)
+    cfg.seed = args.seed
+    meta['seed'] = args.seed
+    meta['exp_name'] = osp.basename(args.config)
+
+    model = build_model(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    model.init_weights()
+
+    eval_model_config = copy.deepcopy(cfg.model)
+    eval_model = build_model(
+        eval_model_config,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(eval_model)
+
+    #eval_model.init_weights()
+    eval_model.load_state_dict(model.state_dict())
+
+    logger.info(f'Model:\n{model}')
+    from projects.mmdet3d_plugin.datasets import custom_build_dataset
+    datasets = [custom_build_dataset(cfg.data.train)]
+    if len(cfg.workflow) == 2:
+        val_dataset = copy.deepcopy(cfg.data.val)
+        # in case we use a dataset wrapper
+        if 'dataset' in cfg.data.train:
+            val_dataset.pipeline = cfg.data.train.dataset.pipeline
+        else:
+            val_dataset.pipeline = cfg.data.train.pipeline
+        # set test_mode=False here in deep copied config
+        # which do not affect AP/AR calculation later
+        # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow  # noqa
+        val_dataset.test_mode = False
+        datasets.append(custom_build_dataset(val_dataset))
+    if cfg.checkpoint_config is not None:
+        # save mmdet version, config file content and class names in
+        # checkpoints as meta data
+        cfg.checkpoint_config.meta = dict(
+            mmdet_version=mmdet_version,
+            mmseg_version=mmseg_version,
+            mmdet3d_version=mmdet3d_version,
+            config=cfg.pretty_text,
+            CLASSES=datasets[0].CLASSES,
+            PALETTE=datasets[0].PALETTE  # for segmentors
+            if hasattr(datasets[0], 'PALETTE') else None)
+    # add an attribute for visualization convenience
+    model.CLASSES = datasets[0].CLASSES
+    custom_train_model(
+        model,
+        datasets,
+        cfg,
+        eval_model=eval_model,
+        distributed=distributed,
+        validate=(not args.no_validate),
+        timestamp=timestamp,
+        meta=meta)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/maptr/benchmark.py b/model_examples/MapTR/tools/maptr/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c9f01d2e032702a4572eae6f659c98b094c5878
--- /dev/null
+++ b/model_examples/MapTR/tools/maptr/benchmark.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import time
+import torch
+from mmcv import Config
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import load_checkpoint, wrap_fp16_model
+import sys
+import os 
+sys.path.append('.')
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from projects.mmdet3d_plugin.datasets import custom_build_dataset
+# from mmdet3d.datasets import build_dataloader, build_dataset
+from mmdet3d.models import build_detector
+#from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+    parser.add_argument('--samples', default=2000, help='samples to benchmark')
+    parser.add_argument(
+        '--log-interval', default=50, help='interval of logging')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase'
+        'the inference speed')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if hasattr(cfg, 'plugin'):
+        if cfg.plugin:
+            import importlib
+            if hasattr(cfg, 'plugin_dir'):
+                plugin_dir = cfg.plugin_dir
+                _module_dir = os.path.dirname(plugin_dir)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+            else:
+                # import dir is the dirpath for the config file
+                _module_dir = os.path.dirname(args.config)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+
+    # build the dataloader
+    # TODO: support multiple images per gpu (only minor changes are needed)
+    print(cfg.data.test)
+    dataset = custom_build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=1,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=False,
+        shuffle=False)
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print('***********number of params:', n_parameters)
+
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    if args.checkpoint is not None:
+        load_checkpoint(model, args.checkpoint, map_location='cpu')
+    #if args.fuse_conv_bn:
+    #    model = fuse_module(model)
+
+    model = MMDataParallel(model, device_ids=[0])
+
+    model.eval()
+
+    # the first several iterations may be very slow so skip them
+    num_warmup = 5
+    pure_inf_time = 0
+
+    # benchmark with several samples and take the average
+    for i, data in enumerate(data_loader):
+        torch.cuda.synchronize()
+        start_time = time.perf_counter()
+        with torch.no_grad():
+            model(return_loss=False, rescale=True, **data)
+
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - start_time
+
+        if i >= num_warmup:
+            pure_inf_time += elapsed
+            if (i + 1) % args.log_interval == 0:
+                fps = (i + 1 - num_warmup) / pure_inf_time
+                print(f'Done image [{i + 1:<3}/ {args.samples}], '
+                      f'fps: {fps:.1f} img / s')
+
+        if (i + 1) == args.samples:
+            pure_inf_time += elapsed
+            fps = (i + 1 - num_warmup) / pure_inf_time
+            print(f'Overall fps: {fps:.1f} img / s')
+            break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/maptr/generate_papervis.py b/model_examples/MapTR/tools/maptr/generate_papervis.py
new file mode 100644
index 0000000000000000000000000000000000000000..79c884b369fddd44ab28697c546954af19d1b637
--- /dev/null
+++ b/model_examples/MapTR/tools/maptr/generate_papervis.py
@@ -0,0 +1,104 @@
+import os.path as osp
+import argparse
+import os
+import glob
+import cv2
+import mmcv
+CAMS = ['FRONT_LEFT','FRONT','FRONT_RIGHT',
+             'BACK_LEFT','BACK','BACK_RIGHT',]
+VIEWS_NAME = 'surroud_view.jpg'
+GT_MAP_NAME = 'GT_fixednum_pts_MAP.png'
+PRED_MAP_NAME = 'PRED_MAP_plot.png'
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='vis hdmaptr map gt label')
+    parser.add_argument('visdir', help='visualize directory')
+    # parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--sample-name', default='SAMPLE_VIS.png', type=str)
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_args()
+    parent_dir = osp.join(args.visdir,'..')
+    vis_subdir_list = []
+
+    file_list = os.listdir(args.visdir)
+    prog_bar = mmcv.ProgressBar(len(file_list))
+    for file in file_list:
+        file_path = osp.join(args.visdir, file) 
+        if os.path.isdir(file_path):
+            vis_subdir_list.append(file_path)
+            sample_path = osp.join(file_path,args.sample_name)
+            row_1_list = []
+            for cam in CAMS[:3]:
+                cam_img_name = 'CAM_'+ cam + '.jpg'
+                cam_img = cv2.imread(osp.join(file_path, cam_img_name))
+                # import pdb;pdb.set_trace()
+                lw = 8
+                tf = max(lw - 1, 1)
+                w, h = cv2.getTextSize(cam, 0, fontScale=lw / 3, thickness=tf)[0]  # text width, height
+                p1 = (0,0)
+                p2 = (w,h+3)
+                color=(0, 0, 0)
+                txt_color=(255, 255, 255)
+                cv2.rectangle(cam_img, p1, p2, color, -1, cv2.LINE_AA)  # filled
+                cv2.putText(cam_img,
+                            cam, (p1[0], p1[1] + h + 2),
+                            0,
+                            lw / 3,
+                            txt_color,
+                            thickness=tf,
+                            lineType=cv2.LINE_AA)
+                row_1_list.append(cam_img)
+            row_2_list = []
+            for cam in CAMS[3:]:
+                cam_img_name = 'CAM_'+ cam + '.jpg'
+                cam_img = cv2.imread(osp.join(file_path, cam_img_name))
+                if cam == 'BACK':
+                    cam_img = cv2.flip(cam_img, 1)
+                # import pdb;pdb.set_trace()
+                lw = 8
+                tf = max(lw - 1, 1)
+                w, h = cv2.getTextSize(cam, 0, fontScale=lw / 3, thickness=tf)[0]  # text width, height
+                p1 = (0,0)
+                p2 = (w,h+3)
+                color=(0, 0, 0)
+                txt_color=(255, 255, 255)
+                cv2.rectangle(cam_img, p1, p2, color, -1, cv2.LINE_AA)  # filled
+                cv2.putText(cam_img,
+                            cam, (p1[0], p1[1] + h + 2),
+                            0,
+                            lw / 3,
+                            txt_color,
+                            thickness=tf,
+                            lineType=cv2.LINE_AA)
+                row_2_list.append(cam_img)
+            row_1_img=cv2.hconcat(row_1_list)
+            row_2_img=cv2.hconcat(row_2_list)
+            cams_img = cv2.vconcat([row_1_img,row_2_img])
+
+
+            map_img = cv2.imread(osp.join(file_path,PRED_MAP_NAME))
+            gt_map_img = cv2.imread(osp.join(file_path,GT_MAP_NAME))
+            map_img = cv2.copyMakeBorder(map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0)
+            gt_map_img = cv2.copyMakeBorder(gt_map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0)
+
+            cams_h,cam_w,_ = cams_img.shape
+            map_h,map_w,_ = map_img.shape
+            resize_ratio = cams_h / map_h
+            resized_w = map_w * resize_ratio
+            resized_map_img = cv2.resize(map_img,(int(resized_w),int(cams_h)))
+            resized_gt_map_img = cv2.resize(gt_map_img,(int(resized_w),int(cams_h)))
+
+
+
+            sample_img = cv2.hconcat([cams_img, resized_map_img,resized_gt_map_img])
+            cv2.imwrite(sample_path, sample_img)
+        prog_bar.update()
+    print('DONE!')
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/model_examples/MapTR/tools/maptr/generate_video.py b/model_examples/MapTR/tools/maptr/generate_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..145af87c137d2979d770734064bd0e6e06dd8db3
--- /dev/null
+++ b/model_examples/MapTR/tools/maptr/generate_video.py
@@ -0,0 +1,131 @@
+import os.path as osp
+import argparse
+import os
+import glob
+import cv2
+import mmcv
+CAMS = ['FRONT_LEFT','FRONT','FRONT_RIGHT',
+             'BACK_LEFT','BACK','BACK_RIGHT',]
+GT_MAP_NAME = 'GT_fixednum_pts_MAP.png'
+PRED_MAP_NAME = 'PRED_MAP_plot.png'
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='vis hdmaptr map gt label')
+    parser.add_argument('visdir', help='visualize directory')
+    # parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--fps', default=10, type=int, help='fps to generate video')
+    parser.add_argument('--video-name', default='demo',type=str)
+    parser.add_argument('--sample-name', default='SAMPLE_VIS.jpg', type=str)
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_args()
+    parent_dir = osp.join(args.visdir,'..')
+    vis_subdir_list = []
+    # import pdb;pdb.set_trace()
+    size = (1680,450)
+    # fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    # fourcc = cv2.VideoWriter_fourcc(*'MP4V')
+    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
+    video_path = osp.join(parent_dir,'%s.mp4' % args.video_name)
+    video = cv2.VideoWriter(video_path, fourcc, args.fps, size, True)
+    file_list = os.listdir(args.visdir)
+    prog_bar = mmcv.ProgressBar(len(file_list))
+    for file in file_list:
+        file_path = osp.join(args.visdir, file) 
+        if os.path.isdir(file_path):
+            vis_subdir_list.append(file_path)
+            sample_path = osp.join(file_path,args.sample_name)
+            row_1_list = []
+            for cam in CAMS[:3]:
+                cam_img_name = 'CAM_'+ cam + '.jpg'
+                cam_img = cv2.imread(osp.join(file_path, cam_img_name))
+                lw = 8
+                tf = max(lw - 1, 1)
+                w, h = cv2.getTextSize(cam, 0, fontScale=lw / 3, thickness=tf)[0]  # text width, height
+                p1 = (0,0)
+                p2 = (w,h+3)
+                color=(0, 0, 0)
+                txt_color=(255, 255, 255)
+                cv2.rectangle(cam_img, p1, p2, color, -1, cv2.LINE_AA)  # filled
+                cv2.putText(cam_img,
+                            cam, (p1[0], p1[1] + h + 2),
+                            0,
+                            lw / 3,
+                            txt_color,
+                            thickness=tf,
+                            lineType=cv2.LINE_AA)
+                row_1_list.append(cam_img)
+            row_2_list = []
+            for cam in CAMS[3:]:
+                cam_img_name = 'CAM_'+cam + '.jpg'
+                cam_img = cv2.imread(osp.join(file_path, cam_img_name))
+                if cam == 'BACK':
+                    cam_img = cv2.flip(cam_img, 1)
+                lw = 8
+                tf = max(lw - 1, 1)
+                w, h = cv2.getTextSize(cam, 0, fontScale=lw / 3, thickness=tf)[0]  # text width, height
+                p1 = (0,0)
+                p2 = (w,h+3)
+                color=(0, 0, 0)
+                txt_color=(255, 255, 255)
+                cv2.rectangle(cam_img, p1, p2, color, -1, cv2.LINE_AA)  # filled
+                cv2.putText(cam_img,
+                            cam, (p1[0], p1[1] + h + 2),
+                            0,
+                            lw / 3,
+                            txt_color,
+                            thickness=tf,
+                            lineType=cv2.LINE_AA)
+                row_2_list.append(cam_img)
+            row_1_img=cv2.hconcat(row_1_list)
+            row_2_img=cv2.hconcat(row_2_list)
+            cams_img = cv2.vconcat([row_1_img,row_2_img])
+
+            map_img = cv2.imread(osp.join(file_path,PRED_MAP_NAME))
+            gt_map_img = cv2.imread(osp.join(file_path,GT_MAP_NAME))
+
+            map_img = cv2.copyMakeBorder(map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0)
+            gt_map_img = cv2.copyMakeBorder(gt_map_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0)
+
+
+            cams_h,cam_w,_ = cams_img.shape
+            map_h,map_w,_ = map_img.shape
+            resize_ratio = cams_h / map_h
+            resized_w = map_w * resize_ratio
+            resized_map_img = cv2.resize(map_img,(int(resized_w),int(cams_h)))
+            resized_gt_map_img = cv2.resize(gt_map_img,(int(resized_w),int(cams_h)))
+
+            # font
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            # fontScale
+            fontScale = 2
+            # Line thickness of 2 px
+            thickness = 5
+            # org
+            org = (20, 50)      
+            # Blue color in BGR
+            color = (0, 0, 255)
+            # Using cv2.putText() method
+            resized_map_img = cv2.putText(resized_map_img, 'PRED', org, font, 
+                            fontScale, color, thickness, cv2.LINE_AA)
+            resized_gt_map_img = cv2.putText(resized_gt_map_img, 'GT', org, font, 
+                            fontScale, color, thickness, cv2.LINE_AA)
+            
+            sample_img = cv2.hconcat([cams_img, resized_map_img, resized_gt_map_img])
+            cv2.imwrite(sample_path, sample_img,[cv2.IMWRITE_JPEG_QUALITY, 70])
+            # import pdb;pdb.set_trace()
+            resized_img = cv2.resize(sample_img,size)
+
+            video.write(resized_img)
+        prog_bar.update()
+    # import pdb;pdb.set_trace()
+    video.release()
+
+    cv2.destroyAllWindows()
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/model_examples/MapTR/tools/maptr/test.py b/model_examples/MapTR/tools/maptr/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a73e83ac8396424e93f8ac33d5f9ec12c64350e
--- /dev/null
+++ b/model_examples/MapTR/tools/maptr/test.py
@@ -0,0 +1,220 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+import argparse
+import mmcv
+import os
+import torch
+import warnings
+from mmcv import Config, DictAction
+from mmcv.cnn import fuse_conv_bn
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
+                         wrap_fp16_model)
+
+from mmdet3d.apis import single_gpu_test
+from mmdet3d.datasets import build_dataset
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from mmdet3d.models import build_model
+from mmdet.apis import set_random_seed
+from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test
+from mmdet.datasets import replace_ImageToTensor
+import time
+import os.path as osp
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('respath', help='respath file')
+    parser.add_argument('--out', help='output result file in pickle format')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase'
+        'the inference speed')
+    parser.add_argument(
+        '--format-only',
+        action='store_true',
+        help='Format the output results without perform evaluation. It is'
+        'useful when you want to format the result to a specific format and '
+        'submit it to the test server')
+    parser.add_argument(
+        '--eval',
+        type=str,
+        nargs='+',
+        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
+        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--show-dir', help='directory where results will be saved')
+    parser.add_argument(
+        '--gpu-collect',
+        action='store_true',
+        help='whether to use gpu to collect results.')
+    parser.add_argument(
+        '--tmpdir',
+        help='tmp directory used for collecting results from multiple '
+        'workers, available when gpu-collect is not specified')
+    parser.add_argument('--seed', type=int, default=0, help='random seed')
+    parser.add_argument(
+        '--deterministic',
+        action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--options',
+        nargs='+',
+        action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function (deprecate), '
+        'change to --eval-options instead.')
+    parser.add_argument(
+        '--eval-options',
+        nargs='+',
+        action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    if args.options and args.eval_options:
+        raise ValueError(
+            '--options and --eval-options cannot be both specified, '
+            '--options is deprecated in favor of --eval-options')
+    if args.options:
+        warnings.warn('--options is deprecated in favor of --eval-options')
+        args.eval_options = args.options
+    return args
+
+
+def main():
+    args = parse_args()
+
+    assert args.out or args.eval or args.format_only or args.show \
+        or args.show_dir, \
+        ('Please specify at least one operation (save/eval/format/show the '
+         'results / save the results) with the argument "--out", "--eval"'
+         ', "--format-only", "--show" or "--show-dir"')
+
+    if args.eval and args.format_only:
+        raise ValueError('--eval and --format_only cannot be both specified')
+
+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # import modules from string list.
+    if cfg.get('custom_imports', None):
+        from mmcv.utils import import_modules_from_strings
+        import_modules_from_strings(**cfg['custom_imports'])
+
+    # import modules from plguin/xx, registry will be updated
+    if hasattr(cfg, 'plugin'):
+        if cfg.plugin:
+            import importlib
+            if hasattr(cfg, 'plugin_dir'):
+                plugin_dir = cfg.plugin_dir
+                _module_dir = os.path.dirname(plugin_dir)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+            else:
+                # import dir is the dirpath for the config file
+                _module_dir = os.path.dirname(args.config)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    cfg.model.pretrained = None
+    # in case the test dataset is concatenated
+    samples_per_gpu = 1
+    if isinstance(cfg.data.test, dict):
+        cfg.data.test.test_mode = True
+        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+        if samples_per_gpu > 1:
+            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+            cfg.data.test.pipeline = replace_ImageToTensor(
+                cfg.data.test.pipeline)
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            ds_cfg.test_mode = True
+        samples_per_gpu = max(
+            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+        if samples_per_gpu > 1:
+            for ds_cfg in cfg.data.test:
+                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+
+    # set random seeds
+    if args.seed is not None:
+        set_random_seed(args.seed, deterministic=args.deterministic)
+
+    # build the dataloader
+    dataset = build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=samples_per_gpu,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=distributed,
+        shuffle=False,
+        nonshuffler_sampler=cfg.data.nonshuffler_sampler,
+    )
+
+    rank, _ = get_dist_info()
+    if rank == 0:
+
+        if args.eval:
+            eval_kwargs = cfg.get('evaluation', {}).copy()
+            # hard-code way to remove EvalHook args
+            for key in [
+                    'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
+                    'rule'
+            ]:
+                eval_kwargs.pop(key, None)
+            kwargs = {}
+            eval_kwargs.update(dict(metric=args.eval, **kwargs))
+
+            print(dataset._evaluate_single(args.respath, metric=args.eval))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/maptr/utils/__init__.py b/model_examples/MapTR/tools/maptr/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/model_examples/MapTR/tools/maptr/vis_pred.py b/model_examples/MapTR/tools/maptr/vis_pred.py
new file mode 100644
index 0000000000000000000000000000000000000000..84ccbb321b14c3fd7bffc8430ca60cf36a51c25f
--- /dev/null
+++ b/model_examples/MapTR/tools/maptr/vis_pred.py
@@ -0,0 +1,394 @@
+import argparse
+import mmcv
+import os
+import shutil
+import torch
+import warnings
+from mmcv import Config, DictAction
+from mmcv.cnn import fuse_conv_bn
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
+                         wrap_fp16_model)
+from mmdet3d.utils import collect_env, get_root_logger
+from mmdet3d.apis import single_gpu_test
+from mmdet3d.datasets import build_dataset
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from mmdet3d.models import build_model
+from mmdet.apis import set_random_seed
+from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test
+from mmdet.datasets import replace_ImageToTensor
+import time
+import os.path as osp
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
+from matplotlib import transforms
+from matplotlib.patches import Rectangle
+import cv2
+
+CAMS = ['CAM_FRONT_LEFT','CAM_FRONT','CAM_FRONT_RIGHT',
+             'CAM_BACK_LEFT','CAM_BACK','CAM_BACK_RIGHT',]
+# we choose these samples not because it is easy but because it is hard
+CANDIDATE=['n008-2018-08-01-15-16-36-0400_1533151184047036',
+           'n008-2018-08-01-15-16-36-0400_1533151200646853',
+           'n008-2018-08-01-15-16-36-0400_1533151274047332',
+           'n008-2018-08-01-15-16-36-0400_1533151369947807',
+           'n008-2018-08-01-15-16-36-0400_1533151581047647',
+           'n008-2018-08-01-15-16-36-0400_1533151585447531',
+           'n008-2018-08-01-15-16-36-0400_1533151741547700',
+           'n008-2018-08-01-15-16-36-0400_1533151854947676',
+           'n008-2018-08-22-15-53-49-0400_1534968048946931',
+           'n008-2018-08-22-15-53-49-0400_1534968255947662',
+           'n008-2018-08-01-15-16-36-0400_1533151616447606',
+           'n015-2018-07-18-11-41-49+0800_1531885617949602',
+           'n008-2018-08-28-16-43-51-0400_1535489136547616',
+           'n008-2018-08-28-16-43-51-0400_1535489145446939',
+           'n008-2018-08-28-16-43-51-0400_1535489152948944',
+           'n008-2018-08-28-16-43-51-0400_1535489299547057',
+           'n008-2018-08-28-16-43-51-0400_1535489317946828',
+           'n008-2018-09-18-15-12-01-0400_1537298038950431',
+           'n008-2018-09-18-15-12-01-0400_1537298047650680',
+           'n008-2018-09-18-15-12-01-0400_1537298056450495',
+           'n008-2018-09-18-15-12-01-0400_1537298074700410',
+           'n008-2018-09-18-15-12-01-0400_1537298088148941',
+           'n008-2018-09-18-15-12-01-0400_1537298101700395',
+           'n015-2018-11-21-19-21-35+0800_1542799330198603',
+           'n015-2018-11-21-19-21-35+0800_1542799345696426',
+           'n015-2018-11-21-19-21-35+0800_1542799353697765',
+           'n015-2018-11-21-19-21-35+0800_1542799525447813',
+           'n015-2018-11-21-19-21-35+0800_1542799676697935',
+           'n015-2018-11-21-19-21-35+0800_1542799758948001',
+           ]
+
+def perspective(cam_coords, proj_mat):
+    pix_coords = proj_mat @ cam_coords
+    valid_idx = pix_coords[2, :] > 0
+    pix_coords = pix_coords[:, valid_idx]
+    pix_coords = pix_coords[:2, :] / (pix_coords[2, :] + 1e-7)
+    pix_coords = pix_coords.transpose(1, 0)
+    return pix_coords
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='vis hdmaptr map gt label')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--score-thresh', default=0.4, type=float, help='samples to visualize')
+    parser.add_argument(
+        '--show-dir', help='directory where visualizations will be saved')
+    parser.add_argument('--show-cam', action='store_true', help='show camera pic')
+    parser.add_argument(
+        '--gt-format',
+        type=str,
+        nargs='+',
+        default=['fixed_num_pts',],
+        help='vis format, default should be "points",'
+        'support ["se_pts","bbox","fixed_num_pts","polyline_pts"]')
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+
+    # import modules from plguin/xx, registry will be updated
+    if hasattr(cfg, 'plugin'):
+        if cfg.plugin:
+            import importlib
+            if hasattr(cfg, 'plugin_dir'):
+                plugin_dir = cfg.plugin_dir
+                _module_dir = os.path.dirname(plugin_dir)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+            else:
+                # import dir is the dirpath for the config file
+                _module_dir = os.path.dirname(args.config)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    cfg.model.pretrained = None
+    # in case the test dataset is concatenated
+    samples_per_gpu = 1
+    if isinstance(cfg.data.test, dict):
+        cfg.data.test.test_mode = True
+        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+        if samples_per_gpu > 1:
+            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+            cfg.data.test.pipeline = replace_ImageToTensor(
+                cfg.data.test.pipeline)
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            ds_cfg.test_mode = True
+        samples_per_gpu = max(
+            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+        if samples_per_gpu > 1:
+            for ds_cfg in cfg.data.test:
+                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+    if args.show_dir is None:
+        args.show_dir = osp.join('./work_dirs', 
+                                osp.splitext(osp.basename(args.config))[0],
+                                'vis_pred')
+    # create vis_label dir
+    mmcv.mkdir_or_exist(osp.abspath(args.show_dir))
+    cfg.dump(osp.join(args.show_dir, osp.basename(args.config)))
+    logger = get_root_logger()
+    logger.info(f'DONE create vis_pred dir: {args.show_dir}')
+
+
+    dataset = build_dataset(cfg.data.test)
+    dataset.is_vis_on_test = True #TODO, this is a hack
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=samples_per_gpu,
+        # workers_per_gpu=cfg.data.workers_per_gpu,
+        workers_per_gpu=0,
+        dist=False,
+        shuffle=False,
+        nonshuffler_sampler=cfg.data.nonshuffler_sampler,
+    )
+    logger.info('Done build test data set')
+
+    # build the model and load checkpoint
+    # import pdb;pdb.set_trace()
+    cfg.model.train_cfg = None
+    # cfg.model.pts_bbox_head.bbox_coder.max_num=15 # TODO this is a hack
+    model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    logger.info('loading check point')
+    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
+    if 'CLASSES' in checkpoint.get('meta', {}):
+        model.CLASSES = checkpoint['meta']['CLASSES']
+    else:
+        model.CLASSES = dataset.CLASSES
+    # palette for visualization in segmentation tasks
+    if 'PALETTE' in checkpoint.get('meta', {}):
+        model.PALETTE = checkpoint['meta']['PALETTE']
+    elif hasattr(dataset, 'PALETTE'):
+        # segmentation dataset has `PALETTE` attribute
+        model.PALETTE = dataset.PALETTE
+    logger.info('DONE load check point')
+    model = MMDataParallel(model, device_ids=[0])
+    model.eval()
+
+    img_norm_cfg = cfg.img_norm_cfg
+
+    # get denormalized param
+    mean = np.array(img_norm_cfg['mean'],dtype=np.float32)
+    std = np.array(img_norm_cfg['std'],dtype=np.float32)
+    to_bgr = img_norm_cfg['to_rgb']
+
+    # get pc_range
+    pc_range = cfg.point_cloud_range
+
+    # get car icon
+    with Image.open('./figs/lidar_car.png') as car_img:
+    
+        # get color map: divider->r, ped->b, boundary->g
+        colors_plt = ['orange', 'b', 'g']
+
+
+
+    logger.info('BEGIN vis test dataset samples gt label & pred')
+
+
+
+    bbox_results = []
+    mask_results = []
+    dataset = data_loader.dataset
+    have_mask = False
+    # prog_bar = mmcv.ProgressBar(len(CANDIDATE))
+    prog_bar = mmcv.ProgressBar(len(dataset))
+    # import pdb;pdb.set_trace()
+    for i, data in enumerate(data_loader):
+        if ~(data['gt_labels_3d'].data[0][0] != -1).any():
+            # import pdb;pdb.set_trace()
+            logger.error(f'\n empty gt for index {i}, continue')
+            # prog_bar.update()  
+            continue
+       
+        
+        img = data['img'][0].data[0]
+        img_metas = data['img_metas'][0].data[0]
+        gt_bboxes_3d = data['gt_bboxes_3d'].data[0]
+        gt_labels_3d = data['gt_labels_3d'].data[0]
+
+        pts_filename = img_metas[0]['pts_filename']
+        pts_filename = osp.basename(pts_filename)
+        pts_filename = pts_filename.replace('__LIDAR_TOP__', '_').split('.')[0]
+        # import pdb;pdb.set_trace()
+        # if pts_filename not in CANDIDATE:
+        #     continue
+
+        with torch.no_grad():
+            result = model(return_loss=False, rescale=True, **data)
+        sample_dir = osp.join(args.show_dir, pts_filename)
+        mmcv.mkdir_or_exist(osp.abspath(sample_dir))
+
+        filename_list = img_metas[0]['filename']
+        img_path_dict = {}
+        # save cam img for sample
+        for filepath in filename_list:
+            filename = osp.basename(filepath)
+            filename_splits = filename.split('__')
+            # sample_dir = filename_splits[0]
+            # sample_dir = osp.join(args.show_dir, sample_dir)
+            # mmcv.mkdir_or_exist(osp.abspath(sample_dir))
+            img_name = filename_splits[1] + '.jpg'
+            img_path = osp.join(sample_dir,img_name)
+            # img_path_list.append(img_path)
+            shutil.copyfile(filepath,img_path)
+            img_path_dict[filename_splits[1]] = img_path
+         
+        # surrounding view
+        row_1_list = []
+        for cam in CAMS[:3]:
+            cam_img_name = cam + '.jpg'
+            cam_img = cv2.imread(osp.join(sample_dir, cam_img_name))
+            row_1_list.append(cam_img)
+        row_2_list = []
+        for cam in CAMS[3:]:
+            cam_img_name = cam + '.jpg'
+            cam_img = cv2.imread(osp.join(sample_dir, cam_img_name))
+            row_2_list.append(cam_img)
+        row_1_img=cv2.hconcat(row_1_list)
+        row_2_img=cv2.hconcat(row_2_list)
+        cams_img = cv2.vconcat([row_1_img,row_2_img])
+        cams_img_path = osp.join(sample_dir,'surroud_view.jpg')
+        cv2.imwrite(cams_img_path, cams_img,[cv2.IMWRITE_JPEG_QUALITY, 70])
+        
+        for vis_format in args.gt_format:
+            if vis_format == 'se_pts':
+                gt_line_points = gt_bboxes_3d[0].start_end_points
+                for gt_bbox_3d, gt_label_3d in zip(gt_line_points, gt_labels_3d[0]):
+                    pts = gt_bbox_3d.reshape(-1,2).numpy()
+                    x = np.array([pt[0] for pt in pts])
+                    y = np.array([pt[1] for pt in pts])
+                    plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1], scale_units='xy', angles='xy', scale=1, color=colors_plt[gt_label_3d])
+            elif vis_format == 'bbox':
+                gt_lines_bbox = gt_bboxes_3d[0].bbox
+                for gt_bbox_3d, gt_label_3d in zip(gt_lines_bbox, gt_labels_3d[0]):
+                    gt_bbox_3d = gt_bbox_3d.numpy()
+                    xy = (gt_bbox_3d[0],gt_bbox_3d[1])
+                    width = gt_bbox_3d[2] - gt_bbox_3d[0]
+                    height = gt_bbox_3d[3] - gt_bbox_3d[1]
+                    # import pdb;pdb.set_trace()
+                    plt.gca().add_patch(Rectangle(xy,width,height,linewidth=0.4,edgecolor=colors_plt[gt_label_3d],facecolor='none'))
+                    # plt.Rectangle(xy, width, height,color=colors_plt[gt_label_3d])
+                # continue
+            elif vis_format == 'fixed_num_pts':
+                plt.figure(figsize=(2, 4))
+                plt.xlim(pc_range[0], pc_range[3])
+                plt.ylim(pc_range[1], pc_range[4])
+                plt.axis('off')
+                # gt_bboxes_3d[0].fixed_num=30 #TODO, this is a hack
+                gt_lines_fixed_num_pts = gt_bboxes_3d[0].fixed_num_sampled_points
+                for gt_bbox_3d, gt_label_3d in zip(gt_lines_fixed_num_pts, gt_labels_3d[0]):
+                    # import pdb;pdb.set_trace() 
+                    pts = gt_bbox_3d.numpy()
+                    x = np.array([pt[0] for pt in pts])
+                    y = np.array([pt[1] for pt in pts])
+                    # plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1], scale_units='xy', angles='xy', scale=1, color=colors_plt[gt_label_3d])
+
+                    
+                    plt.plot(x, y, color=colors_plt[gt_label_3d],linewidth=1,alpha=0.8,zorder=-1)
+                    plt.scatter(x, y, color=colors_plt[gt_label_3d],s=2,alpha=0.8,zorder=-1)
+                    # plt.plot(x, y, color=colors_plt[gt_label_3d])
+                    # plt.scatter(x, y, color=colors_plt[gt_label_3d],s=1)
+                plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5])
+
+                gt_fixedpts_map_path = osp.join(sample_dir, 'GT_fixednum_pts_MAP.png')
+                plt.savefig(gt_fixedpts_map_path, bbox_inches='tight', format='png',dpi=1200)
+                plt.close()   
+            elif vis_format == 'polyline_pts':
+                plt.figure(figsize=(2, 4))
+                plt.xlim(pc_range[0], pc_range[3])
+                plt.ylim(pc_range[1], pc_range[4])
+                plt.axis('off')
+                gt_lines_instance = gt_bboxes_3d[0].instance_list
+                # import pdb;pdb.set_trace()
+                for gt_line_instance, gt_label_3d in zip(gt_lines_instance, gt_labels_3d[0]):
+                    pts = np.array(list(gt_line_instance.coords))
+                    x = np.array([pt[0] for pt in pts])
+                    y = np.array([pt[1] for pt in pts])
+                    
+                    # plt.quiver(x[:-1], y[:-1], x[1:] - x[:-1], y[1:] - y[:-1], scale_units='xy', angles='xy', scale=1, color=colors_plt[gt_label_3d])
+
+                    # plt.plot(x, y, color=colors_plt[gt_label_3d])
+                    plt.plot(x, y, color=colors_plt[gt_label_3d],linewidth=1,alpha=0.8,zorder=-1)
+                    plt.scatter(x, y, color=colors_plt[gt_label_3d],s=1,alpha=0.8,zorder=-1)
+                plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5])
+
+                gt_polyline_map_path = osp.join(sample_dir, 'GT_polyline_pts_MAP.png')
+                plt.savefig(gt_polyline_map_path, bbox_inches='tight', format='png',dpi=1200)
+                plt.close()           
+
+            else: 
+                logger.error(f'WRONG visformat for GT: {vis_format}')
+                raise ValueError(f'WRONG visformat for GT: {vis_format}')
+
+
+        # import pdb;pdb.set_trace()
+        plt.figure(figsize=(2, 4))
+        plt.xlim(pc_range[0], pc_range[3])
+        plt.ylim(pc_range[1], pc_range[4])
+        plt.axis('off')
+
+        # visualize pred
+        # import pdb;pdb.set_trace()
+        result_dic = result[0]['pts_bbox']
+        boxes_3d = result_dic['boxes_3d'] # bbox: xmin, ymin, xmax, ymax
+        scores_3d = result_dic['scores_3d']
+        labels_3d = result_dic['labels_3d']
+        pts_3d = result_dic['pts_3d']
+        keep = scores_3d > args.score_thresh
+
+        plt.figure(figsize=(2, 4))
+        plt.xlim(pc_range[0], pc_range[3])
+        plt.ylim(pc_range[1], pc_range[4])
+        plt.axis('off')
+        for pred_score_3d, pred_bbox_3d, pred_label_3d, pred_pts_3d in zip(scores_3d[keep], boxes_3d[keep],labels_3d[keep], pts_3d[keep]):
+
+            pred_pts_3d = pred_pts_3d.numpy()
+            pts_x = pred_pts_3d[:,0]
+            pts_y = pred_pts_3d[:,1]
+            plt.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],linewidth=1,alpha=0.8,zorder=-1)
+            plt.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],s=1,alpha=0.8,zorder=-1)
+
+
+            pred_bbox_3d = pred_bbox_3d.numpy()
+            xy = (pred_bbox_3d[0],pred_bbox_3d[1])
+            width = pred_bbox_3d[2] - pred_bbox_3d[0]
+            height = pred_bbox_3d[3] - pred_bbox_3d[1]
+            pred_score_3d = float(pred_score_3d)
+            pred_score_3d = round(pred_score_3d, 2)
+            s = str(pred_score_3d)
+
+
+
+        plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5])
+
+        map_path = osp.join(sample_dir, 'PRED_MAP_plot.png')
+        plt.savefig(map_path, bbox_inches='tight', format='png',dpi=1200)
+        plt.close()
+
+        
+        prog_bar.update()
+
+    logger.info('\n DONE vis test dataset samples gt label & pred')
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/misc/browse_dataset.py b/model_examples/MapTR/tools/misc/browse_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3419f66df56679088469a842cd62e31906df8a1
--- /dev/null
+++ b/model_examples/MapTR/tools/misc/browse_dataset.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import numpy as np
+import warnings
+from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress
+from os import path as osp
+
+from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode,
+                               DepthInstance3DBoxes, LiDARInstance3DBoxes)
+from mmdet3d.core.visualizer import (show_multi_modality_result, show_result,
+                                     show_seg_result)
+from mmdet3d.datasets import build_dataset
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Browse a dataset')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--skip-type',
+        type=str,
+        nargs='+',
+        default=['Normalize'],
+        help='skip some useless pipeline')
+    parser.add_argument(
+        '--output-dir',
+        default=None,
+        type=str,
+        help='If there is no display interface, you can save it')
+    parser.add_argument(
+        '--task',
+        type=str,
+        choices=['det', 'seg', 'multi_modality-det', 'mono-det'],
+        help='Determine the visualization method depending on the task.')
+    parser.add_argument(
+        '--online',
+        action='store_true',
+        help='Whether to perform online visualization. Note that you often '
+        'need a monitor to do so.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+
+def build_data_cfg(config_path, skip_type, cfg_options):
+    """Build data config for loading visualization data."""
+    cfg = Config.fromfile(config_path)
+    if cfg_options is not None:
+        cfg.merge_from_dict(cfg_options)
+    # import modules from string list.
+    if cfg.get('custom_imports', None):
+        from mmcv.utils import import_modules_from_strings
+        import_modules_from_strings(**cfg['custom_imports'])
+    # extract inner dataset of `RepeatDataset` as `cfg.data.train`
+    # so we don't need to worry about it later
+    if cfg.data.train['type'] == 'RepeatDataset':
+        cfg.data.train = cfg.data.train.dataset
+    # use only first dataset for `ConcatDataset`
+    if cfg.data.train['type'] == 'ConcatDataset':
+        cfg.data.train = cfg.data.train.datasets[0]
+    train_data_cfg = cfg.data.train
+    # eval_pipeline purely consists of loading functions
+    # use eval_pipeline for data loading
+    train_data_cfg['pipeline'] = [
+        x for x in cfg.eval_pipeline if x['type'] not in skip_type
+    ]
+
+    return cfg
+
+
+def to_depth_mode(points, bboxes):
+    """Convert points and bboxes to Depth Coord and Depth Box mode."""
+    if points is not None:
+        points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR,
+                                           Coord3DMode.DEPTH)
+    if bboxes is not None:
+        bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR,
+                                   Box3DMode.DEPTH)
+    return points, bboxes
+
+
+def show_det_data(idx, dataset, out_dir, filename, show=False):
+    """Visualize 3D point cloud and 3D bboxes."""
+    example = dataset.prepare_train_data(idx)
+    points = example['points']._data.numpy()
+    gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor
+    if dataset.box_mode_3d != Box3DMode.DEPTH:
+        points, gt_bboxes = to_depth_mode(points, gt_bboxes)
+    show_result(
+        points,
+        gt_bboxes.clone(),
+        None,
+        out_dir,
+        filename,
+        show=show,
+        snapshot=True)
+
+
+def show_seg_data(idx, dataset, out_dir, filename, show=False):
+    """Visualize 3D point cloud and segmentation mask."""
+    example = dataset.prepare_train_data(idx)
+    points = example['points']._data.numpy()
+    gt_seg = example['pts_semantic_mask']._data.numpy()
+    show_seg_result(
+        points,
+        gt_seg.copy(),
+        None,
+        out_dir,
+        filename,
+        np.array(dataset.PALETTE),
+        dataset.ignore_index,
+        show=show,
+        snapshot=True)
+
+
+def show_proj_bbox_img(idx,
+                       dataset,
+                       out_dir,
+                       filename,
+                       show=False,
+                       is_nus_mono=False):
+    """Visualize 3D bboxes on 2D image by projection."""
+    try:
+        example = dataset.prepare_train_data(idx)
+    except AttributeError:  # for Mono-3D datasets
+        example = dataset.prepare_train_img(idx)
+    gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d']
+    img_metas = example['img_metas']._data
+    img = example['img']._data.numpy()
+    # need to transpose channel to first dim
+    img = img.transpose(1, 2, 0)
+    # no 3D gt bboxes, just show img
+    if gt_bboxes.tensor.shape[0] == 0:
+        gt_bboxes = None
+    if isinstance(gt_bboxes, DepthInstance3DBoxes):
+        show_multi_modality_result(
+            img,
+            gt_bboxes,
+            None,
+            None,
+            out_dir,
+            filename,
+            box_mode='depth',
+            img_metas=img_metas,
+            show=show)
+    elif isinstance(gt_bboxes, LiDARInstance3DBoxes):
+        show_multi_modality_result(
+            img,
+            gt_bboxes,
+            None,
+            img_metas['lidar2img'],
+            out_dir,
+            filename,
+            box_mode='lidar',
+            img_metas=img_metas,
+            show=show)
+    elif isinstance(gt_bboxes, CameraInstance3DBoxes):
+        show_multi_modality_result(
+            img,
+            gt_bboxes,
+            None,
+            img_metas['cam2img'],
+            out_dir,
+            filename,
+            box_mode='camera',
+            img_metas=img_metas,
+            show=show)
+    else:
+        # can't project, just show img
+        warnings.warn(
+            f'unrecognized gt box type {type(gt_bboxes)}, only show image')
+        show_multi_modality_result(
+            img, None, None, None, out_dir, filename, show=show)
+
+
+def main():
+    args = parse_args()
+
+    if args.output_dir is not None:
+        mkdir_or_exist(args.output_dir)
+
+    cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options)
+    try:
+        dataset = build_dataset(
+            cfg.data.train, default_args=dict(filter_empty_gt=False))
+    except TypeError:  # seg dataset doesn't have `filter_empty_gt` key
+        dataset = build_dataset(cfg.data.train)
+    data_infos = dataset.data_infos
+    dataset_type = cfg.dataset_type
+
+    # configure visualization mode
+    vis_task = args.task  # 'det', 'seg', 'multi_modality-det', 'mono-det'
+
+    for idx, data_info in enumerate(track_iter_progress(data_infos)):
+        if dataset_type in ['KittiDataset', 'WaymoDataset']:
+            data_path = data_info['point_cloud']['velodyne_path']
+        elif dataset_type in [
+                'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset',
+                'S3DISSegDataset', 'S3DISDataset'
+        ]:
+            data_path = data_info['pts_path']
+        elif dataset_type in ['NuScenesDataset', 'LyftDataset']:
+            data_path = data_info['lidar_path']
+        elif dataset_type in ['NuScenesMonoDataset']:
+            data_path = data_info['file_name']
+        else:
+            raise NotImplementedError(
+                f'unsupported dataset type {dataset_type}')
+
+        file_name = osp.splitext(osp.basename(data_path))[0]
+
+        if vis_task in ['det', 'multi_modality-det']:
+            # show 3D bboxes on 3D point clouds
+            show_det_data(
+                idx, dataset, args.output_dir, file_name, show=args.online)
+        if vis_task in ['multi_modality-det', 'mono-det']:
+            # project 3D bboxes to 2D image
+            show_proj_bbox_img(
+                idx,
+                dataset,
+                args.output_dir,
+                file_name,
+                show=args.online,
+                is_nus_mono=(dataset_type == 'NuScenesMonoDataset'))
+        elif vis_task in ['seg']:
+            # show 3D segmentation mask on 3D point clouds
+            show_seg_data(
+                idx, dataset, args.output_dir, file_name, show=args.online)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/misc/fuse_conv_bn.py b/model_examples/MapTR/tools/misc/fuse_conv_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4e22018d66d3bd47119522e9da2ea6676ba5760
--- /dev/null
+++ b/model_examples/MapTR/tools/misc/fuse_conv_bn.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import torch
+from mmcv.runner import save_checkpoint
+from torch import nn as nn
+
+from mmdet.apis import init_model
+
+
+def fuse_conv_bn(conv, bn):
+    """During inference, the functionary of batch norm layers is turned off but
+    only the mean and var alone channels are used, which exposes the chance to
+    fuse it with the preceding conv layers to save computations and simplify
+    network structures."""
+    conv_w = conv.weight
+    conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
+        bn.running_mean)
+
+    factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
+    conv.weight = nn.Parameter(conv_w *
+                               factor.reshape([conv.out_channels, 1, 1, 1]))
+    conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
+    return conv
+
+
+def fuse_module(m):
+    last_conv = None
+    last_conv_name = None
+
+    for name, child in m.named_children():
+        if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)):
+            if last_conv is None:  # only fuse BN that is after Conv
+                continue
+            fused_conv = fuse_conv_bn(last_conv, child)
+            m._modules[last_conv_name] = fused_conv
+            # To reduce changes, set BN as Identity instead of deleting it.
+            m._modules[name] = nn.Identity()
+            last_conv = None
+        elif isinstance(child, nn.Conv2d):
+            last_conv = child
+            last_conv_name = name
+        else:
+            fuse_module(child)
+    return m
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='fuse Conv and BN layers in a model')
+    parser.add_argument('config', help='config file path')
+    parser.add_argument('checkpoint', help='checkpoint file path')
+    parser.add_argument('out', help='output path of the converted model')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    # build the model from a config file and a checkpoint file
+    model = init_model(args.config, args.checkpoint)
+    # fuse conv and bn layers of the model
+    fused_model = fuse_module(model)
+    save_checkpoint(fused_model, args.out)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/misc/print_config.py b/model_examples/MapTR/tools/misc/print_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3100fc324b375330ba10316d71405c535d91fb7b
--- /dev/null
+++ b/model_examples/MapTR/tools/misc/print_config.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+from mmcv import Config, DictAction
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Print the whole config')
+    parser.add_argument('config', help='config file path')
+    parser.add_argument(
+        '--options', nargs='+', action=DictAction, help='arguments in dict')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.options is not None:
+        cfg.merge_from_dict(args.options)
+    print(f'Config:\n{cfg.pretty_text}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/misc/visualize_results.py b/model_examples/MapTR/tools/misc/visualize_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..302adc50eca960a6660104b33521d438cf54faa0
--- /dev/null
+++ b/model_examples/MapTR/tools/misc/visualize_results.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import mmcv
+from mmcv import Config
+
+from mmdet3d.datasets import build_dataset
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet3D visualize the results')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--result', help='results file in pickle format')
+    parser.add_argument(
+        '--show-dir', help='directory where visualize results will be saved')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.result is not None and \
+            not args.result.endswith(('.pkl', '.pickle')):
+        raise ValueError('The results file must be a pkl file.')
+
+    cfg = Config.fromfile(args.config)
+    cfg.data.test.test_mode = True
+
+    # build the dataset
+    dataset = build_dataset(cfg.data.test)
+    results = mmcv.load(args.result)
+
+    if getattr(dataset, 'show', None) is not None:
+        # data loading pipeline for showing
+        eval_pipeline = cfg.get('eval_pipeline', {})
+        if eval_pipeline:
+            dataset.show(results, args.show_dir, pipeline=eval_pipeline)
+        else:
+            dataset.show(results, args.show_dir)  # use default pipeline
+    else:
+        raise NotImplementedError(
+            'Show is not implemented for dataset {}!'.format(
+                type(dataset).__name__))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/model_converters/convert_votenet_checkpoints.py b/model_examples/MapTR/tools/model_converters/convert_votenet_checkpoints.py
new file mode 100644
index 0000000000000000000000000000000000000000..33792b00ddd96790acdcdf6ba9d8caf9da39b637
--- /dev/null
+++ b/model_examples/MapTR/tools/model_converters/convert_votenet_checkpoints.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+import torch
+from mmcv import Config
+from mmcv.runner import load_state_dict
+
+from mmdet3d.models import build_detector
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet3D upgrade model version(before v0.6.0) of VoteNet')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--out', help='path of the output checkpoint file')
+    args = parser.parse_args()
+    return args
+
+
+def parse_config(config_strings):
+    """Parse config from strings.
+
+    Args:
+        config_strings (string): strings of model config.
+
+    Returns:
+        Config: model config
+    """
+    temp_file = tempfile.NamedTemporaryFile()
+    config_path = f'{temp_file.name}.py'
+    with open(config_path, 'w') as f:
+        f.write(config_strings)
+
+    config = Config.fromfile(config_path)
+
+    # Update backbone config
+    if 'pool_mod' in config.model.backbone:
+        config.model.backbone.pop('pool_mod')
+
+    if 'sa_cfg' not in config.model.backbone:
+        config.model.backbone['sa_cfg'] = dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)
+
+    if 'type' not in config.model.bbox_head.vote_aggregation_cfg:
+        config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule'
+
+    # Update bbox_head config
+    if 'pred_layer_cfg' not in config.model.bbox_head:
+        config.model.bbox_head['pred_layer_cfg'] = dict(
+            in_channels=128, shared_conv_channels=(128, 128), bias=True)
+
+    if 'feat_channels' in config.model.bbox_head:
+        config.model.bbox_head.pop('feat_channels')
+
+    if 'vote_moudule_cfg' in config.model.bbox_head:
+        config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop(
+            'vote_moudule_cfg')
+
+    if config.model.bbox_head.vote_aggregation_cfg.use_xyz:
+        config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3
+
+    temp_file.close()
+
+    return config
+
+
+def main():
+    """Convert keys in checkpoints for VoteNet.
+
+    There can be some breaking changes during the development of mmdetection3d,
+    and this tool is used for upgrading checkpoints trained with old versions
+    (before v0.6.0) to the latest one.
+    """
+    args = parse_args()
+    checkpoint = torch.load(args.checkpoint)
+    cfg = parse_config(checkpoint['meta']['config'])
+    # Build the model and load checkpoint
+    model = build_detector(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    orig_ckpt = checkpoint['state_dict']
+    converted_ckpt = orig_ckpt.copy()
+
+    if cfg['dataset_type'] == 'ScanNetDataset':
+        NUM_CLASSES = 18
+    elif cfg['dataset_type'] == 'SUNRGBDDataset':
+        NUM_CLASSES = 10
+    else:
+        raise NotImplementedError
+
+    RENAME_PREFIX = {
+        'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0',
+        'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1'
+    }
+
+    DEL_KEYS = [
+        'bbox_head.conv_pred.0.bn.num_batches_tracked',
+        'bbox_head.conv_pred.1.bn.num_batches_tracked'
+    ]
+
+    EXTRACT_KEYS = {
+        'bbox_head.conv_pred.conv_cls.weight':
+        ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]),
+        'bbox_head.conv_pred.conv_cls.bias':
+        ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]),
+        'bbox_head.conv_pred.conv_reg.weight':
+        ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]),
+        'bbox_head.conv_pred.conv_reg.bias':
+        ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)])
+    }
+
+    # Delete some useless keys
+    for key in DEL_KEYS:
+        converted_ckpt.pop(key)
+
+    # Rename keys with specific prefix
+    RENAME_KEYS = dict()
+    for old_key in converted_ckpt.keys():
+        for rename_prefix in RENAME_PREFIX.keys():
+            if rename_prefix in old_key:
+                new_key = old_key.replace(rename_prefix,
+                                          RENAME_PREFIX[rename_prefix])
+                RENAME_KEYS[new_key] = old_key
+    for new_key, old_key in RENAME_KEYS.items():
+        converted_ckpt[new_key] = converted_ckpt.pop(old_key)
+
+    # Extract weights and rename the keys
+    for new_key, (old_key, indices) in EXTRACT_KEYS.items():
+        cur_layers = orig_ckpt[old_key]
+        converted_layers = []
+        for (start, end) in indices:
+            if end != -1:
+                converted_layers.append(cur_layers[start:end])
+            else:
+                converted_layers.append(cur_layers[start:])
+        converted_layers = torch.cat(converted_layers, 0)
+        converted_ckpt[new_key] = converted_layers
+        if old_key in converted_ckpt.keys():
+            converted_ckpt.pop(old_key)
+
+    # Check the converted checkpoint by loading to the model
+    load_state_dict(model, converted_ckpt, strict=True)
+    checkpoint['state_dict'] = converted_ckpt
+    torch.save(checkpoint, args.out)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/model_converters/publish_model.py b/model_examples/MapTR/tools/model_converters/publish_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..318fd46a65894575f5f3e915672b18d24ba133d8
--- /dev/null
+++ b/model_examples/MapTR/tools/model_converters/publish_model.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    args = parser.parse_args()
+    return args
+
+
+def process_checkpoint(in_file, out_file):
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    if 'optimizer' in checkpoint:
+        del checkpoint['optimizer']
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    torch.save(checkpoint, out_file)
+    sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', out_file, final_file])
+
+
+def main():
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/model_converters/regnet2mmdet.py b/model_examples/MapTR/tools/model_converters/regnet2mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dee3c878abc94c1298dcea6856e432a77339665
--- /dev/null
+++ b/model_examples/MapTR/tools/model_converters/regnet2mmdet.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import torch
+from collections import OrderedDict
+
+
+def convert_stem(model_key, model_weight, state_dict, converted_names):
+    new_key = model_key.replace('stem.conv', 'conv1')
+    new_key = new_key.replace('stem.bn', 'bn1')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_head(model_key, model_weight, state_dict, converted_names):
+    new_key = model_key.replace('head.fc', 'fc')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+    print(f'Convert {model_key} to {new_key}')
+
+
+def convert_reslayer(model_key, model_weight, state_dict, converted_names):
+    split_keys = model_key.split('.')
+    layer, block, module = split_keys[:3]
+    block_id = int(block[1:])
+    layer_name = f'layer{int(layer[1:])}'
+    block_name = f'{block_id - 1}'
+
+    if block_id == 1 and module == 'bn':
+        new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}'
+    elif block_id == 1 and module == 'proj':
+        new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}'
+    elif module == 'f':
+        if split_keys[3] == 'a_bn':
+            module_name = 'bn1'
+        elif split_keys[3] == 'b_bn':
+            module_name = 'bn2'
+        elif split_keys[3] == 'c_bn':
+            module_name = 'bn3'
+        elif split_keys[3] == 'a':
+            module_name = 'conv1'
+        elif split_keys[3] == 'b':
+            module_name = 'conv2'
+        elif split_keys[3] == 'c':
+            module_name = 'conv3'
+        new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}'
+    else:
+        raise ValueError(f'Unsupported conversion of key {model_key}')
+    print(f'Convert {model_key} to {new_key}')
+    state_dict[new_key] = model_weight
+    converted_names.add(model_key)
+
+
+def convert(src, dst):
+    """Convert keys in pycls pretrained RegNet models to mmdet style."""
+    # load caffe model
+    regnet_model = torch.load(src)
+    blobs = regnet_model['model_state']
+    # convert to pytorch style
+    state_dict = OrderedDict()
+    converted_names = set()
+    for key, weight in blobs.items():
+        if 'stem' in key:
+            convert_stem(key, weight, state_dict, converted_names)
+        elif 'head' in key:
+            convert_head(key, weight, state_dict, converted_names)
+        elif key.startswith('s'):
+            convert_reslayer(key, weight, state_dict, converted_names)
+
+    # check if all layers are converted
+    for key in blobs:
+        if key not in converted_names:
+            print(f'not converted: {key}')
+    # save checkpoint
+    checkpoint = dict()
+    checkpoint['state_dict'] = state_dict
+    torch.save(checkpoint, dst)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert model keys')
+    parser.add_argument('src', help='src detectron model path')
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+    convert(args.src, args.dst)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/test.py b/model_examples/MapTR/tools/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2cf4501870d6a12f4d14a613ebbb86578aa5ac
--- /dev/null
+++ b/model_examples/MapTR/tools/test.py
@@ -0,0 +1,262 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+import argparse
+import mmcv
+import os
+import torch
+import warnings
+from mmcv import Config, DictAction
+from mmcv.cnn import fuse_conv_bn
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import (get_dist_info, init_dist, load_checkpoint,
+                         wrap_fp16_model)
+
+from mmdet3d.apis import single_gpu_test
+from mmdet3d.datasets import build_dataset
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from mmdet3d.models import build_model
+from mmdet.apis import set_random_seed
+from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test
+from mmdet.datasets import replace_ImageToTensor
+import time
+import os.path as osp
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='MMDet test (and eval) a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument('--out', help='output result file in pickle format')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase'
+        'the inference speed')
+    parser.add_argument(
+        '--format-only',
+        action='store_true',
+        help='Format the output results without perform evaluation. It is'
+        'useful when you want to format the result to a specific format and '
+        'submit it to the test server')
+    parser.add_argument(
+        '--eval',
+        type=str,
+        nargs='+',
+        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
+        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
+    parser.add_argument('--show', action='store_true', help='show results')
+    parser.add_argument(
+        '--show-dir', help='directory where results will be saved')
+    parser.add_argument(
+        '--gpu-collect',
+        action='store_true',
+        help='whether to use gpu to collect results.')
+    parser.add_argument(
+        '--tmpdir',
+        help='tmp directory used for collecting results from multiple '
+        'workers, available when gpu-collect is not specified')
+    parser.add_argument('--seed', type=int, default=0, help='random seed')
+    parser.add_argument(
+        '--deterministic',
+        action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--options',
+        nargs='+',
+        action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function (deprecate), '
+        'change to --eval-options instead.')
+    parser.add_argument(
+        '--eval-options',
+        nargs='+',
+        action=DictAction,
+        help='custom options for evaluation, the key-value pair in xxx=yyy '
+        'format will be kwargs for dataset.evaluate() function')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    if args.options and args.eval_options:
+        raise ValueError(
+            '--options and --eval-options cannot be both specified, '
+            '--options is deprecated in favor of --eval-options')
+    if args.options:
+        warnings.warn('--options is deprecated in favor of --eval-options')
+        args.eval_options = args.options
+    return args
+
+
+def main():
+    args = parse_args()
+
+    assert args.out or args.eval or args.format_only or args.show \
+        or args.show_dir, \
+        ('Please specify at least one operation (save/eval/format/show the '
+         'results / save the results) with the argument "--out", "--eval"'
+         ', "--format-only", "--show" or "--show-dir"')
+
+    if args.eval and args.format_only:
+        raise ValueError('--eval and --format_only cannot be both specified')
+
+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # import modules from string list.
+    if cfg.get('custom_imports', None):
+        from mmcv.utils import import_modules_from_strings
+        import_modules_from_strings(**cfg['custom_imports'])
+
+    # import modules from plguin/xx, registry will be updated
+    if hasattr(cfg, 'plugin'):
+        if cfg.plugin:
+            import importlib
+            if hasattr(cfg, 'plugin_dir'):
+                plugin_dir = cfg.plugin_dir
+                _module_dir = os.path.dirname(plugin_dir)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+            else:
+                # import dir is the dirpath for the config file
+                _module_dir = os.path.dirname(args.config)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    cfg.model.pretrained = None
+    # in case the test dataset is concatenated
+    samples_per_gpu = 1
+    if isinstance(cfg.data.test, dict):
+        cfg.data.test.test_mode = True
+        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
+        if samples_per_gpu > 1:
+            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
+            cfg.data.test.pipeline = replace_ImageToTensor(
+                cfg.data.test.pipeline)
+    elif isinstance(cfg.data.test, list):
+        for ds_cfg in cfg.data.test:
+            ds_cfg.test_mode = True
+        samples_per_gpu = max(
+            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
+        if samples_per_gpu > 1:
+            for ds_cfg in cfg.data.test:
+                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+
+    # set random seeds
+    if args.seed is not None:
+        set_random_seed(args.seed, deterministic=args.deterministic)
+
+    # build the dataloader
+    dataset = build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=samples_per_gpu,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=distributed,
+        shuffle=False,
+        nonshuffler_sampler=cfg.data.nonshuffler_sampler,
+    )
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu')
+    if args.fuse_conv_bn:
+        model = fuse_conv_bn(model)
+    # old versions did not save class info in checkpoints, this walkaround is
+    # for backward compatibility
+    if 'CLASSES' in checkpoint.get('meta', {}):
+        model.CLASSES = checkpoint['meta']['CLASSES']
+    else:
+        model.CLASSES = dataset.CLASSES
+    # palette for visualization in segmentation tasks
+    if 'PALETTE' in checkpoint.get('meta', {}):
+        model.PALETTE = checkpoint['meta']['PALETTE']
+    elif hasattr(dataset, 'PALETTE'):
+        # segmentation dataset has `PALETTE` attribute
+        model.PALETTE = dataset.PALETTE
+
+    if not distributed:
+        assert False
+        # model = MMDataParallel(model, device_ids=[0])
+        # outputs = single_gpu_test(model, data_loader, args.show, args.show_dir)
+    else:
+        model = MMDistributedDataParallel(
+            model.cuda(),
+            device_ids=[torch.cuda.current_device()],
+            broadcast_buffers=False)
+        outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir,
+                                        args.gpu_collect)
+
+    rank, _ = get_dist_info()
+    if rank == 0:
+        if args.out:
+            print(f'\nwriting results to {args.out}')
+            assert False
+            #mmcv.dump(outputs['bbox_results'], args.out)
+        kwargs = {} if args.eval_options is None else args.eval_options
+        kwargs['jsonfile_prefix'] = osp.join('test', args.config.split(
+            '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_'))
+        if args.format_only:
+            dataset.format_results(outputs, **kwargs)
+
+        if args.eval:
+            eval_kwargs = cfg.get('evaluation', {}).copy()
+            # hard-code way to remove EvalHook args
+            for key in [
+                    'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best',
+                    'rule'
+            ]:
+                eval_kwargs.pop(key, None)
+            eval_kwargs.update(dict(metric=args.eval, **kwargs))
+
+            print(dataset.evaluate(outputs, **eval_kwargs))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/model_examples/MapTR/tools/train.py b/model_examples/MapTR/tools/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1760945cdb3e93d5bbb29cd2787dd74a8f237a67
--- /dev/null
+++ b/model_examples/MapTR/tools/train.py
@@ -0,0 +1,265 @@
+# ---------------------------------------------
+# Copyright (c) OpenMMLab. All rights reserved.
+# ---------------------------------------------
+#  Modified by Zhiqi Li
+# ---------------------------------------------
+ 
+from __future__ import division
+
+import argparse
+import copy
+import mmcv
+import os
+import time
+import torch
+import torch_npu #3
+import warnings
+from mmcv import Config, DictAction
+from mmcv.runner import get_dist_info, init_dist
+from os import path as osp
+
+from mmdet import __version__ as mmdet_version
+from mmdet3d import __version__ as mmdet3d_version
+#from mmdet3d.apis import train_model
+
+from mmdet3d.datasets import build_dataset
+from mmdet3d.models import build_model
+from mmdet3d.utils import collect_env, get_root_logger
+from mmdet.apis import set_random_seed
+from mmseg import __version__ as mmseg_version
+
+from mmcv.utils import TORCH_VERSION, digit_version
+
+from torch_npu.contrib import transfer_to_npu
+
+torch.npu.config.allow_internal_format = False #3
+torch.npu.set_compile_mode(jit_compile=False) #3
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument('--work-dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--resume-from', help='the checkpoint file to resume from')
+    parser.add_argument(
+        '--no-validate',
+        action='store_true',
+        help='whether not to evaluate the checkpoint during training')
+    group_gpus = parser.add_mutually_exclusive_group()
+    group_gpus.add_argument(
+        '--gpus',
+        type=int,
+        help='number of gpus to use '
+        '(only applicable to non-distributed training)')
+    group_gpus.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='ids of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=0, help='random seed')
+    parser.add_argument(
+        '--deterministic',
+        action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file (deprecate), '
+        'change to --cfg-options instead.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    parser.add_argument('--local-rank', type=int, default=0)
+    parser.add_argument(
+        '--autoscale-lr',
+        action='store_true',
+        help='automatically scale lr with the number of gpus')
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    if args.options and args.cfg_options:
+        raise ValueError(
+            '--options and --cfg-options cannot be both specified, '
+            '--options is deprecated in favor of --cfg-options')
+    if args.options:
+        warnings.warn('--options is deprecated in favor of --cfg-options')
+        args.cfg_options = args.options
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # import modules from string list.
+    if cfg.get('custom_imports', None):
+        from mmcv.utils import import_modules_from_strings
+        import_modules_from_strings(**cfg['custom_imports'])
+
+    # import modules from plguin/xx, registry will be updated
+    if hasattr(cfg, 'plugin'):
+        if cfg.plugin:
+            import importlib
+            if hasattr(cfg, 'plugin_dir'):
+                plugin_dir = cfg.plugin_dir
+                _module_dir = os.path.dirname(plugin_dir)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+            else:
+                # import dir is the dirpath for the config file
+                _module_dir = os.path.dirname(args.config)
+                _module_dir = _module_dir.split('/')
+                _module_path = _module_dir[0]
+                for m in _module_dir[1:]:
+                    _module_path = _module_path + '.' + m
+                print(_module_path)
+                plg_lib = importlib.import_module(_module_path)
+
+            from projects.mmdet3d_plugin.bevformer.apis.train import custom_train_model
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    # work_dir is determined in this priority: CLI > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+    # if args.resume_from is not None:
+    if args.resume_from is not None and osp.isfile(args.resume_from):
+        cfg.resume_from = args.resume_from
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids
+    else:
+        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+    if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW':
+        cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw
+    if args.autoscale_lr:
+        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+        # re-set gpu_ids with distributed training mode
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)
+
+    # create work_dir
+    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+    # dump config
+    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+    # init the logger before other steps
+    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+    # specify logger name, if we still use 'mmdet', the output info will be
+    # filtered and won't be saved in the log_file
+    # TODO: ugly workaround to judge whether we are training det or seg model
+    if cfg.model.type in ['EncoderDecoder3D']:
+        logger_name = 'mmseg'
+    else:
+        logger_name = 'mmdet'
+    logger = get_root_logger(
+        log_file=log_file, log_level=cfg.log_level, name=logger_name)
+
+    # init the meta dict to record some important information such as
+    # environment info and seed, which will be logged
+    meta = dict()
+    # log env info
+    env_info_dict = collect_env()
+    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+    dash_line = '-' * 60 + '\n'
+    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                dash_line)
+    meta['env_info'] = env_info
+    meta['config'] = cfg.pretty_text
+
+    # log some basic info
+    logger.info(f'Distributed training: {distributed}')
+    logger.info(f'Config:\n{cfg.pretty_text}')
+
+    # set random seeds
+    if args.seed is not None:
+        logger.info(f'Set random seed to {args.seed}, '
+                    f'deterministic: {args.deterministic}')
+        set_random_seed(args.seed, deterministic=args.deterministic)
+    cfg.seed = args.seed
+    meta['seed'] = args.seed
+    meta['exp_name'] = osp.basename(args.config)
+
+    model = build_model(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    model.init_weights()
+
+    logger.info(f'Model:\n{model}')
+    datasets = [build_dataset(cfg.data.train)]
+    if len(cfg.workflow) == 2:
+        val_dataset = copy.deepcopy(cfg.data.val)
+        # in case we use a dataset wrapper
+        if 'dataset' in cfg.data.train:
+            val_dataset.pipeline = cfg.data.train.dataset.pipeline
+        else:
+            val_dataset.pipeline = cfg.data.train.pipeline
+        # set test_mode=False here in deep copied config
+        # which do not affect AP/AR calculation later
+        # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow  # noqa
+        val_dataset.test_mode = False
+        datasets.append(build_dataset(val_dataset))
+    if cfg.checkpoint_config is not None:
+        # save mmdet version, config file content and class names in
+        # checkpoints as meta data
+        cfg.checkpoint_config.meta = dict(
+            mmdet_version=mmdet_version,
+            mmseg_version=mmseg_version,
+            mmdet3d_version=mmdet3d_version,
+            config=cfg.pretty_text,
+            CLASSES=datasets[0].CLASSES,
+            PALETTE=datasets[0].PALETTE  # for segmentors
+            if hasattr(datasets[0], 'PALETTE') else None)
+    # add an attribute for visualization convenience
+    model.CLASSES = datasets[0].CLASSES
+    custom_train_model(
+        model,
+        datasets,
+        cfg,
+        distributed=distributed,
+        validate=(not args.no_validate),
+        timestamp=timestamp,
+        meta=meta)
+
+
+if __name__ == '__main__':
+    main()